summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/dir.c3
-rw-r--r--fs/afs/vl_probe.c4
-rw-r--r--fs/afs/write.c7
-rw-r--r--fs/aio.c26
-rw-r--r--fs/autofs/root.c6
-rw-r--r--fs/btrfs/block-group.c2
-rw-r--r--fs/btrfs/disk-io.c19
-rw-r--r--fs/btrfs/inode.c20
-rw-r--r--fs/btrfs/scrub.c50
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/tree-checker.c4
-rw-r--r--fs/btrfs/volumes.c31
-rw-r--r--fs/btrfs/volumes.h11
-rw-r--r--fs/buffer.c1
-rw-r--r--fs/cachefiles/namei.c11
-rw-r--r--fs/ceph/caps.c6
-rw-r--r--fs/ceph/snap.c4
-rw-r--r--fs/char_dev.c2
-rw-r--r--fs/coredump.c2
-rw-r--r--fs/crypto/fscrypt_private.h2
-rw-r--r--fs/crypto/hooks.c10
-rw-r--r--fs/d_path.c1
-rw-r--r--fs/erofs/compress.h3
-rw-r--r--fs/erofs/decompressor.c8
-rw-r--r--fs/erofs/internal.h41
-rw-r--r--fs/erofs/super.c69
-rw-r--r--fs/erofs/utils.c86
-rw-r--r--fs/erofs/xattr.c670
-rw-r--r--fs/erofs/zdata.c269
-rw-r--r--fs/erofs/zmap.c75
-rw-r--r--fs/eventfd.c12
-rw-r--r--fs/eventpoll.c8
-rw-r--r--fs/ext4/balloc.c25
-rw-r--r--fs/ext4/namei.c17
-rw-r--r--fs/ext4/super.c6
-rw-r--r--fs/ext4/xattr.c6
-rw-r--r--fs/f2fs/namei.c16
-rw-r--r--fs/file_table.c91
-rw-r--r--fs/fs_context.c3
-rw-r--r--fs/gfs2/file.c17
-rw-r--r--fs/inode.c62
-rw-r--r--fs/internal.h48
-rw-r--r--fs/jffs2/build.c5
-rw-r--r--fs/jffs2/xattr.c13
-rw-r--r--fs/jffs2/xattr.h4
-rw-r--r--fs/jfs/namei.c6
-rw-r--r--fs/lockd/svc.c1
-rw-r--r--fs/namei.c50
-rw-r--r--fs/namespace.c476
-rw-r--r--fs/nfsd/cache.h2
-rw-r--r--fs/nfsd/export.c12
-rw-r--r--fs/nfsd/nfs3proc.c14
-rw-r--r--fs/nfsd/nfs3xdr.c11
-rw-r--r--fs/nfsd/nfs4xdr.c289
-rw-r--r--fs/nfsd/nfscache.c25
-rw-r--r--fs/nfsd/nfsctl.c116
-rw-r--r--fs/nfsd/nfsfh.c26
-rw-r--r--fs/nfsd/nfsproc.c14
-rw-r--r--fs/nfsd/nfssvc.c5
-rw-r--r--fs/nfsd/nfsxdr.c11
-rw-r--r--fs/nfsd/trace.h259
-rw-r--r--fs/nfsd/vfs.c80
-rw-r--r--fs/nfsd/vfs.h9
-rw-r--r--fs/nilfs2/btnode.c12
-rw-r--r--fs/nilfs2/page.c10
-rw-r--r--fs/nilfs2/segbuf.c6
-rw-r--r--fs/nilfs2/segment.c7
-rw-r--r--fs/nilfs2/sufile.c9
-rw-r--r--fs/nilfs2/super.c23
-rw-r--r--fs/nilfs2/the_nilfs.c43
-rw-r--r--fs/ntfs/attrib.c2
-rw-r--r--fs/ntfs/compress.c2
-rw-r--r--fs/ntfs/mft.c36
-rw-r--r--fs/ntfs/super.c4
-rw-r--r--fs/ocfs2/file.c8
-rw-r--r--fs/ocfs2/super.c6
-rw-r--r--fs/open.c90
-rw-r--r--fs/overlayfs/file.c8
-rw-r--r--fs/overlayfs/overlayfs.h5
-rw-r--r--fs/pnode.c42
-rw-r--r--fs/pnode.h3
-rw-r--r--fs/readdir.c8
-rw-r--r--fs/remap_range.c5
-rw-r--r--fs/smb/client/cifs_debug.c58
-rw-r--r--fs/smb/client/cifsglob.h37
-rw-r--r--fs/smb/client/cifsproto.h1
-rw-r--r--fs/smb/client/connect.c59
-rw-r--r--fs/smb/client/dfs.c9
-rw-r--r--fs/smb/client/file.c8
-rw-r--r--fs/smb/client/smb2ops.c40
-rw-r--r--fs/smb/client/smb2pdu.c32
-rw-r--r--fs/smb/client/transport.c2
-rw-r--r--fs/smb/server/connection.c17
-rw-r--r--fs/smb/server/oplock.c66
-rw-r--r--fs/smb/server/server.c33
-rw-r--r--fs/smb/server/smb2misc.c33
-rw-r--r--fs/smb/server/smb2pdu.c83
-rw-r--r--fs/smb/server/smb_common.c14
-rw-r--r--fs/smb/server/smbacl.c14
-rw-r--r--fs/smb/server/vfs.c121
-rw-r--r--fs/smb/server/vfs.h17
-rw-r--r--fs/smb/server/vfs_cache.c2
-rw-r--r--fs/super.c24
-rw-r--r--fs/sysv/dir.c22
-rw-r--r--fs/sysv/itree.c4
-rw-r--r--fs/sysv/namei.c8
-rw-r--r--fs/udf/namei.c14
-rw-r--r--fs/userfaultfd.c13
-rw-r--r--fs/verity/Kconfig16
-rw-r--r--fs/verity/enable.c21
-rw-r--r--fs/verity/fsverity_private.h23
-rw-r--r--fs/verity/hash_algs.c139
-rw-r--r--fs/verity/measure.c37
-rw-r--r--fs/verity/open.c12
-rw-r--r--fs/verity/read_metadata.c4
-rw-r--r--fs/verity/signature.c8
-rw-r--r--fs/verity/verify.c164
-rw-r--r--fs/xfs/libxfs/xfs_ag.c5
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c91
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c10
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c7
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c24
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h9
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c13
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c113
-rw-r--r--fs/xfs/scrub/bmap.c25
-rw-r--r--fs/xfs/scrub/scrub.h8
-rw-r--r--fs/xfs/xfs_buf_item.c88
-rw-r--r--fs/xfs/xfs_filestream.c1
-rw-r--r--fs/xfs/xfs_icache.c46
-rw-r--r--fs/xfs/xfs_icache.h4
-rw-r--r--fs/xfs/xfs_inode.c20
-rw-r--r--fs/xfs/xfs_inode.h2
-rw-r--r--fs/xfs/xfs_inode_item.c149
-rw-r--r--fs/xfs/xfs_inode_item.h1
-rw-r--r--fs/xfs/xfs_log_recover.c19
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_reflink.c4
-rw-r--r--fs/xfs/xfs_super.c1
-rw-r--r--fs/xfs/xfs_trans.c9
141 files changed, 3166 insertions, 2162 deletions
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 4dd97afa536c..5219182e52e1 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1358,6 +1358,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
op->dentry = dentry;
op->create.mode = S_IFDIR | mode;
op->create.reason = afs_edit_dir_for_mkdir;
+ op->mtime = current_time(dir);
op->ops = &afs_mkdir_operation;
return afs_do_sync_operation(op);
}
@@ -1661,6 +1662,7 @@ static int afs_create(struct mnt_idmap *idmap, struct inode *dir,
op->dentry = dentry;
op->create.mode = S_IFREG | mode;
op->create.reason = afs_edit_dir_for_create;
+ op->mtime = current_time(dir);
op->ops = &afs_create_operation;
return afs_do_sync_operation(op);
@@ -1796,6 +1798,7 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir,
op->ops = &afs_symlink_operation;
op->create.reason = afs_edit_dir_for_symlink;
op->create.symlink = content;
+ op->mtime = current_time(dir);
return afs_do_sync_operation(op);
error:
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
index d1c7068b4346..58452b86e672 100644
--- a/fs/afs/vl_probe.c
+++ b/fs/afs/vl_probe.c
@@ -115,8 +115,8 @@ responded:
}
}
- if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
- rtt_us < server->probe.rtt) {
+ rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
+ if (rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
server->rtt = rtt_us;
alist->preferred = index;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c822d6006033..8750b99c3f56 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -731,6 +731,7 @@ static int afs_writepages_region(struct address_space *mapping,
* (changing page->mapping to NULL), or even swizzled
* back from swapper_space to tmpfs file mapping
*/
+try_again:
if (wbc->sync_mode != WB_SYNC_NONE) {
ret = folio_lock_killable(folio);
if (ret < 0) {
@@ -757,12 +758,14 @@ static int afs_writepages_region(struct address_space *mapping,
#ifdef CONFIG_AFS_FSCACHE
folio_wait_fscache(folio);
#endif
- } else {
- start += folio_size(folio);
+ goto try_again;
}
+
+ start += folio_size(folio);
if (wbc->sync_mode == WB_SYNC_NONE) {
if (skips >= 5 || need_resched()) {
*_next = start;
+ folio_batch_release(&fbatch);
_leave(" = 0 [%llx]", *_next);
return 0;
}
diff --git a/fs/aio.c b/fs/aio.c
index b0b17bd098bb..77e33619de40 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -530,7 +530,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
for (i = 0; i < nr_pages; i++) {
struct page *page;
page = find_or_create_page(file->f_mapping,
- i, GFP_HIGHUSER | __GFP_ZERO);
+ i, GFP_USER | __GFP_ZERO);
if (!page)
break;
pr_debug("pid(%d) page[%d]->count=%d\n",
@@ -571,7 +571,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
ctx->user_id = ctx->mmap_base;
ctx->nr_events = nr_events; /* trusted copy */
- ring = kmap_atomic(ctx->ring_pages[0]);
+ ring = page_address(ctx->ring_pages[0]);
ring->nr = nr_events; /* user copy */
ring->id = ~0U;
ring->head = ring->tail = 0;
@@ -579,7 +579,6 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
ring->compat_features = AIO_RING_COMPAT_FEATURES;
ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
ring->header_length = sizeof(struct aio_ring);
- kunmap_atomic(ring);
flush_dcache_page(ctx->ring_pages[0]);
return 0;
@@ -682,9 +681,8 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
* we are protected from page migration
* changes ring_pages by ->ring_lock.
*/
- ring = kmap_atomic(ctx->ring_pages[0]);
+ ring = page_address(ctx->ring_pages[0]);
ring->id = ctx->id;
- kunmap_atomic(ring);
return 0;
}
@@ -1025,9 +1023,8 @@ static void user_refill_reqs_available(struct kioctx *ctx)
* against ctx->completed_events below will make sure we do the
* safe/right thing.
*/
- ring = kmap_atomic(ctx->ring_pages[0]);
+ ring = page_address(ctx->ring_pages[0]);
head = ring->head;
- kunmap_atomic(ring);
refill_reqs_available(ctx, head, ctx->tail);
}
@@ -1133,12 +1130,11 @@ static void aio_complete(struct aio_kiocb *iocb)
if (++tail >= ctx->nr_events)
tail = 0;
- ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+ ev_page = page_address(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
event = ev_page + pos % AIO_EVENTS_PER_PAGE;
*event = iocb->ki_res;
- kunmap_atomic(ev_page);
flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
@@ -1152,10 +1148,9 @@ static void aio_complete(struct aio_kiocb *iocb)
ctx->tail = tail;
- ring = kmap_atomic(ctx->ring_pages[0]);
+ ring = page_address(ctx->ring_pages[0]);
head = ring->head;
ring->tail = tail;
- kunmap_atomic(ring);
flush_dcache_page(ctx->ring_pages[0]);
ctx->completed_events++;
@@ -1215,10 +1210,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
mutex_lock(&ctx->ring_lock);
/* Access to ->ring_pages here is protected by ctx->ring_lock. */
- ring = kmap_atomic(ctx->ring_pages[0]);
+ ring = page_address(ctx->ring_pages[0]);
head = ring->head;
tail = ring->tail;
- kunmap_atomic(ring);
/*
* Ensure that once we've read the current tail pointer, that
@@ -1250,10 +1244,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
avail = min(avail, nr - ret);
avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
- ev = kmap(page);
+ ev = page_address(page);
copy_ret = copy_to_user(event + ret, ev + pos,
sizeof(*ev) * avail);
- kunmap(page);
if (unlikely(copy_ret)) {
ret = -EFAULT;
@@ -1265,9 +1258,8 @@ static long aio_read_events_ring(struct kioctx *ctx,
head %= ctx->nr_events;
}
- ring = kmap_atomic(ctx->ring_pages[0]);
+ ring = page_address(ctx->ring_pages[0]);
ring->head = head;
- kunmap_atomic(ring);
flush_dcache_page(ctx->ring_pages[0]);
pr_debug("%li h%u t%u\n", ret, head, tail);
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 6baf90b08e0e..93046c9dc461 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap,
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
- dir->i_mtime = current_time(dir);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
return 0;
}
@@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
d_inode(dentry)->i_size = 0;
clear_nlink(d_inode(dentry));
- dir->i_mtime = current_time(dir);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
spin_lock(&sbi->lookup_lock);
__autofs_add_expiring(dentry);
@@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
inc_nlink(dir);
- dir->i_mtime = current_time(dir);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
return 0;
}
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 590b03560265..e97af2e510c3 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1973,7 +1973,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
/* For RAID5/6 adjust to a full IO stripe length */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- io_stripe_size = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
+ io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
if (!buf) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2b1b227505f3..dabc79c1af1b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -242,7 +242,6 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- u64 start = eb->start;
int i, num_pages = num_extent_pages(eb);
int ret = 0;
@@ -251,12 +250,14 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
+ u64 start = max_t(u64, eb->start, page_offset(p));
+ u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE);
+ u32 len = end - start;
- ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE,
- start, p, start - page_offset(p), mirror_num);
+ ret = btrfs_repair_io_failure(fs_info, 0, start, len,
+ start, p, offset_in_page(start), mirror_num);
if (ret)
break;
- start += PAGE_SIZE;
}
return ret;
@@ -995,13 +996,18 @@ int btrfs_global_root_insert(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *tmp;
+ int ret = 0;
write_lock(&fs_info->global_root_lock);
tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
write_unlock(&fs_info->global_root_lock);
- ASSERT(!tmp);
- return tmp ? -EEXIST : 0;
+ if (tmp) {
+ ret = -EEXIST;
+ btrfs_warn(fs_info, "global root %llu %llu already exists",
+ root->root_key.objectid, root->root_key.offset);
+ }
+ return ret;
}
void btrfs_global_root_delete(struct btrfs_root *root)
@@ -2841,6 +2847,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
/* We can't trust the free space cache either */
btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+ btrfs_warn(fs_info, "try to load backup roots slot %d", i);
ret = read_backup_root(fs_info, i);
backup_index = ret;
if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 19c707bc8801..7fcafcc5292c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1864,7 +1864,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
key->offset - args->extent_offset,
- args->disk_bytenr, false, path);
+ args->disk_bytenr, args->strict, path);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -7264,7 +7264,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
static int btrfs_get_blocks_direct_write(struct extent_map **map,
struct inode *inode,
struct btrfs_dio_data *dio_data,
- u64 start, u64 len,
+ u64 start, u64 *lenp,
unsigned int iomap_flags)
{
const bool nowait = (iomap_flags & IOMAP_NOWAIT);
@@ -7275,6 +7275,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
struct btrfs_block_group *bg;
bool can_nocow = false;
bool space_reserved = false;
+ u64 len = *lenp;
u64 prev_len;
int ret = 0;
@@ -7345,15 +7346,19 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
free_extent_map(em);
*map = NULL;
- if (nowait)
- return -EAGAIN;
+ if (nowait) {
+ ret = -EAGAIN;
+ goto out;
+ }
/*
* If we could not allocate data space before locking the file
* range and we can't do a NOCOW write, then we have to fail.
*/
- if (!dio_data->data_space_reserved)
- return -ENOSPC;
+ if (!dio_data->data_space_reserved) {
+ ret = -ENOSPC;
+ goto out;
+ }
/*
* We have to COW and we have already reserved data space before,
@@ -7394,6 +7399,7 @@ out:
btrfs_delalloc_release_extents(BTRFS_I(inode), len);
btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
}
+ *lenp = len;
return ret;
}
@@ -7570,7 +7576,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (write) {
ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
- start, len, flags);
+ start, &len, flags);
if (ret < 0)
goto unlock_err;
unlock_extents = true;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 7c666517d3d3..16c228344cbb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -134,8 +134,14 @@ struct scrub_stripe {
* The errors hit during the initial read of the stripe.
*
* Would be utilized for error reporting and repair.
+ *
+ * The remaining init_nr_* records the number of errors hit, only used
+ * by error reporting.
*/
unsigned long init_error_bitmap;
+ unsigned int init_nr_io_errors;
+ unsigned int init_nr_csum_errors;
+ unsigned int init_nr_meta_errors;
/*
* The following error bitmaps are all for the current status.
@@ -1003,12 +1009,9 @@ skip:
sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
sctx->stat.no_csum += nr_nodatacsum_sectors;
- sctx->stat.read_errors +=
- bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors);
- sctx->stat.csum_errors +=
- bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors);
- sctx->stat.verify_errors +=
- bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors);
+ sctx->stat.read_errors += stripe->init_nr_io_errors;
+ sctx->stat.csum_errors += stripe->init_nr_csum_errors;
+ sctx->stat.verify_errors += stripe->init_nr_meta_errors;
sctx->stat.uncorrectable_errors +=
bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
sctx->stat.corrected_errors += nr_repaired_sectors;
@@ -1041,6 +1044,12 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
/* Save the initial failed bitmap for later repair and report usage. */
stripe->init_error_bitmap = stripe->error_bitmap;
+ stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap,
+ stripe->nr_sectors);
+ stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap,
+ stripe->nr_sectors);
+ stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap,
+ stripe->nr_sectors);
if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
goto out;
@@ -1295,7 +1304,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
u32 stripe_index;
u32 rot;
- *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT);
+ *offset = last_offset + btrfs_stripe_nr_to_offset(i);
stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;
@@ -1310,7 +1319,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
if (stripe_index < num)
j++;
}
- *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT);
+ *offset = last_offset + btrfs_stripe_nr_to_offset(j);
return 1;
}
@@ -1490,6 +1499,9 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
stripe->extent_sector_bitmap = 0;
stripe->init_error_bitmap = 0;
+ stripe->init_nr_io_errors = 0;
+ stripe->init_nr_csum_errors = 0;
+ stripe->init_nr_meta_errors = 0;
stripe->error_bitmap = 0;
stripe->io_error_bitmap = 0;
stripe->csum_error_bitmap = 0;
@@ -1703,7 +1715,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));
scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
- nr_stripes << BTRFS_STRIPE_LEN_SHIFT);
+ btrfs_stripe_nr_to_offset(nr_stripes));
for (int i = 0; i < nr_stripes; i++) {
stripe = &sctx->stripes[i];
scrub_submit_initial_read(sctx, stripe);
@@ -1730,7 +1742,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
break;
}
}
- } else {
+ } else if (!sctx->readonly) {
for (int i = 0; i < nr_stripes; i++) {
unsigned long repaired;
@@ -1826,7 +1838,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
bool all_empty = true;
const int data_stripes = nr_data_stripes(map);
unsigned long extent_bitmap = 0;
- u64 length = data_stripes << BTRFS_STRIPE_LEN_SHIFT;
+ u64 length = btrfs_stripe_nr_to_offset(data_stripes);
int ret;
ASSERT(sctx->raid56_data_stripes);
@@ -1841,13 +1853,13 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
data_stripes) >> BTRFS_STRIPE_LEN_SHIFT;
stripe_index = (i + rot) % map->num_stripes;
physical = map->stripes[stripe_index].physical +
- (rot << BTRFS_STRIPE_LEN_SHIFT);
+ btrfs_stripe_nr_to_offset(rot);
scrub_reset_stripe(stripe);
set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
ret = scrub_find_fill_first_stripe(bg,
map->stripes[stripe_index].dev, physical, 1,
- full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT),
+ full_stripe_start + btrfs_stripe_nr_to_offset(i),
BTRFS_STRIPE_LEN, stripe);
if (ret < 0)
goto out;
@@ -1857,7 +1869,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
*/
if (ret > 0) {
stripe->logical = full_stripe_start +
- (i << BTRFS_STRIPE_LEN_SHIFT);
+ btrfs_stripe_nr_to_offset(i);
stripe->dev = map->stripes[stripe_index].dev;
stripe->mirror_num = 1;
set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
@@ -2050,7 +2062,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));
- return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
+ return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes);
}
/* Get the logical bytenr for the stripe */
@@ -2066,7 +2078,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map,
* (stripe_index / sub_stripes) gives how many data stripes we need to
* skip.
*/
- return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) +
+ return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) +
bg->start;
}
@@ -2192,7 +2204,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
}
if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
- offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
+ offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes);
goto out;
}
@@ -2207,7 +2219,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
/* Initialize @offset in case we need to go to out: label */
get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
- increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
+ increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
/*
* Due to the rotation, for RAID56 it's better to iterate each stripe
@@ -2254,7 +2266,7 @@ next:
}
out:
ret2 = flush_scrub_stripes(sctx);
- if (!ret2)
+ if (!ret)
ret = ret2;
if (sctx->raid56_data_stripes) {
for (int i = 0; i < nr_data_stripes(map); i++)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ec18e2210602..efeb1a9d040a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1841,6 +1841,12 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
btrfs_clear_sb_rdonly(sb);
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+
+ /*
+ * If we've gone from readonly -> read/write, we need to get
+ * our sync/async discard lists in the right state.
+ */
+ btrfs_discard_resume(fs_info);
}
out:
/*
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index e2b54793bf0c..2138e9fc0564 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -857,10 +857,10 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
*
* Thus it should be a good way to catch obvious bitflips.
*/
- if (unlikely(length >= ((u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT))) {
+ if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) {
chunk_err(leaf, chunk, logical,
"chunk length too large: have %llu limit %llu",
- length, (u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT);
+ length, btrfs_stripe_nr_to_offset(U32_MAX));
return -EUCLEAN;
}
if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 841e799dece5..72a838c97534 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5125,7 +5125,7 @@ static void init_alloc_chunk_ctl_policy_regular(
/* We don't want a chunk larger than 10% of writable space */
ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
ctl->max_chunk_size);
- ctl->dev_extent_min = ctl->dev_stripes << BTRFS_STRIPE_LEN_SHIFT;
+ ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes);
}
static void init_alloc_chunk_ctl_policy_zoned(
@@ -5801,7 +5801,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
if (!WARN_ON(IS_ERR(em))) {
map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- len = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
+ len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
free_extent_map(em);
}
return len;
@@ -5975,12 +5975,12 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
/* stripe_offset is the offset of this block in its stripe */
- stripe_offset = offset - (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);
+ stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr);
stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
BTRFS_STRIPE_LEN_SHIFT;
stripe_cnt = stripe_nr_end - stripe_nr;
- stripe_end_offset = (stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) -
+ stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) -
(offset + length);
/*
* after this, stripe_nr is the number of stripes on this
@@ -6023,12 +6023,12 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
for (i = 0; i < *num_stripes; i++) {
stripes[i].physical =
map->stripes[stripe_index].physical +
- stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);
+ stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
- stripes[i].length = stripes_per_dev << BTRFS_STRIPE_LEN_SHIFT;
+ stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev);
if (i / sub_stripes < remaining_stripes)
stripes[i].length += BTRFS_STRIPE_LEN;
@@ -6183,8 +6183,8 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
ASSERT(*stripe_offset < U32_MAX);
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- unsigned long full_stripe_len = nr_data_stripes(map) <<
- BTRFS_STRIPE_LEN_SHIFT;
+ unsigned long full_stripe_len =
+ btrfs_stripe_nr_to_offset(nr_data_stripes(map));
/*
* For full stripe start, we use previously calculated
@@ -6196,9 +6196,11 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
* not ensured to be power of 2.
*/
*full_stripe_start =
- rounddown(*stripe_nr, nr_data_stripes(map)) <<
- BTRFS_STRIPE_LEN_SHIFT;
+ btrfs_stripe_nr_to_offset(
+ rounddown(*stripe_nr, nr_data_stripes(map)));
+ ASSERT(*full_stripe_start + full_stripe_len > offset);
+ ASSERT(*full_stripe_start <= offset);
/*
* For writes to RAID56, allow to write a full stripe set, but
* no straddling of stripe sets.
@@ -6221,7 +6223,7 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *
{
dst->dev = map->stripes[stripe_index].dev;
dst->physical = map->stripes[stripe_index].physical +
- stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);
+ stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
}
int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
@@ -6343,7 +6345,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
/* Return the length to the full stripe end */
*length = min(logical + *length,
raid56_full_stripe_start + em->start +
- (data_stripes << BTRFS_STRIPE_LEN_SHIFT)) - logical;
+ btrfs_stripe_nr_to_offset(data_stripes)) -
+ logical;
stripe_index = 0;
stripe_offset = 0;
} else {
@@ -6433,7 +6436,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* modulo, to reduce one modulo call.
*/
bioc->full_stripe_logical = em->start +
- ((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT);
+ btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
for (i = 0; i < num_stripes; i++)
set_io_stripe(&bioc->stripes[i], map,
(i + stripe_nr) % num_stripes,
@@ -8030,7 +8033,7 @@ static void map_raid56_repair_block(struct btrfs_io_context *bioc,
for (i = 0; i < data_stripes; i++) {
u64 stripe_start = bioc->full_stripe_logical +
- (i << BTRFS_STRIPE_LEN_SHIFT);
+ btrfs_stripe_nr_to_offset(i);
if (logical >= stripe_start &&
logical < stripe_start + BTRFS_STRIPE_LEN)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bf47a1a70813..64066d48dce1 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -574,6 +574,17 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
sizeof(struct btrfs_stripe) * (num_stripes - 1);
}
+/*
+ * Do the type safe converstion from stripe_nr to offset inside the chunk.
+ *
+ * @stripe_nr is u32, with left shift it can overflow u32 for chunks larger
+ * than 4G. This does the proper type cast to avoid overflow.
+ */
+static inline u64 btrfs_stripe_nr_to_offset(u32 stripe_nr)
+{
+ return (u64)stripe_nr << BTRFS_STRIPE_LEN_SHIFT;
+}
+
void btrfs_get_bioc(struct btrfs_io_context *bioc);
void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
diff --git a/fs/buffer.c b/fs/buffer.c
index a7fc561758b1..fe64356e89b8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -111,7 +111,6 @@ void buffer_check_dirty_writeback(struct folio *folio,
bh = bh->b_this_page;
} while (bh != head);
}
-EXPORT_SYMBOL(buffer_check_dirty_writeback);
/*
* Block until a buffer comes unlocked. This doesn't stop it
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 82219a8f6084..d9d22d0ec38a 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -451,9 +451,10 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
ret = cachefiles_inject_write_error();
if (ret == 0) {
- file = vfs_tmpfile_open(&nop_mnt_idmap, &parentpath, S_IFREG,
- O_RDWR | O_LARGEFILE | O_DIRECT,
- cache->cache_cred);
+ file = kernel_tmpfile_open(&nop_mnt_idmap, &parentpath,
+ S_IFREG | 0600,
+ O_RDWR | O_LARGEFILE | O_DIRECT,
+ cache->cache_cred);
ret = PTR_ERR_OR_ZERO(file);
}
if (ret) {
@@ -560,8 +561,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
*/
path.mnt = cache->mnt;
path.dentry = dentry;
- file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
- d_backing_inode(dentry), cache->cache_cred);
+ file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
+ d_backing_inode(dentry), cache->cache_cred);
if (IS_ERR(file)) {
trace_cachefiles_vfs_error(object, d_backing_inode(dentry),
PTR_ERR(file),
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 789be30d6ee2..2321e5ddb664 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1627,6 +1627,7 @@ void ceph_flush_snaps(struct ceph_inode_info *ci,
struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_session *session = NULL;
+ bool need_put = false;
int mds;
dout("ceph_flush_snaps %p\n", inode);
@@ -1671,8 +1672,13 @@ out:
ceph_put_mds_session(session);
/* we flushed them all; remove this inode from the queue */
spin_lock(&mdsc->snap_flush_lock);
+ if (!list_empty(&ci->i_snap_flush_item))
+ need_put = true;
list_del_init(&ci->i_snap_flush_item);
spin_unlock(&mdsc->snap_flush_lock);
+
+ if (need_put)
+ iput(inode);
}
/*
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 0b236ebd989f..2e73ba62bd7a 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -693,8 +693,10 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->size);
spin_lock(&mdsc->snap_flush_lock);
- if (list_empty(&ci->i_snap_flush_item))
+ if (list_empty(&ci->i_snap_flush_item)) {
+ ihold(inode);
list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+ }
spin_unlock(&mdsc->snap_flush_lock);
return 1; /* caller may want to ceph_flush_snaps */
}
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 13deb45f1ec6..950b6919fb87 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -150,7 +150,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
cd->major = major;
cd->baseminor = baseminor;
cd->minorct = minorct;
- strlcpy(cd->name, name, sizeof(cd->name));
+ strscpy(cd->name, name, sizeof(cd->name));
if (!prev) {
cd->next = curr;
diff --git a/fs/coredump.c b/fs/coredump.c
index 88740c51b942..9d235fa14ab9 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -648,7 +648,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
} else {
struct mnt_idmap *idmap;
struct inode *inode;
- int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
+ int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW |
O_LARGEFILE | O_EXCL;
if (cprm.limit < binfmt->min_coredump)
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 7ab5a7b7eef8..2d63da48635a 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -171,7 +171,7 @@ fscrypt_policy_flags(const union fscrypt_policy *policy)
*/
struct fscrypt_symlink_data {
__le16 len;
- char encrypted_path[1];
+ char encrypted_path[];
} __packed;
/**
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 9e786ae66a13..6238dbcadcad 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -255,10 +255,10 @@ int fscrypt_prepare_symlink(struct inode *dir, const char *target,
* for now since filesystems will assume it is there and subtract it.
*/
if (!__fscrypt_fname_encrypted_size(policy, len,
- max_len - sizeof(struct fscrypt_symlink_data),
+ max_len - sizeof(struct fscrypt_symlink_data) - 1,
&disk_link->len))
return -ENAMETOOLONG;
- disk_link->len += sizeof(struct fscrypt_symlink_data);
+ disk_link->len += sizeof(struct fscrypt_symlink_data) + 1;
disk_link->name = NULL;
return 0;
@@ -289,7 +289,7 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
if (!sd)
return -ENOMEM;
}
- ciphertext_len = disk_link->len - sizeof(*sd);
+ ciphertext_len = disk_link->len - sizeof(*sd) - 1;
sd->len = cpu_to_le16(ciphertext_len);
err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path,
@@ -367,7 +367,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
* the ciphertext length, even though this is redundant with i_size.
*/
- if (max_size < sizeof(*sd))
+ if (max_size < sizeof(*sd) + 1)
return ERR_PTR(-EUCLEAN);
sd = caddr;
cstr.name = (unsigned char *)sd->encrypted_path;
@@ -376,7 +376,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
if (cstr.len == 0)
return ERR_PTR(-EUCLEAN);
- if (cstr.len + sizeof(*sd) - 1 > max_size)
+ if (cstr.len + sizeof(*sd) > max_size)
return ERR_PTR(-EUCLEAN);
err = fscrypt_fname_alloc_buffer(cstr.len, &pstr);
diff --git a/fs/d_path.c b/fs/d_path.c
index 56a6ee4c6331..5f4da5c8d5db 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/prefetch.h>
#include "mount.h"
+#include "internal.h"
struct prepend_buffer {
char *buf;
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 26fa170090b8..b1b846504027 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -89,8 +89,7 @@ static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
unsigned int padbufsize);
-int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool);
+extern const struct z_erofs_decompressor erofs_decompressors[];
/* prototypes for specific algorithms */
int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 7021e2cf6146..2a29943fa5cc 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -363,7 +363,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
return 0;
}
-static struct z_erofs_decompressor decompressors[] = {
+const struct z_erofs_decompressor erofs_decompressors[] = {
[Z_EROFS_COMPRESSION_SHIFTED] = {
.decompress = z_erofs_transform_plain,
.name = "shifted"
@@ -383,9 +383,3 @@ static struct z_erofs_decompressor decompressors[] = {
},
#endif
};
-
-int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
-{
- return decompressors[rq->alg].decompress(rq, pagepool);
-}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 1e39c03357d1..36e32fa542f0 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -208,46 +208,12 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
-#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL)
-
/* basic unit of the workstation of a super_block */
struct erofs_workgroup {
- /* the workgroup index in the workstation */
pgoff_t index;
-
- /* overall workgroup reference count */
- atomic_t refcount;
+ struct lockref lockref;
};
-static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
- int val)
-{
- preempt_disable();
- if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) {
- preempt_enable();
- return false;
- }
- return true;
-}
-
-static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
- int orig_val)
-{
- /*
- * other observers should notice all modifications
- * in the freezing period.
- */
- smp_mb();
- atomic_set(&grp->refcount, orig_val);
- preempt_enable();
-}
-
-static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
-{
- return atomic_cond_read_relaxed(&grp->refcount,
- VAL != EROFS_LOCKED_MAGIC);
-}
-
enum erofs_kmap_type {
EROFS_NO_KMAP, /* don't map the buffer */
EROFS_KMAP, /* use kmap_local_page() to map the buffer */
@@ -486,7 +452,7 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
-int erofs_workgroup_put(struct erofs_workgroup *grp);
+void erofs_workgroup_put(struct erofs_workgroup *grp);
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
pgoff_t index);
struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
@@ -500,7 +466,6 @@ int __init z_erofs_init_zip_subsystem(void);
void z_erofs_exit_zip_subsystem(void);
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
struct erofs_workgroup *egrp);
-int erofs_try_to_free_cached_page(struct page *page);
int z_erofs_load_lz4_config(struct super_block *sb,
struct erofs_super_block *dsb,
struct z_erofs_lz4_cfgs *lz4, int len);
@@ -511,6 +476,7 @@ void erofs_put_pcpubuf(void *ptr);
int erofs_pcpubuf_growsize(unsigned int nrpages);
void __init erofs_pcpubuf_init(void);
void erofs_pcpubuf_exit(void);
+int erofs_init_managed_cache(struct super_block *sb);
#else
static inline void erofs_shrinker_register(struct super_block *sb) {}
static inline void erofs_shrinker_unregister(struct super_block *sb) {}
@@ -530,6 +496,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb,
}
static inline void erofs_pcpubuf_init(void) {}
static inline void erofs_pcpubuf_exit(void) {}
+static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
#endif /* !CONFIG_EROFS_FS_ZIP */
#ifdef CONFIG_EROFS_FS_ZIP_LZMA
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 811ab66d805e..ff18f6b14de5 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -599,68 +599,6 @@ static int erofs_fc_parse_param(struct fs_context *fc,
return 0;
}
-#ifdef CONFIG_EROFS_FS_ZIP
-static const struct address_space_operations managed_cache_aops;
-
-static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp)
-{
- bool ret = true;
- struct address_space *const mapping = folio->mapping;
-
- DBG_BUGON(!folio_test_locked(folio));
- DBG_BUGON(mapping->a_ops != &managed_cache_aops);
-
- if (folio_test_private(folio))
- ret = erofs_try_to_free_cached_page(&folio->page);
-
- return ret;
-}
-
-/*
- * It will be called only on inode eviction. In case that there are still some
- * decompression requests in progress, wait with rescheduling for a bit here.
- * We could introduce an extra locking instead but it seems unnecessary.
- */
-static void erofs_managed_cache_invalidate_folio(struct folio *folio,
- size_t offset, size_t length)
-{
- const size_t stop = length + offset;
-
- DBG_BUGON(!folio_test_locked(folio));
-
- /* Check for potential overflow in debug mode */
- DBG_BUGON(stop > folio_size(folio) || stop < length);
-
- if (offset == 0 && stop == folio_size(folio))
- while (!erofs_managed_cache_release_folio(folio, GFP_NOFS))
- cond_resched();
-}
-
-static const struct address_space_operations managed_cache_aops = {
- .release_folio = erofs_managed_cache_release_folio,
- .invalidate_folio = erofs_managed_cache_invalidate_folio,
-};
-
-static int erofs_init_managed_cache(struct super_block *sb)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- struct inode *const inode = new_inode(sb);
-
- if (!inode)
- return -ENOMEM;
-
- set_nlink(inode, 1);
- inode->i_size = OFFSET_MAX;
-
- inode->i_mapping->a_ops = &managed_cache_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
- sbi->managed_cache = inode;
- return 0;
-}
-#else
-static int erofs_init_managed_cache(struct super_block *sb) { return 0; }
-#endif
-
static struct inode *erofs_nfs_get_inode(struct super_block *sb,
u64 ino, u32 generation)
{
@@ -1016,10 +954,8 @@ static int __init erofs_module_init(void)
sizeof(struct erofs_inode), 0,
SLAB_RECLAIM_ACCOUNT,
erofs_inode_init_once);
- if (!erofs_inode_cachep) {
- err = -ENOMEM;
- goto icache_err;
- }
+ if (!erofs_inode_cachep)
+ return -ENOMEM;
err = erofs_init_shrinker();
if (err)
@@ -1054,7 +990,6 @@ lzma_err:
erofs_exit_shrinker();
shrinker_err:
kmem_cache_destroy(erofs_inode_cachep);
-icache_err:
return err;
}
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 46627cb69abe..cc6fb9e98899 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -4,7 +4,6 @@
* https://www.huawei.com/
*/
#include "internal.h"
-#include <linux/pagevec.h>
struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
{
@@ -33,22 +32,21 @@ void erofs_release_pages(struct page **pagepool)
/* global shrink count (for all mounted EROFS instances) */
static atomic_long_t erofs_global_shrink_cnt;
-static int erofs_workgroup_get(struct erofs_workgroup *grp)
+static bool erofs_workgroup_get(struct erofs_workgroup *grp)
{
- int o;
+ if (lockref_get_not_zero(&grp->lockref))
+ return true;
-repeat:
- o = erofs_wait_on_workgroup_freezed(grp);
- if (o <= 0)
- return -1;
-
- if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o)
- goto repeat;
+ spin_lock(&grp->lockref.lock);
+ if (__lockref_is_dead(&grp->lockref)) {
+ spin_unlock(&grp->lockref.lock);
+ return false;
+ }
- /* decrease refcount paired by erofs_workgroup_put */
- if (o == 1)
+ if (!grp->lockref.count++)
atomic_long_dec(&erofs_global_shrink_cnt);
- return 0;
+ spin_unlock(&grp->lockref.lock);
+ return true;
}
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
@@ -61,7 +59,7 @@ repeat:
rcu_read_lock();
grp = xa_load(&sbi->managed_pslots, index);
if (grp) {
- if (erofs_workgroup_get(grp)) {
+ if (!erofs_workgroup_get(grp)) {
/* prefer to relax rcu read side */
rcu_read_unlock();
goto repeat;
@@ -80,11 +78,10 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
struct erofs_workgroup *pre;
/*
- * Bump up a reference count before making this visible
- * to others for the XArray in order to avoid potential
- * UAF without serialized by xa_lock.
+ * Bump up before making this visible to others for the XArray in order
+ * to avoid potential UAF without serialized by xa_lock.
*/
- atomic_inc(&grp->refcount);
+ lockref_get(&grp->lockref);
repeat:
xa_lock(&sbi->managed_pslots);
@@ -93,13 +90,13 @@ repeat:
if (pre) {
if (xa_is_err(pre)) {
pre = ERR_PTR(xa_err(pre));
- } else if (erofs_workgroup_get(pre)) {
+ } else if (!erofs_workgroup_get(pre)) {
/* try to legitimize the current in-tree one */
xa_unlock(&sbi->managed_pslots);
cond_resched();
goto repeat;
}
- atomic_dec(&grp->refcount);
+ lockref_put_return(&grp->lockref);
grp = pre;
}
xa_unlock(&sbi->managed_pslots);
@@ -112,38 +109,34 @@ static void __erofs_workgroup_free(struct erofs_workgroup *grp)
erofs_workgroup_free_rcu(grp);
}
-int erofs_workgroup_put(struct erofs_workgroup *grp)
+void erofs_workgroup_put(struct erofs_workgroup *grp)
{
- int count = atomic_dec_return(&grp->refcount);
+ if (lockref_put_or_lock(&grp->lockref))
+ return;
- if (count == 1)
+ DBG_BUGON(__lockref_is_dead(&grp->lockref));
+ if (grp->lockref.count == 1)
atomic_long_inc(&erofs_global_shrink_cnt);
- else if (!count)
- __erofs_workgroup_free(grp);
- return count;
+ --grp->lockref.count;
+ spin_unlock(&grp->lockref.lock);
}
static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
struct erofs_workgroup *grp)
{
- /*
- * If managed cache is on, refcount of workgroups
- * themselves could be < 0 (freezed). In other words,
- * there is no guarantee that all refcounts > 0.
- */
- if (!erofs_workgroup_try_to_freeze(grp, 1))
- return false;
+ int free = false;
+
+ spin_lock(&grp->lockref.lock);
+ if (grp->lockref.count)
+ goto out;
/*
- * Note that all cached pages should be unattached
- * before deleted from the XArray. Otherwise some
- * cached pages could be still attached to the orphan
- * old workgroup when the new one is available in the tree.
+ * Note that all cached pages should be detached before deleted from
+ * the XArray. Otherwise some cached pages could be still attached to
+ * the orphan old workgroup when the new one is available in the tree.
*/
- if (erofs_try_to_free_all_cached_pages(sbi, grp)) {
- erofs_workgroup_unfreeze(grp, 1);
- return false;
- }
+ if (erofs_try_to_free_all_cached_pages(sbi, grp))
+ goto out;
/*
* It's impossible to fail after the workgroup is freezed,
@@ -152,10 +145,13 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
*/
DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
- /* last refcount should be connected with its managed pslot. */
- erofs_workgroup_unfreeze(grp, 0);
- __erofs_workgroup_free(grp);
- return true;
+ lockref_mark_dead(&grp->lockref);
+ free = true;
+out:
+ spin_unlock(&grp->lockref.lock);
+ if (free)
+ __erofs_workgroup_free(grp);
+ return free;
}
static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index bbfe7ce170d2..40178b6e0688 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -7,32 +7,27 @@
#include <linux/security.h>
#include "xattr.h"
-static inline erofs_blk_t erofs_xattr_blkaddr(struct super_block *sb,
- unsigned int xattr_id)
-{
- return EROFS_SB(sb)->xattr_blkaddr +
- erofs_blknr(sb, xattr_id * sizeof(__u32));
-}
-
-static inline unsigned int erofs_xattr_blkoff(struct super_block *sb,
- unsigned int xattr_id)
-{
- return erofs_blkoff(sb, xattr_id * sizeof(__u32));
-}
-
-struct xattr_iter {
+struct erofs_xattr_iter {
struct super_block *sb;
struct erofs_buf buf;
+ erofs_off_t pos;
void *kaddr;
- erofs_blk_t blkaddr;
- unsigned int ofs;
+ char *buffer;
+ int buffer_size, buffer_ofs;
+
+ /* getxattr */
+ int index, infix_len;
+ struct qstr name;
+
+ /* listxattr */
+ struct dentry *dentry;
};
static int erofs_init_inode_xattrs(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
- struct xattr_iter it;
+ struct erofs_xattr_iter it;
unsigned int i;
struct erofs_xattr_ibody_header *ih;
struct super_block *sb = inode->i_sb;
@@ -81,17 +76,17 @@ static int erofs_init_inode_xattrs(struct inode *inode)
}
it.buf = __EROFS_BUF_INITIALIZER;
- it.blkaddr = erofs_blknr(sb, erofs_iloc(inode) + vi->inode_isize);
- it.ofs = erofs_blkoff(sb, erofs_iloc(inode) + vi->inode_isize);
+ erofs_init_metabuf(&it.buf, sb);
+ it.pos = erofs_iloc(inode) + vi->inode_isize;
/* read in shared xattr array (non-atomic, see kmalloc below) */
- it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), EROFS_KMAP);
if (IS_ERR(it.kaddr)) {
ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
- ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs);
+ ih = it.kaddr + erofs_blkoff(sb, it.pos);
vi->xattr_shared_count = ih->h_shared_count;
vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
sizeof(uint), GFP_KERNEL);
@@ -102,26 +97,20 @@ static int erofs_init_inode_xattrs(struct inode *inode)
}
/* let's skip ibody header */
- it.ofs += sizeof(struct erofs_xattr_ibody_header);
+ it.pos += sizeof(struct erofs_xattr_ibody_header);
for (i = 0; i < vi->xattr_shared_count; ++i) {
- if (it.ofs >= sb->s_blocksize) {
- /* cannot be unaligned */
- DBG_BUGON(it.ofs != sb->s_blocksize);
-
- it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr,
- EROFS_KMAP);
- if (IS_ERR(it.kaddr)) {
- kfree(vi->xattr_shared_xattrs);
- vi->xattr_shared_xattrs = NULL;
- ret = PTR_ERR(it.kaddr);
- goto out_unlock;
- }
- it.ofs = 0;
+ it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos),
+ EROFS_KMAP);
+ if (IS_ERR(it.kaddr)) {
+ kfree(vi->xattr_shared_xattrs);
+ vi->xattr_shared_xattrs = NULL;
+ ret = PTR_ERR(it.kaddr);
+ goto out_unlock;
}
- vi->xattr_shared_xattrs[i] =
- le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs));
- it.ofs += sizeof(__le32);
+ vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)
+ (it.kaddr + erofs_blkoff(sb, it.pos)));
+ it.pos += sizeof(__le32);
}
erofs_put_metabuf(&it.buf);
@@ -134,287 +123,6 @@ out_unlock:
return ret;
}
-/*
- * the general idea for these return values is
- * if 0 is returned, go on processing the current xattr;
- * 1 (> 0) is returned, skip this round to process the next xattr;
- * -err (< 0) is returned, an error (maybe ENOXATTR) occurred
- * and need to be handled
- */
-struct xattr_iter_handlers {
- int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry);
- int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf,
- unsigned int len);
- int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz);
- void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf,
- unsigned int len);
-};
-
-static inline int xattr_iter_fixup(struct xattr_iter *it)
-{
- if (it->ofs < it->sb->s_blocksize)
- return 0;
-
- it->blkaddr += erofs_blknr(it->sb, it->ofs);
- it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr,
- EROFS_KMAP);
- if (IS_ERR(it->kaddr))
- return PTR_ERR(it->kaddr);
- it->ofs = erofs_blkoff(it->sb, it->ofs);
- return 0;
-}
-
-static int inline_xattr_iter_begin(struct xattr_iter *it,
- struct inode *inode)
-{
- struct erofs_inode *const vi = EROFS_I(inode);
- unsigned int xattr_header_sz, inline_xattr_ofs;
-
- xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) +
- sizeof(u32) * vi->xattr_shared_count;
- if (xattr_header_sz >= vi->xattr_isize) {
- DBG_BUGON(xattr_header_sz > vi->xattr_isize);
- return -ENOATTR;
- }
-
- inline_xattr_ofs = vi->inode_isize + xattr_header_sz;
-
- it->blkaddr = erofs_blknr(it->sb, erofs_iloc(inode) + inline_xattr_ofs);
- it->ofs = erofs_blkoff(it->sb, erofs_iloc(inode) + inline_xattr_ofs);
- it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr,
- EROFS_KMAP);
- if (IS_ERR(it->kaddr))
- return PTR_ERR(it->kaddr);
- return vi->xattr_isize - xattr_header_sz;
-}
-
-/*
- * Regardless of success or failure, `xattr_foreach' will end up with
- * `ofs' pointing to the next xattr item rather than an arbitrary position.
- */
-static int xattr_foreach(struct xattr_iter *it,
- const struct xattr_iter_handlers *op,
- unsigned int *tlimit)
-{
- struct erofs_xattr_entry entry;
- unsigned int value_sz, processed, slice;
- int err;
-
- /* 0. fixup blkaddr, ofs, ipage */
- err = xattr_iter_fixup(it);
- if (err)
- return err;
-
- /*
- * 1. read xattr entry to the memory,
- * since we do EROFS_XATTR_ALIGN
- * therefore entry should be in the page
- */
- entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs);
- if (tlimit) {
- unsigned int entry_sz = erofs_xattr_entry_size(&entry);
-
- /* xattr on-disk corruption: xattr entry beyond xattr_isize */
- if (*tlimit < entry_sz) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- *tlimit -= entry_sz;
- }
-
- it->ofs += sizeof(struct erofs_xattr_entry);
- value_sz = le16_to_cpu(entry.e_value_size);
-
- /* handle entry */
- err = op->entry(it, &entry);
- if (err) {
- it->ofs += entry.e_name_len + value_sz;
- goto out;
- }
-
- /* 2. handle xattr name (ofs will finally be at the end of name) */
- processed = 0;
-
- while (processed < entry.e_name_len) {
- if (it->ofs >= it->sb->s_blocksize) {
- DBG_BUGON(it->ofs > it->sb->s_blocksize);
-
- err = xattr_iter_fixup(it);
- if (err)
- goto out;
- it->ofs = 0;
- }
-
- slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs,
- entry.e_name_len - processed);
-
- /* handle name */
- err = op->name(it, processed, it->kaddr + it->ofs, slice);
- if (err) {
- it->ofs += entry.e_name_len - processed + value_sz;
- goto out;
- }
-
- it->ofs += slice;
- processed += slice;
- }
-
- /* 3. handle xattr value */
- processed = 0;
-
- if (op->alloc_buffer) {
- err = op->alloc_buffer(it, value_sz);
- if (err) {
- it->ofs += value_sz;
- goto out;
- }
- }
-
- while (processed < value_sz) {
- if (it->ofs >= it->sb->s_blocksize) {
- DBG_BUGON(it->ofs > it->sb->s_blocksize);
-
- err = xattr_iter_fixup(it);
- if (err)
- goto out;
- it->ofs = 0;
- }
-
- slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs,
- value_sz - processed);
- op->value(it, processed, it->kaddr + it->ofs, slice);
- it->ofs += slice;
- processed += slice;
- }
-
-out:
- /* xattrs should be 4-byte aligned (on-disk constraint) */
- it->ofs = EROFS_XATTR_ALIGN(it->ofs);
- return err < 0 ? err : 0;
-}
-
-struct getxattr_iter {
- struct xattr_iter it;
-
- char *buffer;
- int buffer_size, index, infix_len;
- struct qstr name;
-};
-
-static int erofs_xattr_long_entrymatch(struct getxattr_iter *it,
- struct erofs_xattr_entry *entry)
-{
- struct erofs_sb_info *sbi = EROFS_SB(it->it.sb);
- struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes +
- (entry->e_name_index & EROFS_XATTR_LONG_PREFIX_MASK);
-
- if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count)
- return -ENOATTR;
-
- if (it->index != pf->prefix->base_index ||
- it->name.len != entry->e_name_len + pf->infix_len)
- return -ENOATTR;
-
- if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len))
- return -ENOATTR;
-
- it->infix_len = pf->infix_len;
- return 0;
-}
-
-static int xattr_entrymatch(struct xattr_iter *_it,
- struct erofs_xattr_entry *entry)
-{
- struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
-
- /* should also match the infix for long name prefixes */
- if (entry->e_name_index & EROFS_XATTR_LONG_PREFIX)
- return erofs_xattr_long_entrymatch(it, entry);
-
- if (it->index != entry->e_name_index ||
- it->name.len != entry->e_name_len)
- return -ENOATTR;
- it->infix_len = 0;
- return 0;
-}
-
-static int xattr_namematch(struct xattr_iter *_it,
- unsigned int processed, char *buf, unsigned int len)
-{
- struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
-
- if (memcmp(buf, it->name.name + it->infix_len + processed, len))
- return -ENOATTR;
- return 0;
-}
-
-static int xattr_checkbuffer(struct xattr_iter *_it,
- unsigned int value_sz)
-{
- struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
- int err = it->buffer_size < value_sz ? -ERANGE : 0;
-
- it->buffer_size = value_sz;
- return !it->buffer ? 1 : err;
-}
-
-static void xattr_copyvalue(struct xattr_iter *_it,
- unsigned int processed,
- char *buf, unsigned int len)
-{
- struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
-
- memcpy(it->buffer + processed, buf, len);
-}
-
-static const struct xattr_iter_handlers find_xattr_handlers = {
- .entry = xattr_entrymatch,
- .name = xattr_namematch,
- .alloc_buffer = xattr_checkbuffer,
- .value = xattr_copyvalue
-};
-
-static int inline_getxattr(struct inode *inode, struct getxattr_iter *it)
-{
- int ret;
- unsigned int remaining;
-
- ret = inline_xattr_iter_begin(&it->it, inode);
- if (ret < 0)
- return ret;
-
- remaining = ret;
- while (remaining) {
- ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining);
- if (ret != -ENOATTR)
- break;
- }
- return ret ? ret : it->buffer_size;
-}
-
-static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
-{
- struct erofs_inode *const vi = EROFS_I(inode);
- struct super_block *const sb = it->it.sb;
- unsigned int i, xsid;
- int ret = -ENOATTR;
-
- for (i = 0; i < vi->xattr_shared_count; ++i) {
- xsid = vi->xattr_shared_xattrs[i];
- it->it.blkaddr = erofs_xattr_blkaddr(sb, xsid);
- it->it.ofs = erofs_xattr_blkoff(sb, xsid);
- it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb,
- it->it.blkaddr, EROFS_KMAP);
- if (IS_ERR(it->it.kaddr))
- return PTR_ERR(it->it.kaddr);
-
- ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL);
- if (ret != -ENOATTR)
- break;
- }
- return ret ? ret : it->buffer_size;
-}
-
static bool erofs_xattr_user_list(struct dentry *dentry)
{
return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER);
@@ -425,39 +133,6 @@ static bool erofs_xattr_trusted_list(struct dentry *dentry)
return capable(CAP_SYS_ADMIN);
}
-int erofs_getxattr(struct inode *inode, int index,
- const char *name,
- void *buffer, size_t buffer_size)
-{
- int ret;
- struct getxattr_iter it;
-
- if (!name)
- return -EINVAL;
-
- ret = erofs_init_inode_xattrs(inode);
- if (ret)
- return ret;
-
- it.index = index;
- it.name.len = strlen(name);
- if (it.name.len > EROFS_NAME_LEN)
- return -ERANGE;
-
- it.it.buf = __EROFS_BUF_INITIALIZER;
- it.name.name = name;
-
- it.buffer = buffer;
- it.buffer_size = buffer_size;
-
- it.it.sb = inode->i_sb;
- ret = inline_getxattr(inode, &it);
- if (ret == -ENOATTR)
- ret = shared_getxattr(inode, &it);
- erofs_put_metabuf(&it.it.buf);
- return ret;
-}
-
static int erofs_xattr_generic_get(const struct xattr_handler *handler,
struct dentry *unused, struct inode *inode,
const char *name, void *buffer, size_t size)
@@ -500,30 +175,49 @@ const struct xattr_handler *erofs_xattr_handlers[] = {
NULL,
};
-struct listxattr_iter {
- struct xattr_iter it;
-
- struct dentry *dentry;
- char *buffer;
- int buffer_size, buffer_ofs;
-};
+static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it,
+ unsigned int len)
+{
+ unsigned int slice, processed;
+ struct super_block *sb = it->sb;
+ void *src;
+
+ for (processed = 0; processed < len; processed += slice) {
+ it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
+ EROFS_KMAP);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
+
+ src = it->kaddr + erofs_blkoff(sb, it->pos);
+ slice = min_t(unsigned int, sb->s_blocksize -
+ erofs_blkoff(sb, it->pos), len - processed);
+ memcpy(it->buffer + it->buffer_ofs, src, slice);
+ it->buffer_ofs += slice;
+ it->pos += slice;
+ }
+ return 0;
+}
-static int xattr_entrylist(struct xattr_iter *_it,
- struct erofs_xattr_entry *entry)
+static int erofs_listxattr_foreach(struct erofs_xattr_iter *it)
{
- struct listxattr_iter *it =
- container_of(_it, struct listxattr_iter, it);
- unsigned int base_index = entry->e_name_index;
- unsigned int prefix_len, infix_len = 0;
+ struct erofs_xattr_entry entry;
+ unsigned int base_index, name_total, prefix_len, infix_len = 0;
const char *prefix, *infix = NULL;
+ int err;
+
+ /* 1. handle xattr entry */
+ entry = *(struct erofs_xattr_entry *)
+ (it->kaddr + erofs_blkoff(it->sb, it->pos));
+ it->pos += sizeof(struct erofs_xattr_entry);
- if (entry->e_name_index & EROFS_XATTR_LONG_PREFIX) {
- struct erofs_sb_info *sbi = EROFS_SB(_it->sb);
+ base_index = entry.e_name_index;
+ if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) {
+ struct erofs_sb_info *sbi = EROFS_SB(it->sb);
struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes +
- (entry->e_name_index & EROFS_XATTR_LONG_PREFIX_MASK);
+ (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK);
if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count)
- return 1;
+ return 0;
infix = pf->prefix->infix;
infix_len = pf->infix_len;
base_index = pf->prefix->base_index;
@@ -531,120 +225,228 @@ static int xattr_entrylist(struct xattr_iter *_it,
prefix = erofs_xattr_prefix(base_index, it->dentry);
if (!prefix)
- return 1;
+ return 0;
prefix_len = strlen(prefix);
+ name_total = prefix_len + infix_len + entry.e_name_len + 1;
if (!it->buffer) {
- it->buffer_ofs += prefix_len + infix_len +
- entry->e_name_len + 1;
- return 1;
+ it->buffer_ofs += name_total;
+ return 0;
}
- if (it->buffer_ofs + prefix_len + infix_len +
- + entry->e_name_len + 1 > it->buffer_size)
+ if (it->buffer_ofs + name_total > it->buffer_size)
return -ERANGE;
memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len);
memcpy(it->buffer + it->buffer_ofs + prefix_len, infix, infix_len);
it->buffer_ofs += prefix_len + infix_len;
- return 0;
-}
-static int xattr_namelist(struct xattr_iter *_it,
- unsigned int processed, char *buf, unsigned int len)
-{
- struct listxattr_iter *it =
- container_of(_it, struct listxattr_iter, it);
+ /* 2. handle xattr name */
+ err = erofs_xattr_copy_to_buffer(it, entry.e_name_len);
+ if (err)
+ return err;
- memcpy(it->buffer + it->buffer_ofs, buf, len);
- it->buffer_ofs += len;
+ it->buffer[it->buffer_ofs++] = '\0';
return 0;
}
-static int xattr_skipvalue(struct xattr_iter *_it,
- unsigned int value_sz)
+static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
{
- struct listxattr_iter *it =
- container_of(_it, struct listxattr_iter, it);
+ struct super_block *sb = it->sb;
+ struct erofs_xattr_entry entry;
+ unsigned int slice, processed, value_sz;
- it->buffer[it->buffer_ofs++] = '\0';
- return 1;
-}
+ /* 1. handle xattr entry */
+ entry = *(struct erofs_xattr_entry *)
+ (it->kaddr + erofs_blkoff(sb, it->pos));
+ it->pos += sizeof(struct erofs_xattr_entry);
+ value_sz = le16_to_cpu(entry.e_value_size);
-static const struct xattr_iter_handlers list_xattr_handlers = {
- .entry = xattr_entrylist,
- .name = xattr_namelist,
- .alloc_buffer = xattr_skipvalue,
- .value = NULL
-};
+ /* should also match the infix for long name prefixes */
+ if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) {
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes +
+ (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK);
+
+ if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count)
+ return -ENOATTR;
+
+ if (it->index != pf->prefix->base_index ||
+ it->name.len != entry.e_name_len + pf->infix_len)
+ return -ENOATTR;
+
+ if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len))
+ return -ENOATTR;
+
+ it->infix_len = pf->infix_len;
+ } else {
+ if (it->index != entry.e_name_index ||
+ it->name.len != entry.e_name_len)
+ return -ENOATTR;
-static int inline_listxattr(struct listxattr_iter *it)
+ it->infix_len = 0;
+ }
+
+ /* 2. handle xattr name */
+ for (processed = 0; processed < entry.e_name_len; processed += slice) {
+ it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
+ EROFS_KMAP);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
+
+ slice = min_t(unsigned int,
+ sb->s_blocksize - erofs_blkoff(sb, it->pos),
+ entry.e_name_len - processed);
+ if (memcmp(it->name.name + it->infix_len + processed,
+ it->kaddr + erofs_blkoff(sb, it->pos), slice))
+ return -ENOATTR;
+ it->pos += slice;
+ }
+
+ /* 3. handle xattr value */
+ if (!it->buffer) {
+ it->buffer_ofs = value_sz;
+ return 0;
+ }
+
+ if (it->buffer_size < value_sz)
+ return -ERANGE;
+
+ return erofs_xattr_copy_to_buffer(it, value_sz);
+}
+
+static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
+ struct inode *inode, bool getxattr)
{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ unsigned int xattr_header_sz, remaining, entry_sz;
+ erofs_off_t next_pos;
int ret;
- unsigned int remaining;
- ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry));
- if (ret < 0)
- return ret;
+ xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) +
+ sizeof(u32) * vi->xattr_shared_count;
+ if (xattr_header_sz >= vi->xattr_isize) {
+ DBG_BUGON(xattr_header_sz > vi->xattr_isize);
+ return -ENOATTR;
+ }
+
+ remaining = vi->xattr_isize - xattr_header_sz;
+ it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz;
- remaining = ret;
while (remaining) {
- ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining);
- if (ret)
+ it->kaddr = erofs_bread(&it->buf, erofs_blknr(it->sb, it->pos),
+ EROFS_KMAP);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
+
+ entry_sz = erofs_xattr_entry_size(it->kaddr +
+ erofs_blkoff(it->sb, it->pos));
+ /* xattr on-disk corruption: xattr entry beyond xattr_isize */
+ if (remaining < entry_sz) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ remaining -= entry_sz;
+ next_pos = it->pos + entry_sz;
+
+ if (getxattr)
+ ret = erofs_getxattr_foreach(it);
+ else
+ ret = erofs_listxattr_foreach(it);
+ if ((getxattr && ret != -ENOATTR) || (!getxattr && ret))
break;
+
+ it->pos = next_pos;
}
- return ret ? ret : it->buffer_ofs;
+ return ret;
}
-static int shared_listxattr(struct listxattr_iter *it)
+static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
+ struct inode *inode, bool getxattr)
{
- struct inode *const inode = d_inode(it->dentry);
struct erofs_inode *const vi = EROFS_I(inode);
- struct super_block *const sb = it->it.sb;
- unsigned int i, xsid;
- int ret = 0;
+ struct super_block *const sb = it->sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ unsigned int i;
+ int ret = -ENOATTR;
for (i = 0; i < vi->xattr_shared_count; ++i) {
- xsid = vi->xattr_shared_xattrs[i];
- it->it.blkaddr = erofs_xattr_blkaddr(sb, xsid);
- it->it.ofs = erofs_xattr_blkoff(sb, xsid);
- it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb,
- it->it.blkaddr, EROFS_KMAP);
- if (IS_ERR(it->it.kaddr))
- return PTR_ERR(it->it.kaddr);
-
- ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL);
- if (ret)
+ it->pos = erofs_pos(sb, sbi->xattr_blkaddr) +
+ vi->xattr_shared_xattrs[i] * sizeof(__le32);
+ it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
+ EROFS_KMAP);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
+
+ if (getxattr)
+ ret = erofs_getxattr_foreach(it);
+ else
+ ret = erofs_listxattr_foreach(it);
+ if ((getxattr && ret != -ENOATTR) || (!getxattr && ret))
break;
}
- return ret ? ret : it->buffer_ofs;
+ return ret;
+}
+
+int erofs_getxattr(struct inode *inode, int index, const char *name,
+ void *buffer, size_t buffer_size)
+{
+ int ret;
+ struct erofs_xattr_iter it;
+
+ if (!name)
+ return -EINVAL;
+
+ ret = erofs_init_inode_xattrs(inode);
+ if (ret)
+ return ret;
+
+ it.index = index;
+ it.name = (struct qstr)QSTR_INIT(name, strlen(name));
+ if (it.name.len > EROFS_NAME_LEN)
+ return -ERANGE;
+
+ it.sb = inode->i_sb;
+ it.buf = __EROFS_BUF_INITIALIZER;
+ erofs_init_metabuf(&it.buf, it.sb);
+ it.buffer = buffer;
+ it.buffer_size = buffer_size;
+ it.buffer_ofs = 0;
+
+ ret = erofs_xattr_iter_inline(&it, inode, true);
+ if (ret == -ENOATTR)
+ ret = erofs_xattr_iter_shared(&it, inode, true);
+ erofs_put_metabuf(&it.buf);
+ return ret ? ret : it.buffer_ofs;
}
-ssize_t erofs_listxattr(struct dentry *dentry,
- char *buffer, size_t buffer_size)
+ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
int ret;
- struct listxattr_iter it;
+ struct erofs_xattr_iter it;
+ struct inode *inode = d_inode(dentry);
- ret = erofs_init_inode_xattrs(d_inode(dentry));
+ ret = erofs_init_inode_xattrs(inode);
if (ret == -ENOATTR)
return 0;
if (ret)
return ret;
- it.it.buf = __EROFS_BUF_INITIALIZER;
+ it.sb = dentry->d_sb;
+ it.buf = __EROFS_BUF_INITIALIZER;
+ erofs_init_metabuf(&it.buf, it.sb);
it.dentry = dentry;
it.buffer = buffer;
it.buffer_size = buffer_size;
it.buffer_ofs = 0;
- it.it.sb = dentry->d_sb;
-
- ret = inline_listxattr(&it);
- if (ret >= 0 || ret == -ENOATTR)
- ret = shared_listxattr(&it);
- erofs_put_metabuf(&it.it.buf);
- return ret;
+ ret = erofs_xattr_iter_inline(&it, inode, false);
+ if (!ret || ret == -ENOATTR)
+ ret = erofs_xattr_iter_shared(&it, inode, false);
+ if (ret == -ENOATTR)
+ ret = 0;
+ erofs_put_metabuf(&it.buf);
+ return ret ? ret : it.buffer_ofs;
}
void erofs_xattr_prefixes_cleanup(struct super_block *sb)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 160b3da43aec..5f1890e309c6 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -5,7 +5,6 @@
* Copyright (C) 2022 Alibaba Cloud
*/
#include "compress.h"
-#include <linux/prefetch.h>
#include <linux/psi.h>
#include <linux/cpuhotplug.h>
#include <trace/events/erofs.h>
@@ -92,13 +91,8 @@ struct z_erofs_pcluster {
struct z_erofs_bvec compressed_bvecs[];
};
-/* let's avoid the valid 32-bit kernel addresses */
-
-/* the chained workgroup has't submitted io (still open) */
-#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE)
-/* the chained workgroup has already submitted io */
-#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD)
-
+/* the end of a chain of pclusters */
+#define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA)
#define Z_EROFS_PCLUSTER_NIL (NULL)
struct z_erofs_decompressqueue {
@@ -241,14 +235,20 @@ static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
struct z_erofs_bvec *bvec,
- struct page **candidate_bvpage)
+ struct page **candidate_bvpage,
+ struct page **pagepool)
{
- if (iter->cur == iter->nr) {
- if (!*candidate_bvpage)
- return -EAGAIN;
-
+ if (iter->cur >= iter->nr) {
+ struct page *nextpage = *candidate_bvpage;
+
+ if (!nextpage) {
+ nextpage = erofs_allocpage(pagepool, GFP_NOFS);
+ if (!nextpage)
+ return -ENOMEM;
+ set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
+ }
DBG_BUGON(iter->bvset->nextpage);
- iter->bvset->nextpage = *candidate_bvpage;
+ iter->bvset->nextpage = nextpage;
z_erofs_bvset_flip(iter);
iter->bvset->nextpage = NULL;
@@ -500,20 +500,6 @@ out_error_pcluster_pool:
enum z_erofs_pclustermode {
Z_EROFS_PCLUSTER_INFLIGHT,
/*
- * The current pclusters was the tail of an exist chain, in addition
- * that the previous processed chained pclusters are all decided to
- * be hooked up to it.
- * A new chain will be created for the remaining pclusters which are
- * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED,
- * the next pcluster cannot reuse the whole page safely for inplace I/O
- * in the following scenario:
- * ________________________________________________________________
- * | tail (partial) page | head (partial) page |
- * | (belongs to the next pcl) | (belongs to the current pcl) |
- * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
- */
- Z_EROFS_PCLUSTER_HOOKED,
- /*
* a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
* could be dispatched into bypass queue later due to uptodated managed
* pages. All related online pages cannot be reused for inplace I/O (or
@@ -530,8 +516,8 @@ enum z_erofs_pclustermode {
* ________________________________________________________________
* | tail (partial) page | head (partial) page |
* | (of the current cl) | (of the previous collection) |
- * | PCLUSTER_FOLLOWED or | |
- * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
+ * | | |
+ * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________|
*
* [ (*) the above page can be used as inplace I/O. ]
*/
@@ -543,12 +529,12 @@ struct z_erofs_decompress_frontend {
struct erofs_map_blocks map;
struct z_erofs_bvec_iter biter;
+ struct page *pagepool;
struct page *candidate_bvpage;
- struct z_erofs_pcluster *pcl, *tailpcl;
+ struct z_erofs_pcluster *pcl;
z_erofs_next_pcluster_t owned_head;
enum z_erofs_pclustermode mode;
- bool readahead;
/* used for applying cache strategy on the fly */
bool backmost;
erofs_off_t headoffset;
@@ -578,8 +564,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
return false;
}
-static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
- struct page **pagepool)
+static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
{
struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
struct z_erofs_pcluster *pcl = fe->pcl;
@@ -620,7 +605,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
* succeeds or fallback to in-place I/O instead
* to avoid any direct reclaim.
*/
- newpage = erofs_allocpage(pagepool, gfp);
+ newpage = erofs_allocpage(&fe->pagepool, gfp);
if (!newpage)
continue;
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
@@ -633,7 +618,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
if (page)
put_page(page);
else if (newpage)
- erofs_pagepool_add(pagepool, newpage);
+ erofs_pagepool_add(&fe->pagepool, newpage);
}
/*
@@ -654,7 +639,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
/*
- * refcount of workgroup is now freezed as 1,
+ * refcount of workgroup is now freezed as 0,
* therefore no need to worry about available decompression users.
*/
for (i = 0; i < pcl->pclusterpages; ++i) {
@@ -678,29 +663,73 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
return 0;
}
-int erofs_try_to_free_cached_page(struct page *page)
+static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
{
- struct z_erofs_pcluster *const pcl = (void *)page_private(page);
- int ret, i;
+ struct z_erofs_pcluster *pcl = folio_get_private(folio);
+ bool ret;
+ int i;
- if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1))
- return 0;
+ if (!folio_test_private(folio))
+ return true;
+
+ ret = false;
+ spin_lock(&pcl->obj.lockref.lock);
+ if (pcl->obj.lockref.count > 0)
+ goto out;
- ret = 0;
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
for (i = 0; i < pcl->pclusterpages; ++i) {
- if (pcl->compressed_bvecs[i].page == page) {
+ if (pcl->compressed_bvecs[i].page == &folio->page) {
WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
- ret = 1;
+ ret = true;
break;
}
}
- erofs_workgroup_unfreeze(&pcl->obj, 1);
if (ret)
- detach_page_private(page);
+ folio_detach_private(folio);
+out:
+ spin_unlock(&pcl->obj.lockref.lock);
return ret;
}
+/*
+ * It will be called only on inode eviction. In case that there are still some
+ * decompression requests in progress, wait with rescheduling for a bit here.
+ * An extra lock could be introduced instead but it seems unnecessary.
+ */
+static void z_erofs_cache_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
+{
+ const size_t stop = length + offset;
+
+ /* Check for potential overflow in debug mode */
+ DBG_BUGON(stop > folio_size(folio) || stop < length);
+
+ if (offset == 0 && stop == folio_size(folio))
+ while (!z_erofs_cache_release_folio(folio, GFP_NOFS))
+ cond_resched();
+}
+
+static const struct address_space_operations z_erofs_cache_aops = {
+ .release_folio = z_erofs_cache_release_folio,
+ .invalidate_folio = z_erofs_cache_invalidate_folio,
+};
+
+int erofs_init_managed_cache(struct super_block *sb)
+{
+ struct inode *const inode = new_inode(sb);
+
+ if (!inode)
+ return -ENOMEM;
+
+ set_nlink(inode, 1);
+ inode->i_size = OFFSET_MAX;
+ inode->i_mapping->a_ops = &z_erofs_cache_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ EROFS_SB(sb)->managed_cache = inode;
+ return 0;
+}
+
static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
struct z_erofs_bvec *bvec)
{
@@ -731,7 +760,8 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
!fe->candidate_bvpage)
fe->candidate_bvpage = bvec->page;
}
- ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage);
+ ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage,
+ &fe->pagepool);
fe->pcl->vcnt += (ret >= 0);
return ret;
}
@@ -750,19 +780,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
return;
}
- /*
- * type 2, link to the end of an existing open chain, be careful
- * that its submission is controlled by the original attached chain.
- */
- if (*owned_head != &pcl->next && pcl != f->tailpcl &&
- cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
- *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
- *owned_head = Z_EROFS_PCLUSTER_TAIL;
- f->mode = Z_EROFS_PCLUSTER_HOOKED;
- f->tailpcl = NULL;
- return;
- }
- /* type 3, it belongs to a chain, but it isn't the end of the chain */
+ /* type 2, it belongs to an ongoing chain */
f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
}
@@ -786,7 +804,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
if (IS_ERR(pcl))
return PTR_ERR(pcl);
- atomic_set(&pcl->obj.refcount, 1);
+ spin_lock_init(&pcl->obj.lockref.lock);
pcl->algorithmformat = map->m_algorithmformat;
pcl->length = 0;
pcl->partial = true;
@@ -823,9 +841,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
goto err_out;
}
}
- /* used to check tail merging loop due to corrupted images */
- if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
- fe->tailpcl = pcl;
fe->owned_head = &pcl->next;
fe->pcl = pcl;
return 0;
@@ -846,7 +861,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
- DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
if (!(map->m_flags & EROFS_MAP_META)) {
grp = erofs_find_workgroup(fe->inode->i_sb,
@@ -865,10 +879,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
if (ret == -EEXIST) {
mutex_lock(&fe->pcl->lock);
- /* used to check tail merging loop due to corrupted images */
- if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
- fe->tailpcl = fe->pcl;
-
z_erofs_try_to_claim_pcluster(fe);
} else if (ret) {
return ret;
@@ -908,10 +918,8 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
z_erofs_bvec_iter_end(&fe->biter);
mutex_unlock(&pcl->lock);
- if (fe->candidate_bvpage) {
- DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage));
+ if (fe->candidate_bvpage)
fe->candidate_bvpage = NULL;
- }
/*
* if all pending pages are added, don't hold its reference
@@ -958,7 +966,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page, struct page **pagepool)
+ struct page *page)
{
struct inode *const inode = fe->inode;
struct erofs_map_blocks *const map = &fe->map;
@@ -1016,7 +1024,7 @@ repeat:
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
} else {
/* bind cache first when cached decompression is preferred */
- z_erofs_bind_cache(fe, pagepool);
+ z_erofs_bind_cache(fe);
}
hitted:
/*
@@ -1025,8 +1033,7 @@ hitted:
* those chains are handled asynchronously thus the page cannot be used
* for inplace I/O or bvpage (should be processed in a strict order.)
*/
- tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
- fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
+ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
cur = end - min_t(unsigned int, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
@@ -1056,24 +1063,13 @@ hitted:
if (cur)
tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
-retry:
err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
.page = page,
.offset = offset - map->m_la,
.end = end,
}), exclusive);
- /* should allocate an additional short-lived page for bvset */
- if (err == -EAGAIN && !fe->candidate_bvpage) {
- fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL);
- set_page_private(fe->candidate_bvpage,
- Z_EROFS_SHORTLIVED_PAGE);
- goto retry;
- }
-
- if (err) {
- DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage);
+ if (err)
goto out;
- }
z_erofs_onlinepage_split(page);
/* bump up the number of spiltted parts of a page */
@@ -1104,7 +1100,7 @@ out:
return err;
}
-static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
+static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
unsigned int readahead_pages)
{
/* auto: enable for read_folio, disable for readahead */
@@ -1283,6 +1279,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
struct z_erofs_pcluster *pcl = be->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ const struct z_erofs_decompressor *decompressor =
+ &erofs_decompressors[pcl->algorithmformat];
unsigned int i, inputsize;
int err2;
struct page *page;
@@ -1326,7 +1324,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
else
inputsize = pclusterpages * PAGE_SIZE;
- err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
+ err = decompressor->decompress(&(struct z_erofs_decompress_req) {
.sb = be->sb,
.in = be->compressed_pages,
.out = be->decompressed_pages,
@@ -1404,10 +1402,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
};
z_erofs_next_pcluster_t owned = io->head;
- while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
- /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
- DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
- /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
+ while (owned != Z_EROFS_PCLUSTER_TAIL) {
DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
be.pcl = container_of(owned, struct z_erofs_pcluster, next);
@@ -1424,7 +1419,7 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
container_of(work, struct z_erofs_decompressqueue, u.work);
struct page *pagepool = NULL;
- DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
z_erofs_decompress_queue(bgq, &pagepool);
erofs_release_pages(&pagepool);
kvfree(bgq);
@@ -1452,7 +1447,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
if (atomic_add_return(bios, &io->pending_bios))
return;
/* Use (kthread_)work and sync decompression for atomic contexts only */
- if (in_atomic() || irqs_disabled()) {
+ if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) {
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
struct kthread_worker *worker;
@@ -1612,7 +1607,7 @@ fg_out:
q->sync = true;
}
q->sb = sb;
- q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
+ q->head = Z_EROFS_PCLUSTER_TAIL;
return q;
}
@@ -1630,11 +1625,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
- DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
- if (owned_head == Z_EROFS_PCLUSTER_TAIL)
- owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
-
- WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
WRITE_ONCE(*submit_qtail, owned_head);
WRITE_ONCE(*bypass_qtail, &pcl->next);
@@ -1668,9 +1659,8 @@ static void z_erofs_decompressqueue_endio(struct bio *bio)
}
static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
- struct page **pagepool,
struct z_erofs_decompressqueue *fgq,
- bool *force_fg)
+ bool *force_fg, bool readahead)
{
struct super_block *sb = f->inode->i_sb;
struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
@@ -1705,15 +1695,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
unsigned int i = 0;
bool bypass = true;
- /* no possible 'owned_head' equals the following */
- DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
-
pcl = container_of(owned_head, struct z_erofs_pcluster, next);
+ owned_head = READ_ONCE(pcl->next);
- /* close the main owned chain at first */
- owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
- Z_EROFS_PCLUSTER_TAIL_CLOSED);
if (z_erofs_is_inline_pcluster(pcl)) {
move_to_bypass_jobqueue(pcl, qtail, owned_head);
continue;
@@ -1731,8 +1716,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
do {
struct page *page;
- page = pickup_page_for_submission(pcl, i++, pagepool,
- mc);
+ page = pickup_page_for_submission(pcl, i++,
+ &f->pagepool, mc);
if (!page)
continue;
@@ -1761,7 +1746,7 @@ submit_bio_retry:
bio->bi_iter.bi_sector = (sector_t)cur <<
(sb->s_blocksize_bits - 9);
bio->bi_private = q[JQ_SUBMIT];
- if (f->readahead)
+ if (readahead)
bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
}
@@ -1797,16 +1782,16 @@ submit_bio_retry:
}
static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
- struct page **pagepool, bool force_fg)
+ bool force_fg, bool ra)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
- z_erofs_submit_queue(f, pagepool, io, &force_fg);
+ z_erofs_submit_queue(f, io, &force_fg, ra);
/* handle bypass queue (no i/o pclusters) immediately */
- z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
+ z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
if (!force_fg)
return;
@@ -1815,7 +1800,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
wait_for_completion_io(&io[JQ_SUBMIT].u.done);
/* handle synchronous decompress queue in the caller context */
- z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
+ z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool);
}
/*
@@ -1823,29 +1808,28 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
* approximate readmore strategies as a start.
*/
static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
- struct readahead_control *rac,
- erofs_off_t end,
- struct page **pagepool,
- bool backmost)
+ struct readahead_control *rac, bool backmost)
{
struct inode *inode = f->inode;
struct erofs_map_blocks *map = &f->map;
- erofs_off_t cur;
+ erofs_off_t cur, end, headoffset = f->headoffset;
int err;
if (backmost) {
+ if (rac)
+ end = headoffset + readahead_length(rac) - 1;
+ else
+ end = headoffset + PAGE_SIZE - 1;
map->m_la = end;
err = z_erofs_map_blocks_iter(inode, map,
EROFS_GET_BLOCKS_READMORE);
if (err)
return;
- /* expend ra for the trailing edge if readahead */
+ /* expand ra for the trailing edge if readahead */
if (rac) {
- loff_t newstart = readahead_pos(rac);
-
cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
- readahead_expand(rac, newstart, cur - newstart);
+ readahead_expand(rac, headoffset, cur - headoffset);
return;
}
end = round_up(end, PAGE_SIZE);
@@ -1866,7 +1850,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
if (PageUptodate(page)) {
unlock_page(page);
} else {
- err = z_erofs_do_read_page(f, page, pagepool);
+ err = z_erofs_do_read_page(f, page);
if (err)
erofs_err(inode->i_sb,
"readmore error at page %lu @ nid %llu",
@@ -1887,28 +1871,24 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
struct inode *const inode = page->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *pagepool = NULL;
int err;
trace_erofs_readpage(page, false);
f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
- z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
- &pagepool, true);
- err = z_erofs_do_read_page(&f, page, &pagepool);
- z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
-
+ z_erofs_pcluster_readmore(&f, NULL, true);
+ err = z_erofs_do_read_page(&f, page);
+ z_erofs_pcluster_readmore(&f, NULL, false);
(void)z_erofs_collector_end(&f);
/* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(&f, &pagepool,
- z_erofs_get_sync_decompress_policy(sbi, 0));
+ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);
if (err)
erofs_err(inode->i_sb, "failed to read, err [%d]", err);
erofs_put_metabuf(&f.map.buf);
- erofs_release_pages(&pagepool);
+ erofs_release_pages(&f.pagepool);
return err;
}
@@ -1917,14 +1897,12 @@ static void z_erofs_readahead(struct readahead_control *rac)
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *pagepool = NULL, *head = NULL, *page;
+ struct page *head = NULL, *page;
unsigned int nr_pages;
- f.readahead = true;
f.headoffset = readahead_pos(rac);
- z_erofs_pcluster_readmore(&f, rac, f.headoffset +
- readahead_length(rac) - 1, &pagepool, true);
+ z_erofs_pcluster_readmore(&f, rac, true);
nr_pages = readahead_count(rac);
trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
@@ -1940,20 +1918,19 @@ static void z_erofs_readahead(struct readahead_control *rac)
/* traversal in reverse order */
head = (void *)page_private(page);
- err = z_erofs_do_read_page(&f, page, &pagepool);
+ err = z_erofs_do_read_page(&f, page);
if (err)
erofs_err(inode->i_sb,
"readahead error at page %lu @ nid %llu",
page->index, EROFS_I(inode)->nid);
put_page(page);
}
- z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
+ z_erofs_pcluster_readmore(&f, rac, false);
(void)z_erofs_collector_end(&f);
- z_erofs_runqueue(&f, &pagepool,
- z_erofs_get_sync_decompress_policy(sbi, nr_pages));
+ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true);
erofs_put_metabuf(&f.map.buf);
- erofs_release_pages(&pagepool);
+ erofs_release_pages(&f.pagepool);
}
const struct address_space_operations z_erofs_aops = {
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index d37c5c89c728..1909ddafd9c7 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -22,8 +22,8 @@ struct z_erofs_maprecorder {
bool partialref;
};
-static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned long lcn)
+static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
+ unsigned long lcn)
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
@@ -129,7 +129,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
u8 *in, type;
bool big_pcluster;
- if (1 << amortizedshift == 4)
+ if (1 << amortizedshift == 4 && lclusterbits <= 14)
vcnt = 2;
else if (1 << amortizedshift == 2 && lclusterbits == 12)
vcnt = 16;
@@ -226,12 +226,11 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
return 0;
}
-static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned long lcn, bool lookahead)
+static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
+ unsigned long lcn, bool lookahead)
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
const erofs_off_t ebase = sizeof(struct z_erofs_map_header) +
ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
unsigned int totalidx = erofs_iblks(inode);
@@ -239,9 +238,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int amortizedshift;
erofs_off_t pos;
- if (lclusterbits != 12)
- return -EOPNOTSUPP;
-
if (lcn >= totalidx)
return -EINVAL;
@@ -281,23 +277,23 @@ out:
return unpack_compacted_index(m, amortizedshift, pos, lookahead);
}
-static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned int lcn, bool lookahead)
+static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m,
+ unsigned int lcn, bool lookahead)
{
- const unsigned int datamode = EROFS_I(m->inode)->datalayout;
-
- if (datamode == EROFS_INODE_COMPRESSED_FULL)
- return legacy_load_cluster_from_disk(m, lcn);
-
- if (datamode == EROFS_INODE_COMPRESSED_COMPACT)
- return compacted_load_cluster_from_disk(m, lcn, lookahead);
-
- return -EINVAL;
+ switch (EROFS_I(m->inode)->datalayout) {
+ case EROFS_INODE_COMPRESSED_FULL:
+ return z_erofs_load_full_lcluster(m, lcn);
+ case EROFS_INODE_COMPRESSED_COMPACT:
+ return z_erofs_load_compact_lcluster(m, lcn, lookahead);
+ default:
+ return -EINVAL;
+ }
}
static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
unsigned int lookback_distance)
{
+ struct super_block *sb = m->inode->i_sb;
struct erofs_inode *const vi = EROFS_I(m->inode);
const unsigned int lclusterbits = vi->z_logical_clusterbits;
@@ -305,21 +301,15 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
unsigned long lcn = m->lcn - lookback_distance;
int err;
- /* load extent head logical cluster if needed */
- err = z_erofs_load_cluster_from_disk(m, lcn, false);
+ err = z_erofs_load_lcluster_from_disk(m, lcn, false);
if (err)
return err;
switch (m->type) {
case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
- if (!m->delta[0]) {
- erofs_err(m->inode->i_sb,
- "invalid lookback distance 0 @ nid %llu",
- vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
lookback_distance = m->delta[0];
+ if (!lookback_distance)
+ goto err_bogus;
continue;
case Z_EROFS_LCLUSTER_TYPE_PLAIN:
case Z_EROFS_LCLUSTER_TYPE_HEAD1:
@@ -328,16 +318,15 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
return 0;
default:
- erofs_err(m->inode->i_sb,
- "unknown type %u @ lcn %lu of nid %llu",
+ erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu",
m->type, lcn, vi->nid);
DBG_BUGON(1);
return -EOPNOTSUPP;
}
}
-
- erofs_err(m->inode->i_sb, "bogus lookback distance @ nid %llu",
- vi->nid);
+err_bogus:
+ erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu",
+ lookback_distance, m->lcn, vi->nid);
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -369,7 +358,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
if (m->compressedblks)
goto out;
- err = z_erofs_load_cluster_from_disk(m, lcn, false);
+ err = z_erofs_load_lcluster_from_disk(m, lcn, false);
if (err)
return err;
@@ -401,9 +390,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
break;
fallthrough;
default:
- erofs_err(m->inode->i_sb,
- "cannot found CBLKCNT @ lcn %lu of nid %llu",
- lcn, vi->nid);
+ erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn,
+ vi->nid);
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -411,9 +399,7 @@ out:
map->m_plen = erofs_pos(sb, m->compressedblks);
return 0;
err_bonus_cblkcnt:
- erofs_err(m->inode->i_sb,
- "bogus CBLKCNT @ lcn %lu of nid %llu",
- lcn, vi->nid);
+ erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -434,7 +420,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return 0;
}
- err = z_erofs_load_cluster_from_disk(m, lcn, true);
+ err = z_erofs_load_lcluster_from_disk(m, lcn, true);
if (err)
return err;
@@ -481,7 +467,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
initial_lcn = ofs >> lclusterbits;
endoff = ofs & ((1 << lclusterbits) - 1);
- err = z_erofs_load_cluster_from_disk(&m, initial_lcn, false);
+ err = z_erofs_load_lcluster_from_disk(&m, initial_lcn, false);
if (err)
goto unmap_out;
@@ -539,8 +525,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if (flags & EROFS_GET_BLOCKS_FINDTAIL) {
vi->z_tailextent_headlcn = m.lcn;
/* for non-compact indexes, fragmentoff is 64 bits */
- if (fragment &&
- vi->datalayout == EROFS_INODE_COMPRESSED_FULL)
+ if (fragment && vi->datalayout == EROFS_INODE_COMPRESSED_FULL)
vi->z_fragmentoff |= (u64)m.pblk << 32;
}
if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) {
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 95850a13ce8d..8aa36cd37351 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -33,17 +33,17 @@ struct eventfd_ctx {
/*
* Every time that a write(2) is performed on an eventfd, the
* value of the __u64 being written is added to "count" and a
- * wakeup is performed on "wqh". A read(2) will return the "count"
- * value to userspace, and will reset "count" to zero. The kernel
- * side eventfd_signal() also, adds to the "count" counter and
- * issue a wakeup.
+ * wakeup is performed on "wqh". If EFD_SEMAPHORE flag was not
+ * specified, a read(2) will return the "count" value to userspace,
+ * and will reset "count" to zero. The kernel side eventfd_signal()
+ * also, adds to the "count" counter and issue a wakeup.
*/
__u64 count;
unsigned int flags;
int id;
};
-__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
+__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask)
{
unsigned long flags;
@@ -301,6 +301,8 @@ static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
(unsigned long long)ctx->count);
spin_unlock_irq(&ctx->wqh.lock);
seq_printf(m, "eventfd-id: %d\n", ctx->id);
+ seq_printf(m, "eventfd-semaphore: %d\n",
+ !!(ctx->flags & EFD_SEMAPHORE));
}
#endif
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 980483455cc0..4b1b3362f697 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -536,7 +536,7 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
#else
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
- unsigned pollflags)
+ __poll_t pollflags)
{
wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
}
@@ -1805,7 +1805,11 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
{
int ret = default_wake_function(wq_entry, mode, sync, key);
- list_del_init(&wq_entry->entry);
+ /*
+ * Pairs with list_empty_careful in ep_poll, and ensures future loop
+ * iterations see the cause of this wakeup.
+ */
+ list_del_init_careful(&wq_entry->entry);
return ret;
}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index c1edde817be8..1f72f977c6db 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -324,17 +324,15 @@ static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb,
struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
ext4_group_t group)
{
- struct ext4_group_info **grp_info;
- long indexv, indexh;
-
- if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) {
- ext4_error(sb, "invalid group %u", group);
- return NULL;
- }
- indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
- indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
- grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
- return grp_info[indexh];
+ struct ext4_group_info **grp_info;
+ long indexv, indexh;
+
+ if (unlikely(group >= EXT4_SB(sb)->s_groups_count))
+ return NULL;
+ indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
+ indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
+ grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
+ return grp_info[indexh];
}
/*
@@ -886,7 +884,10 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
if (!ext4_bg_has_super(sb, group))
return 0;
- return EXT4_SB(sb)->s_gdb_count;
+ if (ext4_has_feature_meta_bg(sb))
+ return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+ else
+ return EXT4_SB(sb)->s_gdb_count;
}
/**
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 45b579805c95..0caf6c730ce3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3834,19 +3834,10 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
return retval;
}
- /*
- * We need to protect against old.inode directory getting converted
- * from inline directory format into a normal one.
- */
- if (S_ISDIR(old.inode->i_mode))
- inode_lock_nested(old.inode, I_MUTEX_NONDIR2);
-
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de,
&old.inlined);
- if (IS_ERR(old.bh)) {
- retval = PTR_ERR(old.bh);
- goto unlock_moved_dir;
- }
+ if (IS_ERR(old.bh))
+ return PTR_ERR(old.bh);
/*
* Check for inode number is _not_ due to possible IO errors.
@@ -4043,10 +4034,6 @@ release_bh:
brelse(old.bh);
brelse(new.bh);
-unlock_moved_dir:
- if (S_ISDIR(old.inode->i_mode))
- inode_unlock(old.inode);
-
return retval;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 56a5d1c469fc..05fcecc36244 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -6388,7 +6388,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
struct ext4_mount_options old_opts;
ext4_group_t g;
int err = 0;
- int enable_rw = 0;
#ifdef CONFIG_QUOTA
int enable_quota = 0;
int i, j;
@@ -6575,7 +6574,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
if (err)
goto restore_opts;
- enable_rw = 1;
+ sb->s_flags &= ~SB_RDONLY;
if (ext4_has_feature_mmp(sb)) {
err = ext4_multi_mount_protect(sb,
le64_to_cpu(es->s_mmp_block));
@@ -6622,9 +6621,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
ext4_release_system_zone(sb);
- if (enable_rw)
- sb->s_flags &= ~SB_RDONLY;
-
/*
* Reinitialize lazy itable initialization thread based on
* current settings
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 13d7f17a9c8c..321e3a888c20 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2056,8 +2056,9 @@ inserted:
else {
u32 ref;
+#ifdef EXT4_XATTR_DEBUG
WARN_ON_ONCE(dquot_initialize_needed(inode));
-
+#endif
/* The old block is released after updating
the inode. */
error = dquot_alloc_block(inode,
@@ -2120,8 +2121,9 @@ inserted:
/* We need to allocate a new block */
ext4_fsblk_t goal, block;
+#ifdef EXT4_XATTR_DEBUG
WARN_ON_ONCE(dquot_initialize_needed(inode));
-
+#endif
goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
block = ext4_new_meta_blocks(handle, inode, goal, 0,
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 77a71276ecb1..ad597b417fea 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -995,20 +995,12 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto out;
}
- /*
- * Copied from ext4_rename: we need to protect against old.inode
- * directory getting converted from inline directory format into
- * a normal one.
- */
- if (S_ISDIR(old_inode->i_mode))
- inode_lock_nested(old_inode, I_MUTEX_NONDIR2);
-
err = -ENOENT;
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_entry) {
if (IS_ERR(old_page))
err = PTR_ERR(old_page);
- goto out_unlock_old;
+ goto out;
}
if (S_ISDIR(old_inode->i_mode)) {
@@ -1116,9 +1108,6 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
f2fs_unlock_op(sbi);
- if (S_ISDIR(old_inode->i_mode))
- inode_unlock(old_inode);
-
if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
f2fs_sync_fs(sbi->sb, 1);
@@ -1133,9 +1122,6 @@ out_dir:
f2fs_put_page(old_dir_page, 0);
out_old:
f2fs_put_page(old_page, 0);
-out_unlock_old:
- if (S_ISDIR(old_inode->i_mode))
- inode_unlock(old_inode);
out:
iput(whiteout);
return err;
diff --git a/fs/file_table.c b/fs/file_table.c
index 372653b92617..e06c68e2d757 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -44,18 +44,40 @@ static struct kmem_cache *filp_cachep __read_mostly;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
+/* Container for backing file with optional real path */
+struct backing_file {
+ struct file file;
+ struct path real_path;
+};
+
+static inline struct backing_file *backing_file(struct file *f)
+{
+ return container_of(f, struct backing_file, file);
+}
+
+struct path *backing_file_real_path(struct file *f)
+{
+ return &backing_file(f)->real_path;
+}
+EXPORT_SYMBOL_GPL(backing_file_real_path);
+
static void file_free_rcu(struct rcu_head *head)
{
struct file *f = container_of(head, struct file, f_rcuhead);
put_cred(f->f_cred);
- kmem_cache_free(filp_cachep, f);
+ if (unlikely(f->f_mode & FMODE_BACKING))
+ kfree(backing_file(f));
+ else
+ kmem_cache_free(filp_cachep, f);
}
static inline void file_free(struct file *f)
{
security_file_free(f);
- if (!(f->f_mode & FMODE_NOACCOUNT))
+ if (unlikely(f->f_mode & FMODE_BACKING))
+ path_put(backing_file_real_path(f));
+ if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
percpu_counter_dec(&nr_files);
call_rcu(&f->f_rcuhead, file_free_rcu);
}
@@ -131,20 +153,15 @@ static int __init init_fs_stat_sysctls(void)
fs_initcall(init_fs_stat_sysctls);
#endif
-static struct file *__alloc_file(int flags, const struct cred *cred)
+static int init_file(struct file *f, int flags, const struct cred *cred)
{
- struct file *f;
int error;
- f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
- if (unlikely(!f))
- return ERR_PTR(-ENOMEM);
-
f->f_cred = get_cred(cred);
error = security_file_alloc(f);
if (unlikely(error)) {
file_free_rcu(&f->f_rcuhead);
- return ERR_PTR(error);
+ return error;
}
atomic_long_set(&f->f_count, 1);
@@ -155,7 +172,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred)
f->f_mode = OPEN_FMODE(flags);
/* f->f_version: 0 */
- return f;
+ return 0;
}
/* Find an unused file structure and return a pointer to it.
@@ -172,6 +189,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
{
static long old_max;
struct file *f;
+ int error;
/*
* Privileged users can go above max_files
@@ -185,9 +203,15 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
goto over;
}
- f = __alloc_file(flags, cred);
- if (!IS_ERR(f))
- percpu_counter_inc(&nr_files);
+ f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ if (unlikely(!f))
+ return ERR_PTR(-ENOMEM);
+
+ error = init_file(f, flags, cred);
+ if (unlikely(error))
+ return ERR_PTR(error);
+
+ percpu_counter_inc(&nr_files);
return f;
@@ -203,18 +227,51 @@ over:
/*
* Variant of alloc_empty_file() that doesn't check and modify nr_files.
*
- * Should not be used unless there's a very good reason to do so.
+ * This is only for kernel internal use, and the allocate file must not be
+ * installed into file tables or such.
*/
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
- struct file *f = __alloc_file(flags, cred);
+ struct file *f;
+ int error;
- if (!IS_ERR(f))
- f->f_mode |= FMODE_NOACCOUNT;
+ f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ if (unlikely(!f))
+ return ERR_PTR(-ENOMEM);
+
+ error = init_file(f, flags, cred);
+ if (unlikely(error))
+ return ERR_PTR(error);
+
+ f->f_mode |= FMODE_NOACCOUNT;
return f;
}
+/*
+ * Variant of alloc_empty_file() that allocates a backing_file container
+ * and doesn't check and modify nr_files.
+ *
+ * This is only for kernel internal use, and the allocate file must not be
+ * installed into file tables or such.
+ */
+struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
+{
+ struct backing_file *ff;
+ int error;
+
+ ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL);
+ if (unlikely(!ff))
+ return ERR_PTR(-ENOMEM);
+
+ error = init_file(&ff->file, flags, cred);
+ if (unlikely(error))
+ return ERR_PTR(error);
+
+ ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
+ return &ff->file;
+}
+
/**
* alloc_file - allocate and initialize a 'struct file'
*
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 24ce12f0db32..851214d1d013 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -561,7 +561,8 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -ENOMEM;
}
- ctx->legacy_data[size++] = ',';
+ if (size)
+ ctx->legacy_data[size++] = ',';
len = strlen(param->key);
memcpy(ctx->legacy_data + size, param->key, len);
size += len;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 300844f50dcd..cb62c8f07d1e 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -784,9 +784,13 @@ static inline bool should_fault_in_pages(struct iov_iter *i,
if (!user_backed_iter(i))
return false;
+ /*
+ * Try to fault in multiple pages initially. When that doesn't result
+ * in any progress, fall back to a single page.
+ */
size = PAGE_SIZE;
offs = offset_in_page(iocb->ki_pos);
- if (*prev_count != count || !*window_size) {
+ if (*prev_count != count) {
size_t nr_dirtied;
nr_dirtied = max(current->nr_dirtied_pause -
@@ -870,6 +874,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
struct gfs2_inode *ip = GFS2_I(inode);
size_t prev_count = 0, window_size = 0;
size_t written = 0;
+ bool enough_retries;
ssize_t ret;
/*
@@ -913,11 +918,17 @@ retry:
if (ret > 0)
written = ret;
+ enough_retries = prev_count == iov_iter_count(from) &&
+ window_size <= PAGE_SIZE;
if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
gfs2_glock_dq(gh);
window_size -= fault_in_iov_iter_readable(from, window_size);
- if (window_size)
- goto retry;
+ if (window_size) {
+ if (!enough_retries)
+ goto retry;
+ /* fall back to buffered I/O */
+ ret = 0;
+ }
}
out_unlock:
if (gfs2_holder_queued(gh))
diff --git a/fs/inode.c b/fs/inode.c
index 577799b7855f..53ae3b76d232 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1104,9 +1104,51 @@ void discard_new_inode(struct inode *inode)
EXPORT_SYMBOL(discard_new_inode);
/**
+ * lock_two_inodes - lock two inodes (may be regular files but also dirs)
+ *
+ * Lock any non-NULL argument. The caller must make sure that if he is passing
+ * in two directories, one is not ancestor of the other. Zero, one or two
+ * objects may be locked by this function.
+ *
+ * @inode1: first inode to lock
+ * @inode2: second inode to lock
+ * @subclass1: inode lock subclass for the first lock obtained
+ * @subclass2: inode lock subclass for the second lock obtained
+ */
+void lock_two_inodes(struct inode *inode1, struct inode *inode2,
+ unsigned subclass1, unsigned subclass2)
+{
+ if (!inode1 || !inode2) {
+ /*
+ * Make sure @subclass1 will be used for the acquired lock.
+ * This is not strictly necessary (no current caller cares) but
+ * let's keep things consistent.
+ */
+ if (!inode1)
+ swap(inode1, inode2);
+ goto lock;
+ }
+
+ /*
+ * If one object is directory and the other is not, we must make sure
+ * to lock directory first as the other object may be its child.
+ */
+ if (S_ISDIR(inode2->i_mode) == S_ISDIR(inode1->i_mode)) {
+ if (inode1 > inode2)
+ swap(inode1, inode2);
+ } else if (!S_ISDIR(inode1->i_mode))
+ swap(inode1, inode2);
+lock:
+ if (inode1)
+ inode_lock_nested(inode1, subclass1);
+ if (inode2 && inode2 != inode1)
+ inode_lock_nested(inode2, subclass2);
+}
+
+/**
* lock_two_nondirectories - take two i_mutexes on non-directory objects
*
- * Lock any non-NULL argument that is not a directory.
+ * Lock any non-NULL argument. Passed objects must not be directories.
* Zero, one or two objects may be locked by this function.
*
* @inode1: first inode to lock
@@ -1114,13 +1156,9 @@ EXPORT_SYMBOL(discard_new_inode);
*/
void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
- if (inode1 > inode2)
- swap(inode1, inode2);
-
- if (inode1 && !S_ISDIR(inode1->i_mode))
- inode_lock(inode1);
- if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
- inode_lock_nested(inode2, I_MUTEX_NONDIR2);
+ WARN_ON_ONCE(S_ISDIR(inode1->i_mode));
+ WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
+ lock_two_inodes(inode1, inode2, I_MUTEX_NORMAL, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);
@@ -1131,10 +1169,14 @@ EXPORT_SYMBOL(lock_two_nondirectories);
*/
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
- if (inode1 && !S_ISDIR(inode1->i_mode))
+ if (inode1) {
+ WARN_ON_ONCE(S_ISDIR(inode1->i_mode));
inode_unlock(inode1);
- if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
+ }
+ if (inode2 && inode2 != inode1) {
+ WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
inode_unlock(inode2);
+ }
}
EXPORT_SYMBOL(unlock_two_nondirectories);
diff --git a/fs/internal.h b/fs/internal.h
index bd3b2810a36b..f7a3dc111026 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -97,8 +97,9 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
/*
* file_table.c
*/
-extern struct file *alloc_empty_file(int, const struct cred *);
-extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
+struct file *alloc_empty_file(int flags, const struct cred *cred);
+struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
+struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
static inline void put_file_access(struct file *file)
{
@@ -121,6 +122,47 @@ extern bool mount_capable(struct fs_context *);
int sb_init_dio_done_wq(struct super_block *sb);
/*
+ * Prepare superblock for changing its read-only state (i.e., either remount
+ * read-write superblock read-only or vice versa). After this function returns
+ * mnt_is_readonly() will return true for any mount of the superblock if its
+ * caller is able to observe any changes done by the remount. This holds until
+ * sb_end_ro_state_change() is called.
+ */
+static inline void sb_start_ro_state_change(struct super_block *sb)
+{
+ WRITE_ONCE(sb->s_readonly_remount, 1);
+ /*
+ * For RO->RW transition, the barrier pairs with the barrier in
+ * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY
+ * cleared, it will see s_readonly_remount set.
+ * For RW->RO transition, the barrier pairs with the barrier in
+ * __mnt_want_write() before the mnt_is_readonly() check. The barrier
+ * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already
+ * cleared, it will see s_readonly_remount set.
+ */
+ smp_wmb();
+}
+
+/*
+ * Ends section changing read-only state of the superblock. After this function
+ * returns if mnt_is_readonly() returns false, the caller will be able to
+ * observe all the changes remount did to the superblock.
+ */
+static inline void sb_end_ro_state_change(struct super_block *sb)
+{
+ /*
+ * This barrier provides release semantics that pairs with
+ * the smp_rmb() acquire semantics in mnt_is_readonly().
+ * This barrier pair ensure that when mnt_is_readonly() sees
+ * 0 for sb->s_readonly_remount, it will also see all the
+ * preceding flag changes that were made during the RO state
+ * change.
+ */
+ smp_wmb();
+ WRITE_ONCE(sb->s_readonly_remount, 0);
+}
+
+/*
* open.c
*/
struct open_flags {
@@ -152,6 +194,8 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry);
bool in_group_or_capable(struct mnt_idmap *idmap,
const struct inode *inode, vfsgid_t vfsgid);
+void lock_two_inodes(struct inode *inode1, struct inode *inode2,
+ unsigned subclass1, unsigned subclass2);
/*
* fs-writeback.c
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 837cd55fd4c5..6ae9d6fefb86 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -211,7 +211,10 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
ic->scan_dents = NULL;
cond_resched();
}
- jffs2_build_xattr_subsystem(c);
+ ret = jffs2_build_xattr_subsystem(c);
+ if (ret)
+ goto exit;
+
c->flags &= ~JFFS2_SB_FLAG_BUILDING;
dbg_fsbuild("FS build complete\n");
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index aa4048a27f31..3b6bdc9a49e1 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -772,10 +772,10 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
}
#define XREF_TMPHASH_SIZE (128)
-void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
+int jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
{
struct jffs2_xattr_ref *ref, *_ref;
- struct jffs2_xattr_ref *xref_tmphash[XREF_TMPHASH_SIZE];
+ struct jffs2_xattr_ref **xref_tmphash;
struct jffs2_xattr_datum *xd, *_xd;
struct jffs2_inode_cache *ic;
struct jffs2_raw_node_ref *raw;
@@ -784,9 +784,12 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING));
+ xref_tmphash = kcalloc(XREF_TMPHASH_SIZE,
+ sizeof(struct jffs2_xattr_ref *), GFP_KERNEL);
+ if (!xref_tmphash)
+ return -ENOMEM;
+
/* Phase.1 : Merge same xref */
- for (i=0; i < XREF_TMPHASH_SIZE; i++)
- xref_tmphash[i] = NULL;
for (ref=c->xref_temp; ref; ref=_ref) {
struct jffs2_xattr_ref *tmp;
@@ -884,6 +887,8 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
"%u of xref (%u dead, %u orphan) found.\n",
xdatum_count, xdatum_unchecked_count, xdatum_orphan_count,
xref_count, xref_dead_count, xref_orphan_count);
+ kfree(xref_tmphash);
+ return 0;
}
struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 720007b2fd65..1b5030a3349d 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -71,7 +71,7 @@ static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref)
#ifdef CONFIG_JFFS2_FS_XATTR
extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c);
-extern void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c);
+extern int jffs2_build_xattr_subsystem(struct jffs2_sb_info *c);
extern void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c);
extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
@@ -103,7 +103,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
#else
#define jffs2_init_xattr_subsystem(c)
-#define jffs2_build_xattr_subsystem(c)
+#define jffs2_build_xattr_subsystem(c) (0)
#define jffs2_clear_xattr_subsystem(c)
#define jffs2_xattr_do_crccheck_inode(c, ic)
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index b29d68b5eec5..494b9f4043cf 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -876,7 +876,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip,
tid_t tid;
ino_t ino = 0;
struct component_name dname;
- int ssize; /* source pathname size */
+ u32 ssize; /* source pathname size */
struct btstack btstack;
struct inode *ip = d_inode(dentry);
s64 xlen = 0;
@@ -957,7 +957,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip,
if (ssize > sizeof (JFS_IP(ip)->i_inline))
JFS_IP(ip)->mode2 &= ~INLINEEA;
- jfs_info("jfs_symlink: fast symlink added ssize:%d name:%s ",
+ jfs_info("jfs_symlink: fast symlink added ssize:%u name:%s ",
ssize, name);
}
/*
@@ -987,7 +987,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip,
ip->i_size = ssize - 1;
while (ssize) {
/* This is kind of silly since PATH_MAX == 4K */
- int copy_size = min(ssize, PSIZE);
+ u32 copy_size = min_t(u32, ssize, PSIZE);
mp = get_metapage(ip, xaddr, PSIZE, 1);
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 04ba95b83d16..22d3ff3818f5 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -355,7 +355,6 @@ static int lockd_get(void)
int error;
if (nlmsvc_serv) {
- svc_get(nlmsvc_serv);
nlmsvc_users++;
return 0;
}
diff --git a/fs/namei.c b/fs/namei.c
index e4fe0879ae55..91171da719c5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3028,8 +3028,8 @@ static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
return p;
}
- inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
- inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
+ lock_two_inodes(p1->d_inode, p2->d_inode,
+ I_MUTEX_PARENT, I_MUTEX_PARENT2);
return NULL;
}
@@ -3703,7 +3703,7 @@ static int vfs_tmpfile(struct mnt_idmap *idmap,
}
/**
- * vfs_tmpfile_open - open a tmpfile for kernel internal use
+ * kernel_tmpfile_open - open a tmpfile for kernel internal use
* @idmap: idmap of the mount the inode was found from
* @parentpath: path of the base directory
* @mode: mode of the new tmpfile
@@ -3714,24 +3714,26 @@ static int vfs_tmpfile(struct mnt_idmap *idmap,
* hence this is only for kernel internal use, and must not be installed into
* file tables or such.
*/
-struct file *vfs_tmpfile_open(struct mnt_idmap *idmap,
- const struct path *parentpath,
- umode_t mode, int open_flag, const struct cred *cred)
+struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
+ const struct path *parentpath,
+ umode_t mode, int open_flag,
+ const struct cred *cred)
{
struct file *file;
int error;
file = alloc_empty_file_noaccount(open_flag, cred);
- if (!IS_ERR(file)) {
- error = vfs_tmpfile(idmap, parentpath, file, mode);
- if (error) {
- fput(file);
- file = ERR_PTR(error);
- }
+ if (IS_ERR(file))
+ return file;
+
+ error = vfs_tmpfile(idmap, parentpath, file, mode);
+ if (error) {
+ fput(file);
+ file = ERR_PTR(error);
}
return file;
}
-EXPORT_SYMBOL(vfs_tmpfile_open);
+EXPORT_SYMBOL(kernel_tmpfile_open);
static int do_tmpfile(struct nameidata *nd, unsigned flags,
const struct open_flags *op,
@@ -4731,7 +4733,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
* story.
* c) we have to lock _four_ objects - parents and victim (if it exists),
- * and source (if it is not a directory).
+ * and source.
* And that - after we got ->i_mutex on parents (until then we don't know
* whether the target exists). Solution: try to be smart with locking
* order for inodes. We rely on the fact that tree topology may change
@@ -4815,10 +4817,16 @@ int vfs_rename(struct renamedata *rd)
take_dentry_name_snapshot(&old_name, old_dentry);
dget(new_dentry);
- if (!is_dir || (flags & RENAME_EXCHANGE))
- lock_two_nondirectories(source, target);
- else if (target)
- inode_lock(target);
+ /*
+ * Lock all moved children. Moved directories may need to change parent
+ * pointer so they need the lock to prevent against concurrent
+ * directory changes moving parent pointer. For regular files we've
+ * historically always done this. The lockdep locking subclasses are
+ * somewhat arbitrary but RENAME_EXCHANGE in particular can swap
+ * regular files and directories so it's difficult to tell which
+ * subclasses to use.
+ */
+ lock_two_inodes(source, target, I_MUTEX_NORMAL, I_MUTEX_NONDIR2);
error = -EPERM;
if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
@@ -4866,9 +4874,9 @@ int vfs_rename(struct renamedata *rd)
d_exchange(old_dentry, new_dentry);
}
out:
- if (!is_dir || (flags & RENAME_EXCHANGE))
- unlock_two_nondirectories(source, target);
- else if (target)
+ if (source)
+ inode_unlock(source);
+ if (target)
inode_unlock(target);
dput(new_dentry);
if (!error) {
diff --git a/fs/namespace.c b/fs/namespace.c
index 54847db5b819..e157efc54023 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -309,9 +309,16 @@ static unsigned int mnt_get_writers(struct mount *mnt)
static int mnt_is_readonly(struct vfsmount *mnt)
{
- if (mnt->mnt_sb->s_readonly_remount)
+ if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
return 1;
- /* Order wrt setting s_flags/s_readonly_remount in do_remount() */
+ /*
+ * The barrier pairs with the barrier in sb_start_ro_state_change()
+ * making sure if we don't see s_readonly_remount set yet, we also will
+ * not see any superblock / mount flag changes done by remount.
+ * It also pairs with the barrier in sb_end_ro_state_change()
+ * assuring that if we see s_readonly_remount already cleared, we will
+ * see the values of superblock / mount flags updated by remount.
+ */
smp_rmb();
return __mnt_is_readonly(mnt);
}
@@ -364,9 +371,11 @@ int __mnt_want_write(struct vfsmount *m)
}
}
/*
- * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
- * be set to match its requirements. So we must not load that until
- * MNT_WRITE_HOLD is cleared.
+ * The barrier pairs with the barrier sb_start_ro_state_change() making
+ * sure that if we see MNT_WRITE_HOLD cleared, we will also see
+ * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
+ * mnt_is_readonly() and bail in case we are racing with remount
+ * read-only.
*/
smp_rmb();
if (mnt_is_readonly(m)) {
@@ -588,10 +597,8 @@ int sb_prepare_remount_readonly(struct super_block *sb)
if (!err && atomic_long_read(&sb->s_remove_count))
err = -EBUSY;
- if (!err) {
- sb->s_readonly_remount = 1;
- smp_wmb();
- }
+ if (!err)
+ sb_start_ro_state_change(sb);
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
@@ -658,9 +665,25 @@ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
return false;
}
-/*
- * find the first mount at @dentry on vfsmount @mnt.
- * call under rcu_read_lock()
+/**
+ * __lookup_mnt - find first child mount
+ * @mnt: parent mount
+ * @dentry: mountpoint
+ *
+ * If @mnt has a child mount @c mounted @dentry find and return it.
+ *
+ * Note that the child mount @c need not be unique. There are cases
+ * where shadow mounts are created. For example, during mount
+ * propagation when a source mount @mnt whose root got overmounted by a
+ * mount @o after path lookup but before @namespace_sem could be
+ * acquired gets copied and propagated. So @mnt gets copied including
+ * @o. When @mnt is propagated to a destination mount @d that already
+ * has another mount @n mounted at the same mountpoint then the source
+ * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on
+ * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt
+ * on @dentry.
+ *
+ * Return: The first child of @mnt mounted @dentry or NULL.
*/
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
@@ -910,6 +933,33 @@ void mnt_set_mountpoint(struct mount *mnt,
hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}
+/**
+ * mnt_set_mountpoint_beneath - mount a mount beneath another one
+ *
+ * @new_parent: the source mount
+ * @top_mnt: the mount beneath which @new_parent is mounted
+ * @new_mp: the new mountpoint of @top_mnt on @new_parent
+ *
+ * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and
+ * parent @top_mnt->mnt_parent and mount it on top of @new_parent at
+ * @new_mp. And mount @new_parent on the old parent and old
+ * mountpoint of @top_mnt.
+ *
+ * Context: This function expects namespace_lock() and lock_mount_hash()
+ * to have been acquired in that order.
+ */
+static void mnt_set_mountpoint_beneath(struct mount *new_parent,
+ struct mount *top_mnt,
+ struct mountpoint *new_mp)
+{
+ struct mount *old_top_parent = top_mnt->mnt_parent;
+ struct mountpoint *old_top_mp = top_mnt->mnt_mp;
+
+ mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent);
+ mnt_change_mountpoint(new_parent, new_mp, top_mnt);
+}
+
+
static void __attach_mnt(struct mount *mnt, struct mount *parent)
{
hlist_add_head_rcu(&mnt->mnt_hash,
@@ -917,15 +967,42 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent)
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}
-/*
- * vfsmount lock must be held for write
+/**
+ * attach_mnt - mount a mount, attach to @mount_hashtable and parent's
+ * list of child mounts
+ * @parent: the parent
+ * @mnt: the new mount
+ * @mp: the new mountpoint
+ * @beneath: whether to mount @mnt beneath or on top of @parent
+ *
+ * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt
+ * to @parent's child mount list and to @mount_hashtable.
+ *
+ * If @beneath is true, remove @mnt from its current parent and
+ * mountpoint and mount it on @mp on @parent, and mount @parent on the
+ * old parent and old mountpoint of @mnt. Finally, attach @parent to
+ * @mnt_hashtable and @parent->mnt_parent->mnt_mounts.
+ *
+ * Note, when __attach_mnt() is called @mnt->mnt_parent already points
+ * to the correct parent.
+ *
+ * Context: This function expects namespace_lock() and lock_mount_hash()
+ * to have been acquired in that order.
*/
-static void attach_mnt(struct mount *mnt,
- struct mount *parent,
- struct mountpoint *mp)
+static void attach_mnt(struct mount *mnt, struct mount *parent,
+ struct mountpoint *mp, bool beneath)
{
- mnt_set_mountpoint(parent, mp, mnt);
- __attach_mnt(mnt, parent);
+ if (beneath)
+ mnt_set_mountpoint_beneath(mnt, parent, mp);
+ else
+ mnt_set_mountpoint(parent, mp, mnt);
+ /*
+ * Note, @mnt->mnt_parent has to be used. If @mnt was mounted
+ * beneath @parent then @mnt will need to be attached to
+ * @parent's old parent, not @parent. IOW, @mnt->mnt_parent
+ * isn't the same mount as @parent.
+ */
+ __attach_mnt(mnt, mnt->mnt_parent);
}
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
@@ -937,7 +1014,7 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
hlist_del_init(&mnt->mnt_mp_list);
hlist_del_init_rcu(&mnt->mnt_hash);
- attach_mnt(mnt, parent, mp);
+ attach_mnt(mnt, parent, mp, false);
put_mountpoint(old_mp);
mnt_add_count(old_parent, -1);
@@ -1767,6 +1844,19 @@ bool may_mount(void)
return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
+/**
+ * path_mounted - check whether path is mounted
+ * @path: path to check
+ *
+ * Determine whether @path refers to the root of a mount.
+ *
+ * Return: true if @path is the root of a mount, false if not.
+ */
+static inline bool path_mounted(const struct path *path)
+{
+ return path->mnt->mnt_root == path->dentry;
+}
+
static void warn_mandlock(void)
{
pr_warn_once("=======================================================\n"
@@ -1782,7 +1872,7 @@ static int can_umount(const struct path *path, int flags)
if (!may_mount())
return -EPERM;
- if (path->dentry != path->mnt->mnt_root)
+ if (!path_mounted(path))
return -EINVAL;
if (!check_mnt(mnt))
return -EINVAL;
@@ -1925,7 +2015,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
goto out;
lock_mount_hash();
list_add_tail(&q->mnt_list, &res->mnt_list);
- attach_mnt(q, parent, p->mnt_mp);
+ attach_mnt(q, parent, p->mnt_mp, false);
unlock_mount_hash();
}
}
@@ -2134,12 +2224,17 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
return 0;
}
-/*
- * @source_mnt : mount tree to be attached
- * @nd : place the mount tree @source_mnt is attached
- * @parent_nd : if non-null, detach the source_mnt from its parent and
- * store the parent mount and mountpoint dentry.
- * (done when source_mnt is moved)
+enum mnt_tree_flags_t {
+ MNT_TREE_MOVE = BIT(0),
+ MNT_TREE_BENEATH = BIT(1),
+};
+
+/**
+ * attach_recursive_mnt - attach a source mount tree
+ * @source_mnt: mount tree to be attached
+ * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath
+ * @dest_mp: the mountpoint @source_mnt will be mounted at
+ * @flags: modify how @source_mnt is supposed to be attached
*
* NOTE: in the table below explains the semantics when a source mount
* of a given type is attached to a destination mount of a given type.
@@ -2196,22 +2291,28 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
* applied to each mount in the tree.
* Must be called without spinlocks held, since this function can sleep
* in allocations.
+ *
+ * Context: The function expects namespace_lock() to be held.
+ * Return: If @source_mnt was successfully attached 0 is returned.
+ * Otherwise a negative error code is returned.
*/
static int attach_recursive_mnt(struct mount *source_mnt,
- struct mount *dest_mnt,
- struct mountpoint *dest_mp,
- bool moving)
+ struct mount *top_mnt,
+ struct mountpoint *dest_mp,
+ enum mnt_tree_flags_t flags)
{
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
HLIST_HEAD(tree_list);
- struct mnt_namespace *ns = dest_mnt->mnt_ns;
+ struct mnt_namespace *ns = top_mnt->mnt_ns;
struct mountpoint *smp;
- struct mount *child, *p;
+ struct mount *child, *dest_mnt, *p;
struct hlist_node *n;
- int err;
+ int err = 0;
+ bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;
- /* Preallocate a mountpoint in case the new mounts need
- * to be tucked under other mounts.
+ /*
+ * Preallocate a mountpoint in case the new mounts need to be
+ * mounted beneath mounts on the same mountpoint.
*/
smp = get_mountpoint(source_mnt->mnt.mnt_root);
if (IS_ERR(smp))
@@ -2224,29 +2325,41 @@ static int attach_recursive_mnt(struct mount *source_mnt,
goto out;
}
+ if (beneath)
+ dest_mnt = top_mnt->mnt_parent;
+ else
+ dest_mnt = top_mnt;
+
if (IS_MNT_SHARED(dest_mnt)) {
err = invent_group_ids(source_mnt, true);
if (err)
goto out;
err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
- lock_mount_hash();
- if (err)
- goto out_cleanup_ids;
+ }
+ lock_mount_hash();
+ if (err)
+ goto out_cleanup_ids;
+
+ if (IS_MNT_SHARED(dest_mnt)) {
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
set_mnt_shared(p);
- } else {
- lock_mount_hash();
}
+
if (moving) {
+ if (beneath)
+ dest_mp = smp;
unhash_mnt(source_mnt);
- attach_mnt(source_mnt, dest_mnt, dest_mp);
+ attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
if (source_mnt->mnt_ns) {
/* move from anon - the caller will destroy */
list_del_init(&source_mnt->mnt_ns->list);
}
- mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
+ if (beneath)
+ mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
+ else
+ mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
commit_tree(source_mnt);
}
@@ -2286,33 +2399,101 @@ static int attach_recursive_mnt(struct mount *source_mnt,
return err;
}
-static struct mountpoint *lock_mount(struct path *path)
+/**
+ * do_lock_mount - lock mount and mountpoint
+ * @path: target path
+ * @beneath: whether the intention is to mount beneath @path
+ *
+ * Follow the mount stack on @path until the top mount @mnt is found. If
+ * the initial @path->{mnt,dentry} is a mountpoint lookup the first
+ * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
+ * until nothing is stacked on top of it anymore.
+ *
+ * Acquire the inode_lock() on the top mount's ->mnt_root to protect
+ * against concurrent removal of the new mountpoint from another mount
+ * namespace.
+ *
+ * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint
+ * @mp on @mnt->mnt_parent must be acquired. This protects against a
+ * concurrent unlink of @mp->mnt_dentry from another mount namespace
+ * where @mnt doesn't have a child mount mounted @mp. A concurrent
+ * removal of @mnt->mnt_root doesn't matter as nothing will be mounted
+ * on top of it for @beneath.
+ *
+ * In addition, @beneath needs to make sure that @mnt hasn't been
+ * unmounted or moved from its current mountpoint in between dropping
+ * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt
+ * being unmounted would be detected later by e.g., calling
+ * check_mnt(mnt) in the function it's called from. For the @beneath
+ * case however, it's useful to detect it directly in do_lock_mount().
+ * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points
+ * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will
+ * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL.
+ *
+ * Return: Either the target mountpoint on the top mount or the top
+ * mount's mountpoint.
+ */
+static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
{
- struct vfsmount *mnt;
- struct dentry *dentry = path->dentry;
-retry:
- inode_lock(dentry->d_inode);
- if (unlikely(cant_mount(dentry))) {
- inode_unlock(dentry->d_inode);
- return ERR_PTR(-ENOENT);
- }
- namespace_lock();
- mnt = lookup_mnt(path);
- if (likely(!mnt)) {
- struct mountpoint *mp = get_mountpoint(dentry);
- if (IS_ERR(mp)) {
+ struct vfsmount *mnt = path->mnt;
+ struct dentry *dentry;
+ struct mountpoint *mp = ERR_PTR(-ENOENT);
+
+ for (;;) {
+ struct mount *m;
+
+ if (beneath) {
+ m = real_mount(mnt);
+ read_seqlock_excl(&mount_lock);
+ dentry = dget(m->mnt_mountpoint);
+ read_sequnlock_excl(&mount_lock);
+ } else {
+ dentry = path->dentry;
+ }
+
+ inode_lock(dentry->d_inode);
+ if (unlikely(cant_mount(dentry))) {
+ inode_unlock(dentry->d_inode);
+ goto out;
+ }
+
+ namespace_lock();
+
+ if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
namespace_unlock();
inode_unlock(dentry->d_inode);
- return mp;
+ goto out;
}
- return mp;
+
+ mnt = lookup_mnt(path);
+ if (likely(!mnt))
+ break;
+
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
+ if (beneath)
+ dput(dentry);
+ path_put(path);
+ path->mnt = mnt;
+ path->dentry = dget(mnt->mnt_root);
}
- namespace_unlock();
- inode_unlock(path->dentry->d_inode);
- path_put(path);
- path->mnt = mnt;
- dentry = path->dentry = dget(mnt->mnt_root);
- goto retry;
+
+ mp = get_mountpoint(dentry);
+ if (IS_ERR(mp)) {
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
+ }
+
+out:
+ if (beneath)
+ dput(dentry);
+
+ return mp;
+}
+
+static inline struct mountpoint *lock_mount(struct path *path)
+{
+ return do_lock_mount(path, false);
}
static void unlock_mount(struct mountpoint *where)
@@ -2336,7 +2517,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
d_is_dir(mnt->mnt.mnt_root))
return -ENOTDIR;
- return attach_recursive_mnt(mnt, p, mp, false);
+ return attach_recursive_mnt(mnt, p, mp, 0);
}
/*
@@ -2367,7 +2548,7 @@ static int do_change_type(struct path *path, int ms_flags)
int type;
int err = 0;
- if (path->dentry != path->mnt->mnt_root)
+ if (!path_mounted(path))
return -EINVAL;
type = flags_to_propagation_type(ms_flags);
@@ -2643,7 +2824,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
if (!check_mnt(mnt))
return -EINVAL;
- if (path->dentry != mnt->mnt.mnt_root)
+ if (!path_mounted(path))
return -EINVAL;
if (!can_change_locked_flags(mnt, mnt_flags))
@@ -2682,7 +2863,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
if (!check_mnt(mnt))
return -EINVAL;
- if (path->dentry != path->mnt->mnt_root)
+ if (!path_mounted(path))
return -EINVAL;
if (!can_change_locked_flags(mnt, mnt_flags))
@@ -2772,9 +2953,9 @@ static int do_set_group(struct path *from_path, struct path *to_path)
err = -EINVAL;
/* To and From paths should be mount roots */
- if (from_path->dentry != from_path->mnt->mnt_root)
+ if (!path_mounted(from_path))
goto out;
- if (to_path->dentry != to_path->mnt->mnt_root)
+ if (!path_mounted(to_path))
goto out;
/* Setting sharing groups is only allowed across same superblock */
@@ -2818,7 +2999,110 @@ out:
return err;
}
-static int do_move_mount(struct path *old_path, struct path *new_path)
+/**
+ * path_overmounted - check if path is overmounted
+ * @path: path to check
+ *
+ * Check if path is overmounted, i.e., if there's a mount on top of
+ * @path->mnt with @path->dentry as mountpoint.
+ *
+ * Context: This function expects namespace_lock() to be held.
+ * Return: If path is overmounted true is returned, false if not.
+ */
+static inline bool path_overmounted(const struct path *path)
+{
+ rcu_read_lock();
+ if (unlikely(__lookup_mnt(path->mnt, path->dentry))) {
+ rcu_read_unlock();
+ return true;
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+/**
+ * can_move_mount_beneath - check that we can mount beneath the top mount
+ * @from: mount to mount beneath
+ * @to: mount under which to mount
+ *
+ * - Make sure that @to->dentry is actually the root of a mount under
+ * which we can mount another mount.
+ * - Make sure that nothing can be mounted beneath the caller's current
+ * root or the rootfs of the namespace.
+ * - Make sure that the caller can unmount the topmost mount ensuring
+ * that the caller could reveal the underlying mountpoint.
+ * - Ensure that nothing has been mounted on top of @from before we
+ * grabbed @namespace_sem to avoid creating pointless shadow mounts.
+ * - Prevent mounting beneath a mount if the propagation relationship
+ * between the source mount, parent mount, and top mount would lead to
+ * nonsensical mount trees.
+ *
+ * Context: This function expects namespace_lock() to be held.
+ * Return: On success 0, and on error a negative error code is returned.
+ */
+static int can_move_mount_beneath(const struct path *from,
+ const struct path *to,
+ const struct mountpoint *mp)
+{
+ struct mount *mnt_from = real_mount(from->mnt),
+ *mnt_to = real_mount(to->mnt),
+ *parent_mnt_to = mnt_to->mnt_parent;
+
+ if (!mnt_has_parent(mnt_to))
+ return -EINVAL;
+
+ if (!path_mounted(to))
+ return -EINVAL;
+
+ if (IS_MNT_LOCKED(mnt_to))
+ return -EINVAL;
+
+ /* Avoid creating shadow mounts during mount propagation. */
+ if (path_overmounted(from))
+ return -EINVAL;
+
+ /*
+ * Mounting beneath the rootfs only makes sense when the
+ * semantics of pivot_root(".", ".") are used.
+ */
+ if (&mnt_to->mnt == current->fs->root.mnt)
+ return -EINVAL;
+ if (parent_mnt_to == current->nsproxy->mnt_ns->root)
+ return -EINVAL;
+
+ for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent)
+ if (p == mnt_to)
+ return -EINVAL;
+
+ /*
+ * If the parent mount propagates to the child mount this would
+ * mean mounting @mnt_from on @mnt_to->mnt_parent and then
+ * propagating a copy @c of @mnt_from on top of @mnt_to. This
+ * defeats the whole purpose of mounting beneath another mount.
+ */
+ if (propagation_would_overmount(parent_mnt_to, mnt_to, mp))
+ return -EINVAL;
+
+ /*
+ * If @mnt_to->mnt_parent propagates to @mnt_from this would
+ * mean propagating a copy @c of @mnt_from on top of @mnt_from.
+ * Afterwards @mnt_from would be mounted on top of
+ * @mnt_to->mnt_parent and @mnt_to would be unmounted from
+ * @mnt->mnt_parent and remounted on @mnt_from. But since @c is
+ * already mounted on @mnt_from, @mnt_to would ultimately be
+ * remounted on top of @c. Afterwards, @mnt_from would be
+ * covered by a copy @c of @mnt_from and @c would be covered by
+ * @mnt_from itself. This defeats the whole purpose of mounting
+ * @mnt_from beneath @mnt_to.
+ */
+ if (propagation_would_overmount(parent_mnt_to, mnt_from, mp))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int do_move_mount(struct path *old_path, struct path *new_path,
+ bool beneath)
{
struct mnt_namespace *ns;
struct mount *p;
@@ -2827,8 +3111,9 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
struct mountpoint *mp, *old_mp;
int err;
bool attached;
+ enum mnt_tree_flags_t flags = 0;
- mp = lock_mount(new_path);
+ mp = do_lock_mount(new_path, beneath);
if (IS_ERR(mp))
return PTR_ERR(mp);
@@ -2836,6 +3121,8 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
p = real_mount(new_path->mnt);
parent = old->mnt_parent;
attached = mnt_has_parent(old);
+ if (attached)
+ flags |= MNT_TREE_MOVE;
old_mp = old->mnt_mp;
ns = old->mnt_ns;
@@ -2855,7 +3142,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
if (old->mnt.mnt_flags & MNT_LOCKED)
goto out;
- if (old_path->dentry != old_path->mnt->mnt_root)
+ if (!path_mounted(old_path))
goto out;
if (d_is_dir(new_path->dentry) !=
@@ -2866,6 +3153,17 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
*/
if (attached && IS_MNT_SHARED(parent))
goto out;
+
+ if (beneath) {
+ err = can_move_mount_beneath(old_path, new_path, mp);
+ if (err)
+ goto out;
+
+ err = -EINVAL;
+ p = p->mnt_parent;
+ flags |= MNT_TREE_BENEATH;
+ }
+
/*
* Don't move a mount tree containing unbindable mounts to a destination
* mount which is shared.
@@ -2879,8 +3177,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
if (p == old)
goto out;
- err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
- attached);
+ err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags);
if (err)
goto out;
@@ -2912,7 +3209,7 @@ static int do_move_mount_old(struct path *path, const char *old_name)
if (err)
return err;
- err = do_move_mount(&old_path, path);
+ err = do_move_mount(&old_path, path, false);
path_put(&old_path);
return err;
}
@@ -2937,8 +3234,7 @@ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
}
/* Refuse the same filesystem on the same mount point */
- if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
- path->mnt->mnt_root == path->dentry)
+ if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path))
return -EBUSY;
if (d_is_symlink(newmnt->mnt.mnt_root))
@@ -3079,13 +3375,10 @@ int finish_automount(struct vfsmount *m, const struct path *path)
err = -ENOENT;
goto discard_locked;
}
- rcu_read_lock();
- if (unlikely(__lookup_mnt(path->mnt, dentry))) {
- rcu_read_unlock();
+ if (path_overmounted(path)) {
err = 0;
goto discard_locked;
}
- rcu_read_unlock();
mp = get_mountpoint(dentry);
if (IS_ERR(mp)) {
err = PTR_ERR(mp);
@@ -3777,6 +4070,10 @@ SYSCALL_DEFINE5(move_mount,
if (flags & ~MOVE_MOUNT__MASK)
return -EINVAL;
+ if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) ==
+ (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP))
+ return -EINVAL;
+
/* If someone gives a pathname, they aren't permitted to move
* from an fd that requires unmount as we can't get at the flag
* to clear it afterwards.
@@ -3806,7 +4103,8 @@ SYSCALL_DEFINE5(move_mount,
if (flags & MOVE_MOUNT_SET_GROUP)
ret = do_set_group(&from_path, &to_path);
else
- ret = do_move_mount(&from_path, &to_path);
+ ret = do_move_mount(&from_path, &to_path,
+ (flags & MOVE_MOUNT_BENEATH));
out_to:
path_put(&to_path);
@@ -3917,11 +4215,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
if (new_mnt == root_mnt || old_mnt == root_mnt)
goto out4; /* loop, on the same file system */
error = -EINVAL;
- if (root.mnt->mnt_root != root.dentry)
+ if (!path_mounted(&root))
goto out4; /* not a mountpoint */
if (!mnt_has_parent(root_mnt))
goto out4; /* not attached */
- if (new.mnt->mnt_root != new.dentry)
+ if (!path_mounted(&new))
goto out4; /* not a mountpoint */
if (!mnt_has_parent(new_mnt))
goto out4; /* not attached */
@@ -3939,9 +4237,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
}
/* mount old root on put_old */
- attach_mnt(root_mnt, old_mnt, old_mp);
+ attach_mnt(root_mnt, old_mnt, old_mp, false);
/* mount new_root on / */
- attach_mnt(new_mnt, root_parent, root_mp);
+ attach_mnt(new_mnt, root_parent, root_mp, false);
mnt_add_count(root_parent, -1);
touch_mnt_namespace(current->nsproxy->mnt_ns);
/* A moved mount should not expire automatically */
@@ -4124,7 +4422,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
struct mount *mnt = real_mount(path->mnt);
int err = 0;
- if (path->dentry != mnt->mnt.mnt_root)
+ if (!path_mounted(path))
return -EINVAL;
if (kattr->mnt_userns) {
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index f21259ead64b..4c9b87850ab1 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -80,6 +80,8 @@ enum {
int nfsd_drc_slab_create(void);
void nfsd_drc_slab_free(void);
+int nfsd_net_reply_cache_init(struct nfsd_net *nn);
+void nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
int nfsd_reply_cache_init(struct nfsd_net *);
void nfsd_reply_cache_shutdown(struct nfsd_net *);
int nfsd_cache_lookup(struct svc_rqst *);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index ae85257b4238..11a0eaa2f914 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -97,7 +97,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
goto out;
err = -EINVAL;
- if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+ if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
goto out;
err = -ENOENT;
@@ -107,7 +107,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
dprintk("found domain %s\n", buf);
err = -EINVAL;
- if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+ if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
goto out;
fsidtype = simple_strtoul(buf, &ep, 10);
if (*ep)
@@ -593,7 +593,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
{
/* client path expiry [flags anonuid anongid fsid] */
char *buf;
- int len;
int err;
struct auth_domain *dom = NULL;
struct svc_export exp = {}, *expp;
@@ -609,8 +608,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
/* client */
err = -EINVAL;
- len = qword_get(&mesg, buf, PAGE_SIZE);
- if (len <= 0)
+ if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
goto out;
err = -ENOENT;
@@ -620,7 +618,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
/* path */
err = -EINVAL;
- if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+ if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
goto out1;
err = kern_path(buf, 0, &exp.ex_path);
@@ -665,7 +663,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
goto out3;
exp.ex_fsid = an_int;
- while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) {
+ while (qword_get(&mesg, buf, PAGE_SIZE) > 0) {
if (strcmp(buf, "fsloc") == 0)
err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
else if (strcmp(buf, "uuid") == 0)
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index e6bb8eeb5bc2..fc8d5b7db9f8 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -151,8 +151,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
{
struct nfsd3_readargs *argp = rqstp->rq_argp;
struct nfsd3_readres *resp = rqstp->rq_resp;
- unsigned int len;
- int v;
dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
SVCFH_fmt(&argp->fh),
@@ -166,17 +164,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
if (argp->offset + argp->count > (u64)OFFSET_MAX)
argp->count = (u64)OFFSET_MAX - argp->offset;
- v = 0;
- len = argp->count;
resp->pages = rqstp->rq_next_page;
- while (len > 0) {
- struct page *page = *(rqstp->rq_next_page++);
-
- rqstp->rq_vec[v].iov_base = page_address(page);
- rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
- len -= rqstp->rq_vec[v].iov_len;
- v++;
- }
/* Obtain buffer pointer for payload.
* 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
@@ -187,7 +175,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
- rqstp->rq_vec, v, &resp->count, &resp->eof);
+ &resp->count, &resp->eof);
return rpc_success;
}
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 3308dd671ef0..f32128955ec8 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -828,7 +828,8 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (xdr_stream_encode_u32(xdr, resp->len) < 0)
return false;
- xdr_write_pages(xdr, resp->pages, 0, resp->len);
+ svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, 0,
+ resp->len);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
return false;
break;
@@ -859,8 +860,9 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (xdr_stream_encode_u32(xdr, resp->count) < 0)
return false;
- xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
- resp->count);
+ svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages,
+ rqstp->rq_res.page_base,
+ resp->count);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
return false;
break;
@@ -961,7 +963,8 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (!svcxdr_encode_cookieverf3(xdr, resp->verf))
return false;
- xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
+ svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0,
+ dirlist->len);
/* no more entries */
if (xdr_stream_encode_item_absent(xdr) < 0)
return false;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 76db2fe29624..26b1343c8035 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2541,6 +2541,20 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
return p;
}
+static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr,
+ struct timespec64 *tv)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT * 3);
+ if (!p)
+ return nfserr_resource;
+
+ p = xdr_encode_hyper(p, (s64)tv->tv_sec);
+ *p = cpu_to_be32(tv->tv_nsec);
+ return nfs_ok;
+}
+
/*
* ctime (in NFSv4, time_metadata) is not writeable, and the client
* doesn't really care what resolution could theoretically be stored by
@@ -2566,12 +2580,16 @@ static __be32 *encode_time_delta(__be32 *p, struct inode *inode)
return p;
}
-static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c)
+static __be32
+nfsd4_encode_change_info4(struct xdr_stream *xdr, struct nfsd4_change_info *c)
{
- *p++ = cpu_to_be32(c->atomic);
- p = xdr_encode_hyper(p, c->before_change);
- p = xdr_encode_hyper(p, c->after_change);
- return p;
+ if (xdr_stream_encode_bool(xdr, c->atomic) < 0)
+ return nfserr_resource;
+ if (xdr_stream_encode_u64(xdr, c->before_change) < 0)
+ return nfserr_resource;
+ if (xdr_stream_encode_u64(xdr, c->after_change) < 0)
+ return nfserr_resource;
+ return nfs_ok;
}
/* Encode as an array of strings the string given with components
@@ -3348,11 +3366,9 @@ out_acl:
p = xdr_encode_hyper(p, dummy64);
}
if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec);
- *p++ = cpu_to_be32(stat.atime.tv_nsec);
+ status = nfsd4_encode_nfstime4(xdr, &stat.atime);
+ if (status)
+ goto out;
}
if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
p = xdr_reserve_space(xdr, 12);
@@ -3361,25 +3377,19 @@ out_acl:
p = encode_time_delta(p, d_inode(dentry));
}
if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec);
- *p++ = cpu_to_be32(stat.ctime.tv_nsec);
+ status = nfsd4_encode_nfstime4(xdr, &stat.ctime);
+ if (status)
+ goto out;
}
if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec);
- *p++ = cpu_to_be32(stat.mtime.tv_nsec);
+ status = nfsd4_encode_nfstime4(xdr, &stat.mtime);
+ if (status)
+ goto out;
}
if (bmval1 & FATTR4_WORD1_TIME_CREATE) {
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (s64)stat.btime.tv_sec);
- *p++ = cpu_to_be32(stat.btime.tv_nsec);
+ status = nfsd4_encode_nfstime4(xdr, &stat.btime);
+ if (status)
+ goto out;
}
if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
u64 ino = stat.ino;
@@ -3689,6 +3699,30 @@ fail:
}
static __be32
+nfsd4_encode_verifier4(struct xdr_stream *xdr, const nfs4_verifier *verf)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
+ if (!p)
+ return nfserr_resource;
+ memcpy(p, verf->data, sizeof(verf->data));
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_clientid4(struct xdr_stream *xdr, const clientid_t *clientid)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, sizeof(__be64));
+ if (!p)
+ return nfserr_resource;
+ memcpy(p, clientid, sizeof(*clientid));
+ return nfs_ok;
+}
+
+static __be32
nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
{
__be32 *p;
@@ -3752,15 +3786,8 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_commit *commit = &u->commit;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_opaque_fixed(p, commit->co_verf.data,
- NFS4_VERIFIER_SIZE);
- return 0;
+ return nfsd4_encode_verifier4(resp->xdr, &commit->co_verf);
}
static __be32
@@ -3769,12 +3796,10 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_create *create = &u->create;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 20);
- if (!p)
- return nfserr_resource;
- encode_cinfo(p, &create->cr_cinfo);
+ nfserr = nfsd4_encode_change_info4(xdr, &create->cr_cinfo);
+ if (nfserr)
+ return nfserr;
return nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
create->cr_bmval[1], create->cr_bmval[2]);
}
@@ -3892,13 +3917,8 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_link *link = &u->link;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 20);
- if (!p)
- return nfserr_resource;
- p = encode_cinfo(p, &link->li_cinfo);
- return 0;
+ return nfsd4_encode_change_info4(xdr, &link->li_cinfo);
}
@@ -3913,11 +3933,11 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
if (nfserr)
return nfserr;
- p = xdr_reserve_space(xdr, 24);
- if (!p)
+ nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo);
+ if (nfserr)
+ return nfserr;
+ if (xdr_stream_encode_u32(xdr, open->op_rflags) < 0)
return nfserr_resource;
- p = encode_cinfo(p, &open->op_cinfo);
- *p++ = cpu_to_be32(open->op_rflags);
nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1],
open->op_bmval[2]);
@@ -3956,7 +3976,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
p = xdr_reserve_space(xdr, 32);
if (!p)
return nfserr_resource;
- *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(open->op_recall);
/*
* TODO: space_limit's in delegations
@@ -4018,6 +4038,11 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfsd4_encode_stateid(xdr, &od->od_stateid);
}
+/*
+ * The operation of this function assumes that this is the only
+ * READ operation in the COMPOUND. If there are multiple READs,
+ * we use nfsd4_encode_readv().
+ */
static __be32 nfsd4_encode_splice_read(
struct nfsd4_compoundres *resp,
struct nfsd4_read *read,
@@ -4028,8 +4053,12 @@ static __be32 nfsd4_encode_splice_read(
int status, space_left;
__be32 nfserr;
- /* Make sure there will be room for padding if needed */
- if (xdr->end - xdr->p < 1)
+ /*
+ * Make sure there is room at the end of buf->head for
+ * svcxdr_encode_opaque_pages() to create a tail buffer
+ * to XDR-pad the payload.
+ */
+ if (xdr->iov != xdr->buf->head || xdr->end - xdr->p < 1)
return nfserr_resource;
nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp,
@@ -4038,6 +4067,8 @@ static __be32 nfsd4_encode_splice_read(
read->rd_length = maxcount;
if (nfserr)
goto out_err;
+ svcxdr_encode_opaque_pages(read->rd_rqstp, xdr, buf->pages,
+ buf->page_base, maxcount);
status = svc_encode_result_payload(read->rd_rqstp,
buf->head[0].iov_len, maxcount);
if (status) {
@@ -4045,31 +4076,19 @@ static __be32 nfsd4_encode_splice_read(
goto out_err;
}
- buf->page_len = maxcount;
- buf->len += maxcount;
- xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
- / PAGE_SIZE;
-
- /* Use rest of head for padding and remaining ops: */
- buf->tail[0].iov_base = xdr->p;
- buf->tail[0].iov_len = 0;
- xdr->iov = buf->tail;
- if (maxcount&3) {
- int pad = 4 - (maxcount&3);
-
- *(xdr->p++) = 0;
-
- buf->tail[0].iov_base += maxcount&3;
- buf->tail[0].iov_len = pad;
- buf->len += pad;
- }
-
+ /*
+ * Prepare to encode subsequent operations.
+ *
+ * xdr_truncate_encode() is not safe to use after a successful
+ * splice read has been done, so the following stream
+ * manipulations are open-coded.
+ */
space_left = min_t(int, (void *)xdr->end - (void *)xdr->p,
buf->buflen - buf->len);
buf->buflen = buf->len + space_left;
xdr->end = (__be32 *)((void *)xdr->end + space_left);
- return 0;
+ return nfs_ok;
out_err:
/*
@@ -4090,13 +4109,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
__be32 zero = xdr_zero;
__be32 nfserr;
- read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount);
- if (read->rd_vlen < 0)
+ if (xdr_reserve_space_vec(xdr, maxcount) < 0)
return nfserr_resource;
- nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
- resp->rqstp->rq_vec, read->rd_vlen, &maxcount,
- &read->rd_eof);
+ nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file,
+ read->rd_offset, &maxcount,
+ xdr->buf->page_len & ~PAGE_MASK,
+ &read->rd_eof);
read->rd_length = maxcount;
if (nfserr)
return nfserr;
@@ -4213,15 +4232,9 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
int starting_len = xdr->buf->len;
__be32 *p;
- p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
- if (!p)
- return nfserr_resource;
-
- /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
- xdr->buf->head[0].iov_len = (char *)xdr->p -
- (char *)xdr->buf->head[0].iov_base;
+ nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf);
+ if (nfserr != nfs_ok)
+ return nfserr;
/*
* Number of bytes left for directory entries allowing for the
@@ -4299,13 +4312,8 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_remove *remove = &u->remove;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 20);
- if (!p)
- return nfserr_resource;
- p = encode_cinfo(p, &remove->rm_cinfo);
- return 0;
+ return nfsd4_encode_change_info4(xdr, &remove->rm_cinfo);
}
static __be32
@@ -4314,14 +4322,11 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_rename *rename = &u->rename;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 40);
- if (!p)
- return nfserr_resource;
- p = encode_cinfo(p, &rename->rn_sinfo);
- p = encode_cinfo(p, &rename->rn_tinfo);
- return 0;
+ nfserr = nfsd4_encode_change_info4(xdr, &rename->rn_sinfo);
+ if (nfserr)
+ return nfserr;
+ return nfsd4_encode_change_info4(xdr, &rename->rn_tinfo);
}
static __be32
@@ -4448,23 +4453,25 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_setclientid *scd = &u->setclientid;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
if (!nfserr) {
- p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8);
- p = xdr_encode_opaque_fixed(p, &scd->se_confirm,
- NFS4_VERIFIER_SIZE);
- }
- else if (nfserr == nfserr_clid_inuse) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
+ nfserr = nfsd4_encode_clientid4(xdr, &scd->se_clientid);
+ if (nfserr != nfs_ok)
+ goto out;
+ nfserr = nfsd4_encode_verifier4(xdr, &scd->se_confirm);
+ } else if (nfserr == nfserr_clid_inuse) {
+ /* empty network id */
+ if (xdr_stream_encode_u32(xdr, 0) < 0) {
+ nfserr = nfserr_resource;
+ goto out;
+ }
+ /* empty universal address */
+ if (xdr_stream_encode_u32(xdr, 0) < 0) {
+ nfserr = nfserr_resource;
+ goto out;
+ }
}
+out:
return nfserr;
}
@@ -4473,17 +4480,12 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_write *write = &u->write;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 16);
- if (!p)
+ if (xdr_stream_encode_u32(resp->xdr, write->wr_bytes_written) < 0)
return nfserr_resource;
- *p++ = cpu_to_be32(write->wr_bytes_written);
- *p++ = cpu_to_be32(write->wr_how_written);
- p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
- NFS4_VERIFIER_SIZE);
- return 0;
+ if (xdr_stream_encode_u32(resp->xdr, write->wr_how_written) < 0)
+ return nfserr_resource;
+ return nfsd4_encode_verifier4(resp->xdr, &write->wr_verifier);
}
static __be32
@@ -4505,20 +4507,15 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
server_scope = nn->nfsd_name;
server_scope_sz = strlen(nn->nfsd_name);
- p = xdr_reserve_space(xdr,
- 8 /* eir_clientid */ +
- 4 /* eir_sequenceid */ +
- 4 /* eir_flags */ +
- 4 /* spr_how */);
- if (!p)
+ if (nfsd4_encode_clientid4(xdr, &exid->clientid) != nfs_ok)
+ return nfserr_resource;
+ if (xdr_stream_encode_u32(xdr, exid->seqid) < 0)
+ return nfserr_resource;
+ if (xdr_stream_encode_u32(xdr, exid->flags) < 0)
return nfserr_resource;
- p = xdr_encode_opaque_fixed(p, &exid->clientid, 8);
- *p++ = cpu_to_be32(exid->seqid);
- *p++ = cpu_to_be32(exid->flags);
-
- *p++ = cpu_to_be32(exid->spa_how);
-
+ if (xdr_stream_encode_u32(xdr, exid->spa_how) < 0)
+ return nfserr_resource;
switch (exid->spa_how) {
case SP4_NONE:
break;
@@ -5099,15 +5096,8 @@ nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_setxattr *setxattr = &u->setxattr;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 20);
- if (!p)
- return nfserr_resource;
-
- encode_cinfo(p, &setxattr->setxa_cinfo);
-
- return 0;
+ return nfsd4_encode_change_info4(xdr, &setxattr->setxa_cinfo);
}
/*
@@ -5253,14 +5243,8 @@ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_removexattr *removexattr = &u->removexattr;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 20);
- if (!p)
- return nfserr_resource;
-
- p = encode_cinfo(p, &removexattr->rmxa_cinfo);
- return 0;
+ return nfsd4_encode_change_info4(xdr, &removexattr->rmxa_cinfo);
}
typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u);
@@ -5460,6 +5444,12 @@ status:
release:
if (opdesc && opdesc->op_release)
opdesc->op_release(&op->u);
+
+ /*
+ * Account for pages consumed while encoding this operation.
+ * The xdr_stream primitives don't manage rq_next_page.
+ */
+ rqstp->rq_next_page = xdr->page_ptr + 1;
}
/*
@@ -5528,9 +5518,6 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
p = resp->statusp;
*p++ = resp->cstate.status;
-
- rqstp->rq_next_page = xdr->page_ptr + 1;
-
*p++ = htonl(resp->taglen);
memcpy(p, resp->tag, resp->taglen);
p += XDR_QUADLEN(resp->taglen);
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 041faa13b852..a8eda1c85829 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -148,12 +148,23 @@ void nfsd_drc_slab_free(void)
kmem_cache_destroy(drc_slab);
}
-static int nfsd_reply_cache_stats_init(struct nfsd_net *nn)
+/**
+ * nfsd_net_reply_cache_init - per net namespace reply cache set-up
+ * @nn: nfsd_net being initialized
+ *
+ * Returns zero on succes; otherwise a negative errno is returned.
+ */
+int nfsd_net_reply_cache_init(struct nfsd_net *nn)
{
return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM);
}
-static void nfsd_reply_cache_stats_destroy(struct nfsd_net *nn)
+/**
+ * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down
+ * @nn: nfsd_net being freed
+ *
+ */
+void nfsd_net_reply_cache_destroy(struct nfsd_net *nn)
{
nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM);
}
@@ -169,17 +180,13 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
hashsize = nfsd_hashsize(nn->max_drc_entries);
nn->maskbits = ilog2(hashsize);
- status = nfsd_reply_cache_stats_init(nn);
- if (status)
- goto out_nomem;
-
nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
nn->nfsd_reply_cache_shrinker.seeks = 1;
status = register_shrinker(&nn->nfsd_reply_cache_shrinker,
"nfsd-reply:%s", nn->nfsd_name);
if (status)
- goto out_stats_destroy;
+ return status;
nn->drc_hashtbl = kvzalloc(array_size(hashsize,
sizeof(*nn->drc_hashtbl)), GFP_KERNEL);
@@ -195,9 +202,6 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
return 0;
out_shrinker:
unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
-out_stats_destroy:
- nfsd_reply_cache_stats_destroy(nn);
-out_nomem:
printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
return -ENOMEM;
}
@@ -217,7 +221,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
rp, nn);
}
}
- nfsd_reply_cache_stats_destroy(nn);
kvfree(nn->drc_hashtbl);
nn->drc_hashtbl = NULL;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b4fd7a7062d5..1b8b1aab9a15 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -25,6 +25,7 @@
#include "netns.h"
#include "pnfs.h"
#include "filecache.h"
+#include "trace.h"
/*
* We have a single directory with several nodes in it.
@@ -109,12 +110,12 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
if (IS_ERR(data))
return PTR_ERR(data);
- rv = write_op[ino](file, data, size);
- if (rv >= 0) {
- simple_transaction_set(file, rv);
- rv = size;
- }
- return rv;
+ rv = write_op[ino](file, data, size);
+ if (rv < 0)
+ return rv;
+
+ simple_transaction_set(file, rv);
+ return size;
}
static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
@@ -230,6 +231,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
if (rpc_pton(net, fo_path, size, sap, salen) == 0)
return -EINVAL;
+ trace_nfsd_ctl_unlock_ip(net, buf);
return nlmsvc_unlock_all_by_ip(sap);
}
@@ -263,7 +265,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
fo_path = buf;
if (qword_get(&buf, fo_path, size) < 0)
return -EINVAL;
-
+ trace_nfsd_ctl_unlock_fs(netns(file), fo_path);
error = kern_path(fo_path, 0, &path);
if (error)
return error;
@@ -324,7 +326,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
len = qword_get(&mesg, dname, size);
if (len <= 0)
return -EINVAL;
-
+
path = dname+len+1;
len = qword_get(&mesg, path, size);
if (len <= 0)
@@ -338,15 +340,17 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
return -EINVAL;
maxsize = min(maxsize, NFS3_FHSIZE);
- if (qword_get(&mesg, mesg, size)>0)
+ if (qword_get(&mesg, mesg, size) > 0)
return -EINVAL;
+ trace_nfsd_ctl_filehandle(netns(file), dname, path, maxsize);
+
/* we have all the words, they are in buf.. */
dom = unix_domain_find(dname);
if (!dom)
return -ENOMEM;
- len = exp_rootfh(netns(file), dom, path, &fh, maxsize);
+ len = exp_rootfh(netns(file), dom, path, &fh, maxsize);
auth_domain_put(dom);
if (len)
return len;
@@ -399,6 +403,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
return rv;
if (newthreads < 0)
return -EINVAL;
+ trace_nfsd_ctl_threads(net, newthreads);
rv = nfsd_svc(newthreads, net, file->f_cred);
if (rv < 0)
return rv;
@@ -418,8 +423,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
* OR
*
* Input:
- * buf: C string containing whitespace-
- * separated unsigned integer values
+ * buf: C string containing whitespace-
+ * separated unsigned integer values
* representing the number of NFSD
* threads to start in each pool
* size: non-zero length of C string in @buf
@@ -471,6 +476,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
rv = -EINVAL;
if (nthreads[i] < 0)
goto out_free;
+ trace_nfsd_ctl_pool_threads(net, i, nthreads[i]);
}
rv = nfsd_set_nrthreads(i, nthreads, net);
if (rv)
@@ -526,7 +532,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
char *sep;
struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
- if (size>0) {
+ if (size > 0) {
if (nn->nfsd_serv)
/* Cannot change versions without updating
* nn->nfsd_serv->sv_xdrsize, and reallocing
@@ -536,6 +542,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
if (buf[size-1] != '\n')
return -EINVAL;
buf[size-1] = 0;
+ trace_nfsd_ctl_version(netns(file), buf);
vers = mesg;
len = qword_get(&mesg, vers, size);
@@ -637,11 +644,11 @@ out:
* OR
*
* Input:
- * buf: C string containing whitespace-
- * separated positive or negative
- * integer values representing NFS
- * protocol versions to enable ("+n")
- * or disable ("-n")
+ * buf: C string containing whitespace-
+ * separated positive or negative
+ * integer values representing NFS
+ * protocol versions to enable ("+n")
+ * or disable ("-n")
* size: non-zero length of C string in @buf
* Output:
* On success: status of zero or more protocol versions has
@@ -689,6 +696,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
err = get_int(&mesg, &fd);
if (err != 0 || fd < 0)
return -EINVAL;
+ trace_nfsd_ctl_ports_addfd(net, fd);
err = nfsd_create_serv(net);
if (err != 0)
@@ -705,7 +713,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
}
/*
- * A transport listener is added by writing it's transport name and
+ * A transport listener is added by writing its transport name and
* a port number.
*/
static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cred *cred)
@@ -720,6 +728,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
if (port < 1 || port > USHRT_MAX)
return -EINVAL;
+ trace_nfsd_ctl_ports_addxprt(net, transport, port);
err = nfsd_create_serv(net);
if (err != 0)
@@ -832,9 +841,9 @@ int nfsd_max_blksize;
* OR
*
* Input:
- * buf: C string containing an unsigned
- * integer value representing the new
- * NFS blksize
+ * buf: C string containing an unsigned
+ * integer value representing the new
+ * NFS blksize
* size: non-zero length of C string in @buf
* Output:
* On success: passed-in buffer filled with '\n'-terminated C string
@@ -853,6 +862,8 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
int rv = get_int(&mesg, &bsize);
if (rv)
return rv;
+ trace_nfsd_ctl_maxblksize(netns(file), bsize);
+
/* force bsize into allowed range and
* required alignment.
*/
@@ -881,9 +892,9 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
* OR
*
* Input:
- * buf: C string containing an unsigned
- * integer value representing the new
- * number of max connections
+ * buf: C string containing an unsigned
+ * integer value representing the new
+ * number of max connections
* size: non-zero length of C string in @buf
* Output:
* On success: passed-in buffer filled with '\n'-terminated C string
@@ -903,6 +914,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
if (rv)
return rv;
+ trace_nfsd_ctl_maxconn(netns(file), maxconn);
nn->max_connections = maxconn;
}
@@ -913,6 +925,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
time64_t *time, struct nfsd_net *nn)
{
+ struct dentry *dentry = file_dentry(file);
char *mesg = buf;
int rv, i;
@@ -922,6 +935,9 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
rv = get_int(&mesg, &i);
if (rv)
return rv;
+ trace_nfsd_ctl_time(netns(file), dentry->d_name.name,
+ dentry->d_name.len, i);
+
/*
* Some sanity checking. We don't have a reason for
* these particular numbers, but problems with the
@@ -1014,6 +1030,7 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
len = qword_get(&mesg, recdir, size);
if (len <= 0)
return -EINVAL;
+ trace_nfsd_ctl_recoverydir(netns(file), recdir);
status = nfs4_reset_recoverydir(recdir);
if (status)
@@ -1065,7 +1082,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
* OR
*
* Input:
- * buf: any value
+ * buf: any value
* size: non-zero length of C string in @buf
* Output:
* passed-in buffer filled with "Y" or "N" with a newline
@@ -1087,7 +1104,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
case '1':
if (!nn->nfsd_serv)
return -EBUSY;
- nfsd4_end_grace(nn);
+ trace_nfsd_end_grace(netns(file));
break;
default:
return -EINVAL;
@@ -1192,8 +1209,8 @@ static int __nfsd_symlink(struct inode *dir, struct dentry *dentry,
* @content is assumed to be a NUL-terminated string that lives
* longer than the symlink itself.
*/
-static void nfsd_symlink(struct dentry *parent, const char *name,
- const char *content)
+static void _nfsd_symlink(struct dentry *parent, const char *name,
+ const char *content)
{
struct inode *dir = parent->d_inode;
struct dentry *dentry;
@@ -1210,8 +1227,8 @@ out:
inode_unlock(dir);
}
#else
-static inline void nfsd_symlink(struct dentry *parent, const char *name,
- const char *content)
+static inline void _nfsd_symlink(struct dentry *parent, const char *name,
+ const char *content)
{
}
@@ -1389,8 +1406,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
if (ret)
return ret;
- nfsd_symlink(sb->s_root, "supported_krb5_enctypes",
- "/proc/net/rpc/gss_krb5_enctypes");
+ _nfsd_symlink(sb->s_root, "supported_krb5_enctypes",
+ "/proc/net/rpc/gss_krb5_enctypes");
dentry = nfsd_mkdir(sb->s_root, NULL, "clients");
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -1477,7 +1494,17 @@ static int create_proc_exports_entry(void)
unsigned int nfsd_net_id;
-static __net_init int nfsd_init_net(struct net *net)
+/**
+ * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace
+ * @net: a freshly-created network namespace
+ *
+ * This information stays around as long as the network namespace is
+ * alive whether or not there is an NFSD instance running in the
+ * namespace.
+ *
+ * Returns zero on success, or a negative errno otherwise.
+ */
+static __net_init int nfsd_net_init(struct net *net)
{
int retval;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -1488,6 +1515,9 @@ static __net_init int nfsd_init_net(struct net *net)
retval = nfsd_idmap_init(net);
if (retval)
goto out_idmap_error;
+ retval = nfsd_net_reply_cache_init(nn);
+ if (retval)
+ goto out_repcache_error;
nn->nfsd_versions = NULL;
nn->nfsd4_minorversions = NULL;
nfsd4_init_leases_net(nn);
@@ -1496,22 +1526,32 @@ static __net_init int nfsd_init_net(struct net *net)
return 0;
+out_repcache_error:
+ nfsd_idmap_shutdown(net);
out_idmap_error:
nfsd_export_shutdown(net);
out_export_error:
return retval;
}
-static __net_exit void nfsd_exit_net(struct net *net)
+/**
+ * nfsd_net_exit - Release the nfsd_net portion of a net namespace
+ * @net: a network namespace that is about to be destroyed
+ *
+ */
+static __net_exit void nfsd_net_exit(struct net *net)
{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ nfsd_net_reply_cache_destroy(nn);
nfsd_idmap_shutdown(net);
nfsd_export_shutdown(net);
- nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
+ nfsd_netns_free_versions(nn);
}
static struct pernet_operations nfsd_net_ops = {
- .init = nfsd_init_net,
- .exit = nfsd_exit_net,
+ .init = nfsd_net_init,
+ .exit = nfsd_net_exit,
.id = &nfsd_net_id,
.size = sizeof(struct nfsd_net),
};
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index ccd8485fee04..e8e13ae72e3c 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -623,16 +623,9 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
inode = d_inode(fhp->fh_dentry);
err = fh_getattr(fhp, &stat);
- if (err) {
- /* Grab the times from inode anyway */
- stat.mtime = inode->i_mtime;
- stat.ctime = inode->i_ctime;
- stat.size = inode->i_size;
- if (v4 && IS_I_VERSION(inode)) {
- stat.change_cookie = inode_query_iversion(inode);
- stat.result_mask |= STATX_CHANGE_COOKIE;
- }
- }
+ if (err)
+ return;
+
if (v4)
fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
@@ -660,15 +653,10 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
printk("nfsd: inode locked twice during operation.\n");
err = fh_getattr(fhp, &fhp->fh_post_attr);
- if (err) {
- fhp->fh_post_saved = false;
- fhp->fh_post_attr.ctime = inode->i_ctime;
- if (v4 && IS_I_VERSION(inode)) {
- fhp->fh_post_attr.change_cookie = inode_query_iversion(inode);
- fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE;
- }
- } else
- fhp->fh_post_saved = true;
+ if (err)
+ return;
+
+ fhp->fh_post_saved = true;
if (v4)
fhp->fh_post_change =
nfsd4_change_attribute(&fhp->fh_post_attr, inode);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index c37195572fd0..a7315928a760 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -176,9 +176,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
{
struct nfsd_readargs *argp = rqstp->rq_argp;
struct nfsd_readres *resp = rqstp->rq_resp;
- unsigned int len;
u32 eof;
- int v;
dprintk("nfsd: READ %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
@@ -187,17 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2);
argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
- v = 0;
- len = argp->count;
resp->pages = rqstp->rq_next_page;
- while (len > 0) {
- struct page *page = *(rqstp->rq_next_page++);
-
- rqstp->rq_vec[v].iov_base = page_address(page);
- rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
- len -= rqstp->rq_vec[v].iov_len;
- v++;
- }
/* Obtain buffer pointer for payload. 19 is 1 word for
* status, 17 words for fattr, and 1 word for the byte count.
@@ -207,7 +195,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
resp->count = argp->count;
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
- rqstp->rq_vec, v, &resp->count, &eof);
+ &resp->count, &eof);
if (resp->status == nfs_ok)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 9c7b1ef5be40..2154fa63c5f2 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -402,6 +402,11 @@ void nfsd_reset_write_verifier(struct nfsd_net *nn)
write_sequnlock(&nn->writeverf_lock);
}
+/*
+ * Crank up a set of per-namespace resources for a new NFSD instance,
+ * including lockd, a duplicate reply cache, an open file cache
+ * instance, and a cache of NFSv4 state objects.
+ */
static int nfsd_startup_net(struct net *net, const struct cred *cred)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index caf6355b18fa..5777f40c7353 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -468,7 +468,8 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
case nfs_ok:
if (xdr_stream_encode_u32(xdr, resp->len) < 0)
return false;
- xdr_write_pages(xdr, &resp->page, 0, resp->len);
+ svcxdr_encode_opaque_pages(rqstp, xdr, &resp->page, 0,
+ resp->len);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
return false;
break;
@@ -491,8 +492,9 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (xdr_stream_encode_u32(xdr, resp->count) < 0)
return false;
- xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
- resp->count);
+ svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages,
+ rqstp->rq_res.page_base,
+ resp->count);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
return false;
break;
@@ -511,7 +513,8 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
switch (resp->status) {
case nfs_ok:
- xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
+ svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0,
+ dirlist->len);
/* no more entries */
if (xdr_stream_encode_item_absent(xdr) < 0)
return false;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 72a906a053dc..2af74983f146 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1581,6 +1581,265 @@ TRACE_EVENT(nfsd_cb_recall_any_done,
)
);
+TRACE_EVENT(nfsd_ctl_unlock_ip,
+ TP_PROTO(
+ const struct net *net,
+ const char *address
+ ),
+ TP_ARGS(net, address),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __string(address, address)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __assign_str(address, address);
+ ),
+ TP_printk("address=%s",
+ __get_str(address)
+ )
+);
+
+TRACE_EVENT(nfsd_ctl_unlock_fs,
+ TP_PROTO(
+ const struct net *net,
+ const char *path
+ ),
+ TP_ARGS(net, path),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __string(path, path)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __assign_str(path, path);
+ ),
+ TP_printk("path=%s",
+ __get_str(path)
+ )
+);
+
+TRACE_EVENT(nfsd_ctl_filehandle,
+ TP_PROTO(
+ const struct net *net,
+ const char *domain,
+ const char *path,
+ int maxsize
+ ),
+ TP_ARGS(net, domain, path, maxsize),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __field(int, maxsize)
+ __string(domain, domain)
+ __string(path, path)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __entry->maxsize = maxsize;
+ __assign_str(domain, domain);
+ __assign_str(path, path);
+ ),
+ TP_printk("domain=%s path=%s maxsize=%d",
+ __get_str(domain), __get_str(path), __entry->maxsize
+ )
+);
+
+TRACE_EVENT(nfsd_ctl_threads,
+ TP_PROTO(
+ const struct net *net,
+ int newthreads
+ ),
+ TP_ARGS(net, newthreads),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __field(int, newthreads)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __entry->newthreads = newthreads;
+ ),
+ TP_printk("newthreads=%d",
+ __entry->newthreads
+ )
+);
+
+TRACE_EVENT(nfsd_ctl_pool_threads,
+ TP_PROTO(
+ const struct net *net,
+ int pool,
+ int nrthreads
+ ),
+ TP_ARGS(net, pool, nrthreads),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __field(int, pool)
+ __field(int, nrthreads)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __entry->pool = pool;
+ __entry->nrthreads = nrthreads;
+ ),
+ TP_printk("pool=%d nrthreads=%d",
+ __entry->pool, __entry->nrthreads
+ )
+);
+
+TRACE_EVENT(nfsd_ctl_version,
+ TP_PROTO(
+ const struct net *net,
+ const char *mesg
+ ),
+ TP_ARGS(net, mesg),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __string(mesg, mesg)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __assign_str(mesg, mesg);
+ ),
+ TP_printk("%s",
+ __get_str(mesg)
+ )
+);
+
+TRACE_EVENT(nfsd_ctl_ports_addfd,
+ TP_PROTO(
+ const struct net *net,
+ int fd
+ ),
+ TP_ARGS(net, fd),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __field(int, fd)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __entry->fd = fd;
+ ),
+ TP_printk("fd=%d",
+ __entry->fd
+ )
+);
+
+TRACE_EVENT(nfsd_ctl_ports_addxprt,
+ TP_PROTO(
+ const struct net *net,
+ const char *transport,
+ int port
+ ),
+ TP_ARGS(net, transport, port),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __field(int, port)
+ __string(transport, transport)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __entry->port = port;
+ __assign_str(transport, transport);
+ ),
+ TP_printk("transport=%s port=%d",
+ __get_str(transport), __entry->port
+ )
+);
+
+TRACE_EVENT(nfsd_ctl_maxblksize,
+ TP_PROTO(
+ const struct net *net,
+ int bsize
+ ),
+ TP_ARGS(net, bsize),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __field(int, bsize)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __entry->bsize = bsize;
+ ),
+ TP_printk("bsize=%d",
+ __entry->bsize
+ )
+);
+
+TRACE_EVENT(nfsd_ctl_maxconn,
+ TP_PROTO(
+ const struct net *net,
+ int maxconn
+ ),
+ TP_ARGS(net, maxconn),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __field(int, maxconn)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __entry->maxconn = maxconn;
+ ),
+ TP_printk("maxconn=%d",
+ __entry->maxconn
+ )
+);
+
+TRACE_EVENT(nfsd_ctl_time,
+ TP_PROTO(
+ const struct net *net,
+ const char *name,
+ size_t namelen,
+ int time
+ ),
+ TP_ARGS(net, name, namelen, time),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __field(int, time)
+ __string_len(name, name, namelen)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __entry->time = time;
+ __assign_str_len(name, name, namelen);
+ ),
+ TP_printk("file=%s time=%d\n",
+ __get_str(name), __entry->time
+ )
+);
+
+TRACE_EVENT(nfsd_ctl_recoverydir,
+ TP_PROTO(
+ const struct net *net,
+ const char *recdir
+ ),
+ TP_ARGS(net, recdir),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __string(recdir, recdir)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ __assign_str(recdir, recdir);
+ ),
+ TP_printk("recdir=%s",
+ __get_str(recdir)
+ )
+);
+
+TRACE_EVENT(nfsd_end_grace,
+ TP_PROTO(
+ const struct net *net
+ ),
+ TP_ARGS(net),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = net->ns.inum;
+ ),
+ TP_printk("nn=%d", __entry->netns_ino
+ )
+);
+
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index db67f8e19344..59b7d60ae33e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -388,7 +388,9 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
iap->ia_mode &= ~S_ISGID;
} else {
/* set ATTR_KILL_* bits and let VFS handle it */
- iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID);
+ iap->ia_valid |= ATTR_KILL_SUID;
+ iap->ia_valid |=
+ setattr_should_drop_sgid(&nop_mnt_idmap, inode);
}
}
}
@@ -1001,6 +1003,18 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
}
+/**
+ * nfsd_splice_read - Perform a VFS read using a splice pipe
+ * @rqstp: RPC transaction context
+ * @fhp: file handle of file to be read
+ * @file: opened struct file of file to be read
+ * @offset: starting byte offset
+ * @count: IN: requested number of bytes; OUT: number of bytes read
+ * @eof: OUT: set non-zero if operation reached the end of the file
+ *
+ * Returns nfs_ok on success, otherwise an nfserr stat value is
+ * returned.
+ */
__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset, unsigned long *count,
u32 *eof)
@@ -1014,22 +1028,50 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
ssize_t host_err;
trace_nfsd_read_splice(rqstp, fhp, offset, *count);
- rqstp->rq_next_page = rqstp->rq_respages + 1;
host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
-__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct file *file, loff_t offset,
- struct kvec *vec, int vlen, unsigned long *count,
- u32 *eof)
+/**
+ * nfsd_iter_read - Perform a VFS read using an iterator
+ * @rqstp: RPC transaction context
+ * @fhp: file handle of file to be read
+ * @file: opened struct file of file to be read
+ * @offset: starting byte offset
+ * @count: IN: requested number of bytes; OUT: number of bytes read
+ * @base: offset in first page of read buffer
+ * @eof: OUT: set non-zero if operation reached the end of the file
+ *
+ * Some filesystems or situations cannot use nfsd_splice_read. This
+ * function is the slightly less-performant fallback for those cases.
+ *
+ * Returns nfs_ok on success, otherwise an nfserr stat value is
+ * returned.
+ */
+__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file *file, loff_t offset, unsigned long *count,
+ unsigned int base, u32 *eof)
{
+ unsigned long v, total;
struct iov_iter iter;
loff_t ppos = offset;
+ struct page *page;
ssize_t host_err;
+ v = 0;
+ total = *count;
+ while (total) {
+ page = *(rqstp->rq_next_page++);
+ rqstp->rq_vec[v].iov_base = page_address(page) + base;
+ rqstp->rq_vec[v].iov_len = min_t(size_t, total, PAGE_SIZE - base);
+ total -= rqstp->rq_vec[v].iov_len;
+ ++v;
+ base = 0;
+ }
+ WARN_ON_ONCE(v > ARRAY_SIZE(rqstp->rq_vec));
+
trace_nfsd_read_vector(rqstp, fhp, offset, *count);
- iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count);
+ iov_iter_kvec(&iter, ITER_DEST, rqstp->rq_vec, v, *count);
host_err = vfs_iter_read(file, &iter, &ppos, 0);
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
@@ -1159,14 +1201,24 @@ out_nfserr:
return nfserr;
}
-/*
- * Read data from a file. count must contain the requested read count
- * on entry. On return, *count contains the number of bytes actually read.
+/**
+ * nfsd_read - Read data from a file
+ * @rqstp: RPC transaction context
+ * @fhp: file handle of file to be read
+ * @offset: starting byte offset
+ * @count: IN: requested number of bytes; OUT: number of bytes read
+ * @eof: OUT: set non-zero if operation reached the end of the file
+ *
+ * The caller must verify that there is enough space in @rqstp.rq_res
+ * to perform this operation.
+ *
* N.B. After this call fhp needs an fh_put
+ *
+ * Returns nfs_ok on success, otherwise an nfserr stat value is
+ * returned.
*/
__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
- loff_t offset, struct kvec *vec, int vlen, unsigned long *count,
- u32 *eof)
+ loff_t offset, unsigned long *count, u32 *eof)
{
struct nfsd_file *nf;
struct file *file;
@@ -1181,12 +1233,10 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
else
- err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof);
+ err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof);
nfsd_file_put(nf);
-
trace_nfsd_read_done(rqstp, fhp, offset, *count);
-
return err;
}
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 43fb57a301d3..a6890ea7b765 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -110,13 +110,12 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
unsigned long *count,
u32 *eof);
-__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
+__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
- struct kvec *vec, int vlen,
- unsigned long *count,
+ unsigned long *count, unsigned int base,
u32 *eof);
-__be32 nfsd_read(struct svc_rqst *, struct svc_fh *,
- loff_t, struct kvec *, int, unsigned long *,
+__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ loff_t offset, unsigned long *count,
u32 *eof);
__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
struct kvec *, int, unsigned long *,
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index e956f886a1a1..5710833ac1cc 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -285,6 +285,14 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc,
if (nbh == NULL) { /* blocksize == pagesize */
xa_erase_irq(&btnc->i_pages, newkey);
unlock_page(ctxt->bh->b_page);
- } else
- brelse(nbh);
+ } else {
+ /*
+ * When canceling a buffer that a prepare operation has
+ * allocated to copy a node block to another location, use
+ * nilfs_btnode_delete() to initialize and release the buffer
+ * so that the buffer flags will not be in an inconsistent
+ * state when it is reallocated.
+ */
+ nilfs_btnode_delete(nbh);
+ }
}
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 5cf30827f244..b4e54d079b7d 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -370,7 +370,15 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
struct folio *folio = fbatch.folios[i];
folio_lock(folio);
- nilfs_clear_dirty_page(&folio->page, silent);
+
+ /*
+ * This folio may have been removed from the address
+ * space by truncation or invalidation when the lock
+ * was acquired. Skip processing in that case.
+ */
+ if (likely(folio->mapping == mapping))
+ nilfs_clear_dirty_page(&folio->page, silent);
+
folio_unlock(folio);
}
folio_batch_release(&fbatch);
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 1362ccb64ec7..6e59dc19a732 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -101,6 +101,12 @@ int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf)
if (unlikely(!bh))
return -ENOMEM;
+ lock_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
+ }
+ unlock_buffer(bh);
nilfs_segbuf_add_segsum_buffer(segbuf, bh);
return 0;
}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index ac949fd7603f..c2553024bd25 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -981,10 +981,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
unsigned int isz, srsz;
bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
+
+ lock_buffer(bh_sr);
raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
isz = nilfs->ns_inode_size;
srsz = NILFS_SR_BYTES(isz);
+ raw_sr->sr_sum = 0; /* Ensure initialization within this update */
raw_sr->sr_bytes = cpu_to_le16(srsz);
raw_sr->sr_nongc_ctime
= cpu_to_le64(nilfs_doing_gc() ?
@@ -998,6 +1001,8 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
NILFS_SR_SUFILE_OFFSET(isz), 1);
memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz);
+ set_buffer_uptodate(bh_sr);
+ unlock_buffer(bh_sr);
}
static void nilfs_redirty_inodes(struct list_head *head)
@@ -1780,6 +1785,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(segbuf, logs, sb_list) {
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
+ clear_buffer_uptodate(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
@@ -1791,6 +1797,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
b_assoc_buffers) {
clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
+ clear_buffer_uptodate(bh);
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
bd_page = bh->b_page;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index dc359b56fdfa..2c6078a6b8ec 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -779,6 +779,15 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs)
goto out_header;
sui->ncleansegs -= nsegs - newnsegs;
+
+ /*
+ * If the sufile is successfully truncated, immediately adjust
+ * the segment allocation space while locking the semaphore
+ * "mi_sem" so that nilfs_sufile_alloc() never allocates
+ * segments in the truncated space.
+ */
+ sui->allocmax = newnsegs - 1;
+ sui->allocmin = 0;
}
kaddr = kmap_atomic(header_bh->b_page);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 77f1e5778d1c..9ba4933087af 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -372,10 +372,31 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
goto out;
}
nsbp = (void *)nsbh->b_data + offset;
- memset(nsbp, 0, nilfs->ns_blocksize);
+ lock_buffer(nsbh);
if (sb2i >= 0) {
+ /*
+ * The position of the second superblock only changes by 4KiB,
+ * which is larger than the maximum superblock data size
+ * (= 1KiB), so there is no need to use memmove() to allow
+ * overlap between source and destination.
+ */
memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize);
+
+ /*
+ * Zero fill after copy to avoid overwriting in case of move
+ * within the same block.
+ */
+ memset(nsbh->b_data, 0, offset);
+ memset((void *)nsbp + nilfs->ns_sbsize, 0,
+ nsbh->b_size - offset - nilfs->ns_sbsize);
+ } else {
+ memset(nsbh->b_data, 0, nsbh->b_size);
+ }
+ set_buffer_uptodate(nsbh);
+ unlock_buffer(nsbh);
+
+ if (sb2i >= 0) {
brelse(nilfs->ns_sbh[sb2i]);
nilfs->ns_sbh[sb2i] = nsbh;
nilfs->ns_sbp[sb2i] = nsbp;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 2894152a6b25..0f0667957c81 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -405,6 +405,18 @@ unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs)
100));
}
+/**
+ * nilfs_max_segment_count - calculate the maximum number of segments
+ * @nilfs: nilfs object
+ */
+static u64 nilfs_max_segment_count(struct the_nilfs *nilfs)
+{
+ u64 max_count = U64_MAX;
+
+ do_div(max_count, nilfs->ns_blocks_per_segment);
+ return min_t(u64, max_count, ULONG_MAX);
+}
+
void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs)
{
nilfs->ns_nsegments = nsegs;
@@ -414,6 +426,8 @@ void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs)
static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
struct nilfs_super_block *sbp)
{
+ u64 nsegments, nblocks;
+
if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
nilfs_err(nilfs->ns_sb,
"unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). Please check the version of mkfs.nilfs(2).",
@@ -457,7 +471,34 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
return -EINVAL;
}
- nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments));
+ nsegments = le64_to_cpu(sbp->s_nsegments);
+ if (nsegments > nilfs_max_segment_count(nilfs)) {
+ nilfs_err(nilfs->ns_sb,
+ "segment count %llu exceeds upper limit (%llu segments)",
+ (unsigned long long)nsegments,
+ (unsigned long long)nilfs_max_segment_count(nilfs));
+ return -EINVAL;
+ }
+
+ nblocks = sb_bdev_nr_blocks(nilfs->ns_sb);
+ if (nblocks) {
+ u64 min_block_count = nsegments * nilfs->ns_blocks_per_segment;
+ /*
+ * To avoid failing to mount early device images without a
+ * second superblock, exclude that block count from the
+ * "min_block_count" calculation.
+ */
+
+ if (nblocks < min_block_count) {
+ nilfs_err(nilfs->ns_sb,
+ "total number of segment blocks %llu exceeds device size (%llu blocks)",
+ (unsigned long long)min_block_count,
+ (unsigned long long)nblocks);
+ return -EINVAL;
+ }
+ }
+
+ nilfs_set_nsegments(nilfs, nsegments);
nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
return 0;
}
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index a3865bc4a0c6..f79408f9127a 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -2491,7 +2491,7 @@ conv_err_out:
* byte offset @ofs inside the attribute with the constant byte @val.
*
* This function is effectively like memset() applied to an ntfs attribute.
- * Note thie function actually only operates on the page cache pages belonging
+ * Note this function actually only operates on the page cache pages belonging
* to the ntfs attribute and it marks them dirty after doing the memset().
* Thus it relies on the vm dirty page write code paths to cause the modified
* pages to be written to the mft record/disk.
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index f9cb180b6f6b..761aaa0195d6 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -161,7 +161,7 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[],
*/
u8 *cb_end = cb_start + cb_size; /* End of cb. */
u8 *cb = cb_start; /* Current position in cb. */
- u8 *cb_sb_start = cb; /* Beginning of the current sb in the cb. */
+ u8 *cb_sb_start; /* Beginning of the current sb in the cb. */
u8 *cb_sb_end; /* End of current sb / beginning of next sb. */
/* Variables for uncompressed data / destination. */
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 48030899dc6e..0155f106ec34 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1955,36 +1955,38 @@ undo_alloc:
"attribute.%s", es);
NVolSetErrors(vol);
}
- a = ctx->attr;
+
if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
ntfs_error(vol->sb, "Failed to truncate mft data attribute "
"runlist.%s", es);
NVolSetErrors(vol);
}
- if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
- if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+ if (ctx) {
+ a = ctx->attr;
+ if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
+ if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu(
a->data.non_resident.mapping_pairs_offset),
old_alen - le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset),
+ a->data.non_resident.mapping_pairs_offset),
rl2, ll, -1, NULL)) {
- ntfs_error(vol->sb, "Failed to restore mapping pairs "
+ ntfs_error(vol->sb, "Failed to restore mapping pairs "
"array.%s", es);
- NVolSetErrors(vol);
- }
- if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
- ntfs_error(vol->sb, "Failed to restore attribute "
+ NVolSetErrors(vol);
+ }
+ if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
+ ntfs_error(vol->sb, "Failed to restore attribute "
"record.%s", es);
+ NVolSetErrors(vol);
+ }
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ } else if (IS_ERR(ctx->mrec)) {
+ ntfs_error(vol->sb, "Failed to restore attribute search "
+ "context.%s", es);
NVolSetErrors(vol);
}
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- } else if (IS_ERR(ctx->mrec)) {
- ntfs_error(vol->sb, "Failed to restore attribute search "
- "context.%s", es);
- NVolSetErrors(vol);
- }
- if (ctx)
ntfs_attr_put_search_ctx(ctx);
+ }
if (!IS_ERR(mrec))
unmap_mft_record(mft_ni);
up_write(&mft_ni->runlist.lock);
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 2643a08182e1..56a7d5bd33e4 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1620,7 +1620,7 @@ read_partial_attrdef_page:
memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT),
page_address(page), size);
ntfs_unmap_page(page);
- };
+ }
if (size == PAGE_SIZE) {
size = i_size & ~PAGE_MASK;
if (size)
@@ -1689,7 +1689,7 @@ read_partial_upcase_page:
memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT),
page_address(page), size);
ntfs_unmap_page(page);
- };
+ }
if (size == PAGE_SIZE) {
size = i_size & ~PAGE_MASK;
if (size)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index efb09de4343d..b173c36bcab3 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2100,14 +2100,20 @@ static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
struct ocfs2_space_resv sr;
int change_size = 1;
int cmd = OCFS2_IOC_RESVSP64;
+ int ret = 0;
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
if (!ocfs2_writes_unwritten_extents(osb))
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_KEEP_SIZE)
+ if (mode & FALLOC_FL_KEEP_SIZE) {
change_size = 0;
+ } else {
+ ret = inode_newsize_ok(inode, offset + len);
+ if (ret)
+ return ret;
+ }
if (mode & FALLOC_FL_PUNCH_HOLE)
cmd = OCFS2_IOC_UNRESVSP64;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 0b0e6a132101..988d1c076861 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -952,8 +952,10 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (!sb_has_quota_loaded(sb, type))
continue;
- oinfo = sb_dqinfo(sb, type)->dqi_priv;
- cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+ if (!sb_has_quota_suspended(sb, type)) {
+ oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+ }
inode = igrab(sb->s_dquot.files[type]);
/* Turn off quotas. This will remove all dquot structures from
* memory and so they will be automatically synced to global
diff --git a/fs/open.c b/fs/open.c
index 4478adcc4f3a..fb07b2840eb4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -700,10 +700,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
return do_fchmodat(AT_FDCWD, filename, mode);
}
-/**
- * setattr_vfsuid - check and set ia_fsuid attribute
- * @kuid: new inode owner
- *
+/*
* Check whether @kuid is valid and if so generate and set vfsuid_t in
* ia_vfsuid.
*
@@ -718,10 +715,7 @@ static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid)
return true;
}
-/**
- * setattr_vfsgid - check and set ia_fsgid attribute
- * @kgid: new inode owner
- *
+/*
* Check whether @kgid is valid and if so generate and set vfsgid_t in
* ia_vfsgid.
*
@@ -989,7 +983,6 @@ cleanup_file:
* @file: file pointer
* @dentry: pointer to dentry
* @open: open callback
- * @opened: state of open
*
* This can be used to finish opening a file passed to i_op->atomic_open().
*
@@ -1043,7 +1036,6 @@ EXPORT_SYMBOL(file_path);
* vfs_open - open the file at the given path
* @path: path to open
* @file: newly allocated file with f_flag initialized
- * @cred: credentials to use
*/
int vfs_open(const struct path *path, struct file *file)
{
@@ -1116,23 +1108,77 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode,
}
EXPORT_SYMBOL(dentry_create);
-struct file *open_with_fake_path(const struct path *path, int flags,
+/**
+ * kernel_file_open - open a file for kernel internal use
+ * @path: path of the file to open
+ * @flags: open flags
+ * @inode: the inode
+ * @cred: credentials for open
+ *
+ * Open a file for use by in-kernel consumers. The file is not accounted
+ * against nr_files and must not be installed into the file descriptor
+ * table.
+ *
+ * Return: Opened file on success, an error pointer on failure.
+ */
+struct file *kernel_file_open(const struct path *path, int flags,
struct inode *inode, const struct cred *cred)
{
- struct file *f = alloc_empty_file_noaccount(flags, cred);
- if (!IS_ERR(f)) {
- int error;
+ struct file *f;
+ int error;
- f->f_path = *path;
- error = do_dentry_open(f, inode, NULL);
- if (error) {
- fput(f);
- f = ERR_PTR(error);
- }
+ f = alloc_empty_file_noaccount(flags, cred);
+ if (IS_ERR(f))
+ return f;
+
+ f->f_path = *path;
+ error = do_dentry_open(f, inode, NULL);
+ if (error) {
+ fput(f);
+ f = ERR_PTR(error);
}
return f;
}
-EXPORT_SYMBOL(open_with_fake_path);
+EXPORT_SYMBOL_GPL(kernel_file_open);
+
+/**
+ * backing_file_open - open a backing file for kernel internal use
+ * @path: path of the file to open
+ * @flags: open flags
+ * @path: path of the backing file
+ * @cred: credentials for open
+ *
+ * Open a backing file for a stackable filesystem (e.g., overlayfs).
+ * @path may be on the stackable filesystem and backing inode on the
+ * underlying filesystem. In this case, we want to be able to return
+ * the @real_path of the backing inode. This is done by embedding the
+ * returned file into a container structure that also stores the path of
+ * the backing inode on the underlying filesystem, which can be
+ * retrieved using backing_file_real_path().
+ */
+struct file *backing_file_open(const struct path *path, int flags,
+ const struct path *real_path,
+ const struct cred *cred)
+{
+ struct file *f;
+ int error;
+
+ f = alloc_empty_backing_file(flags, cred);
+ if (IS_ERR(f))
+ return f;
+
+ f->f_path = *path;
+ path_get(real_path);
+ *backing_file_real_path(f) = *real_path;
+ error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
+ if (error) {
+ fput(f);
+ f = ERR_PTR(error);
+ }
+
+ return f;
+}
+EXPORT_SYMBOL_GPL(backing_file_open);
#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
@@ -1156,7 +1202,7 @@ inline struct open_how build_open_how(int flags, umode_t mode)
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
u64 flags = how->flags;
- u64 strip = FMODE_NONOTIFY | O_CLOEXEC;
+ u64 strip = __FMODE_NONOTIFY | O_CLOEXEC;
int lookup_flags = 0;
int acc_mode = ACC_MODE(flags);
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 7c04f033aadd..dbbb156c2d67 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -34,8 +34,8 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
return 'm';
}
-/* No atime modification nor notify on underlying */
-#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
+/* No atime modification on underlying */
+#define OVL_OPEN_FLAGS (O_NOATIME)
static struct file *ovl_open_realfile(const struct file *file,
const struct path *realpath)
@@ -61,8 +61,8 @@ static struct file *ovl_open_realfile(const struct file *file,
if (!inode_owner_or_capable(real_idmap, realinode))
flags &= ~O_NOATIME;
- realfile = open_with_fake_path(&file->f_path, flags, realinode,
- current_cred());
+ realfile = backing_file_open(&file->f_path, flags, realpath,
+ current_cred());
}
revert_creds(old_cred);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 4d0b278f5630..23686e8a06c4 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -329,8 +329,9 @@ static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs,
struct dentry *dentry, umode_t mode)
{
struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry };
- struct file *file = vfs_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, mode,
- O_LARGEFILE | O_WRONLY, current_cred());
+ struct file *file = kernel_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path,
+ mode, O_LARGEFILE | O_WRONLY,
+ current_cred());
int err = PTR_ERR_OR_ZERO(file);
pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
diff --git a/fs/pnode.c b/fs/pnode.c
index 3cede8b18c8b..e4d0340393d5 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -216,7 +216,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
static struct mount *last_dest, *first_source, *last_source, *dest_master;
static struct hlist_head *list;
-static inline bool peers(struct mount *m1, struct mount *m2)
+static inline bool peers(const struct mount *m1, const struct mount *m2)
{
return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
}
@@ -354,6 +354,46 @@ static inline int do_refcount_check(struct mount *mnt, int count)
return mnt_get_count(mnt) > count;
}
+/**
+ * propagation_would_overmount - check whether propagation from @from
+ * would overmount @to
+ * @from: shared mount
+ * @to: mount to check
+ * @mp: future mountpoint of @to on @from
+ *
+ * If @from propagates mounts to @to, @from and @to must either be peers
+ * or one of the masters in the hierarchy of masters of @to must be a
+ * peer of @from.
+ *
+ * If the root of the @to mount is equal to the future mountpoint @mp of
+ * the @to mount on @from then @to will be overmounted by whatever is
+ * propagated to it.
+ *
+ * Context: This function expects namespace_lock() to be held and that
+ * @mp is stable.
+ * Return: If @from overmounts @to, true is returned, false if not.
+ */
+bool propagation_would_overmount(const struct mount *from,
+ const struct mount *to,
+ const struct mountpoint *mp)
+{
+ if (!IS_MNT_SHARED(from))
+ return false;
+
+ if (IS_MNT_NEW(to))
+ return false;
+
+ if (to->mnt.mnt_root != mp->m_dentry)
+ return false;
+
+ for (const struct mount *m = to; m; m = m->mnt_master) {
+ if (peers(from, m))
+ return true;
+ }
+
+ return false;
+}
+
/*
* check if the mount 'mnt' can be unmounted successfully.
* @mnt: the mount to be checked for unmount
diff --git a/fs/pnode.h b/fs/pnode.h
index 988f1aa9b02a..0b02a6393891 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -53,4 +53,7 @@ struct mount *copy_tree(struct mount *, struct dentry *, int);
bool is_path_reachable(struct mount *, struct dentry *,
const struct path *root);
int count_mounts(struct mnt_namespace *ns, struct mount *mnt);
+bool propagation_would_overmount(const struct mount *from,
+ const struct mount *to,
+ const struct mountpoint *mp);
#endif /* _LINUX_PNODE_H */
diff --git a/fs/readdir.c b/fs/readdir.c
index 9c53edb60c03..b264ce60114d 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -131,7 +131,7 @@ struct old_linux_dirent {
unsigned long d_ino;
unsigned long d_offset;
unsigned short d_namlen;
- char d_name[1];
+ char d_name[];
};
struct readdir_callback {
@@ -208,7 +208,7 @@ struct linux_dirent {
unsigned long d_ino;
unsigned long d_off;
unsigned short d_reclen;
- char d_name[1];
+ char d_name[];
};
struct getdents_callback {
@@ -388,7 +388,7 @@ struct compat_old_linux_dirent {
compat_ulong_t d_ino;
compat_ulong_t d_offset;
unsigned short d_namlen;
- char d_name[1];
+ char d_name[];
};
struct compat_readdir_callback {
@@ -460,7 +460,7 @@ struct compat_linux_dirent {
compat_ulong_t d_ino;
compat_ulong_t d_off;
unsigned short d_reclen;
- char d_name[1];
+ char d_name[];
};
struct compat_getdents_callback {
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 1331a890f2f2..87ae4f0dc3aa 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -15,6 +15,7 @@
#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/dax.h>
+#include <linux/overflow.h>
#include "internal.h"
#include <linux/uaccess.h>
@@ -101,10 +102,12 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
bool write)
{
+ loff_t tmp;
+
if (unlikely(pos < 0 || len < 0))
return -EINVAL;
- if (unlikely((loff_t) (pos + len) < 0))
+ if (unlikely(check_add_overflow(pos, len, &tmp)))
return -EINVAL;
return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 5034b862cec2..b279f745466e 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h>
+#include <uapi/linux/ethtool.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -130,12 +131,14 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan)
struct TCP_Server_Info *server = chan->server;
seq_printf(m, "\n\n\t\tChannel: %d ConnectionId: 0x%llx"
- "\n\t\tNumber of credits: %d Dialect 0x%x"
+ "\n\t\tNumber of credits: %d,%d,%d Dialect 0x%x"
"\n\t\tTCP status: %d Instance: %d"
"\n\t\tLocal Users To Server: %d SecMode: 0x%x Req On Wire: %d"
"\n\t\tIn Send: %d In MaxReq Wait: %d",
i+1, server->conn_id,
server->credits,
+ server->echo_credits,
+ server->oplock_credits,
server->dialect,
server->tcpStatus,
server->reconnect_instance,
@@ -146,18 +149,62 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan)
atomic_read(&server->num_waiters));
}
+static inline const char *smb_speed_to_str(size_t bps)
+{
+ size_t mbps = bps / 1000 / 1000;
+
+ switch (mbps) {
+ case SPEED_10:
+ return "10Mbps";
+ case SPEED_100:
+ return "100Mbps";
+ case SPEED_1000:
+ return "1Gbps";
+ case SPEED_2500:
+ return "2.5Gbps";
+ case SPEED_5000:
+ return "5Gbps";
+ case SPEED_10000:
+ return "10Gbps";
+ case SPEED_14000:
+ return "14Gbps";
+ case SPEED_20000:
+ return "20Gbps";
+ case SPEED_25000:
+ return "25Gbps";
+ case SPEED_40000:
+ return "40Gbps";
+ case SPEED_50000:
+ return "50Gbps";
+ case SPEED_56000:
+ return "56Gbps";
+ case SPEED_100000:
+ return "100Gbps";
+ case SPEED_200000:
+ return "200Gbps";
+ case SPEED_400000:
+ return "400Gbps";
+ case SPEED_800000:
+ return "800Gbps";
+ default:
+ return "Unknown";
+ }
+}
+
static void
cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface)
{
struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr;
struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr;
- seq_printf(m, "\tSpeed: %zu bps\n", iface->speed);
+ seq_printf(m, "\tSpeed: %s\n", smb_speed_to_str(iface->speed));
seq_puts(m, "\t\tCapabilities: ");
if (iface->rdma_capable)
seq_puts(m, "rdma ");
if (iface->rss_capable)
seq_puts(m, "rss ");
+ if (!iface->rdma_capable && !iface->rss_capable)
+ seq_puts(m, "None");
seq_putc(m, '\n');
if (iface->sockaddr.ss_family == AF_INET)
seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr);
@@ -350,8 +397,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
atomic_read(&server->smbd_conn->mr_used_count));
skip_rdma:
#endif
- seq_printf(m, "\nNumber of credits: %d Dialect 0x%x",
- server->credits, server->dialect);
+ seq_printf(m, "\nNumber of credits: %d,%d,%d Dialect 0x%x",
+ server->credits,
+ server->echo_credits,
+ server->oplock_credits,
+ server->dialect);
if (server->compress_algorithm == SMB3_COMPRESS_LZNT1)
seq_printf(m, " COMPRESS_LZNT1");
else if (server->compress_algorithm == SMB3_COMPRESS_LZ77)
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 0d84bb1a8cd9..b212a4e16b39 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -970,43 +970,6 @@ release_iface(struct kref *ref)
kfree(iface);
}
-/*
- * compare two interfaces a and b
- * return 0 if everything matches.
- * return 1 if a has higher link speed, or rdma capable, or rss capable
- * return -1 otherwise.
- */
-static inline int
-iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b)
-{
- int cmp_ret = 0;
-
- WARN_ON(!a || !b);
- if (a->speed == b->speed) {
- if (a->rdma_capable == b->rdma_capable) {
- if (a->rss_capable == b->rss_capable) {
- cmp_ret = memcmp(&a->sockaddr, &b->sockaddr,
- sizeof(a->sockaddr));
- if (!cmp_ret)
- return 0;
- else if (cmp_ret > 0)
- return 1;
- else
- return -1;
- } else if (a->rss_capable > b->rss_capable)
- return 1;
- else
- return -1;
- } else if (a->rdma_capable > b->rdma_capable)
- return 1;
- else
- return -1;
- } else if (a->speed > b->speed)
- return 1;
- else
- return -1;
-}
-
struct cifs_chan {
unsigned int in_reconnect : 1; /* if session setup in progress for this channel */
struct TCP_Server_Info *server;
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index c1c704990b98..d127aded2f28 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -87,6 +87,7 @@ extern int cifs_handle_standard(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
extern int smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx);
extern int smb3_parse_opt(const char *options, const char *key, char **val);
+extern int cifs_ipaddr_cmp(struct sockaddr *srcaddr, struct sockaddr *rhs);
extern bool cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs);
extern int cifs_discard_remaining_data(struct TCP_Server_Info *server);
extern int cifs_call_async(struct TCP_Server_Info *server,
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 8e9a672320ab..9d16626e7a66 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -1288,6 +1288,56 @@ next_pdu:
module_put_and_kthread_exit(0);
}
+int
+cifs_ipaddr_cmp(struct sockaddr *srcaddr, struct sockaddr *rhs)
+{
+ struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr;
+ struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs;
+ struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
+ struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
+
+ switch (srcaddr->sa_family) {
+ case AF_UNSPEC:
+ switch (rhs->sa_family) {
+ case AF_UNSPEC:
+ return 0;
+ case AF_INET:
+ case AF_INET6:
+ return 1;
+ default:
+ return -1;
+ }
+ case AF_INET: {
+ switch (rhs->sa_family) {
+ case AF_UNSPEC:
+ return -1;
+ case AF_INET:
+ return memcmp(saddr4, vaddr4,
+ sizeof(struct sockaddr_in));
+ case AF_INET6:
+ return 1;
+ default:
+ return -1;
+ }
+ }
+ case AF_INET6: {
+ switch (rhs->sa_family) {
+ case AF_UNSPEC:
+ case AF_INET:
+ return -1;
+ case AF_INET6:
+ return memcmp(saddr6,
+ vaddr6,
+ sizeof(struct sockaddr_in6));
+ default:
+ return -1;
+ }
+ }
+ default:
+ return -1; /* don't expect to be here */
+ }
+}
+
/*
* Returns true if srcaddr isn't specified and rhs isn't specified, or
* if srcaddr is specified and matches the IP address of the rhs argument
@@ -4086,16 +4136,17 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
/* only send once per connect */
spin_lock(&tcon->tc_lock);
+ if (tcon->status == TID_GOOD) {
+ spin_unlock(&tcon->tc_lock);
+ return 0;
+ }
+
if (tcon->status != TID_NEW &&
tcon->status != TID_NEED_TCON) {
spin_unlock(&tcon->tc_lock);
return -EHOSTDOWN;
}
- if (tcon->status == TID_GOOD) {
- spin_unlock(&tcon->tc_lock);
- return 0;
- }
tcon->status = TID_IN_TCON;
spin_unlock(&tcon->tc_lock);
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index 2f93bf8c3325..2390b2fedd6a 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -575,16 +575,17 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
/* only send once per connect */
spin_lock(&tcon->tc_lock);
+ if (tcon->status == TID_GOOD) {
+ spin_unlock(&tcon->tc_lock);
+ return 0;
+ }
+
if (tcon->status != TID_NEW &&
tcon->status != TID_NEED_TCON) {
spin_unlock(&tcon->tc_lock);
return -EHOSTDOWN;
}
- if (tcon->status == TID_GOOD) {
- spin_unlock(&tcon->tc_lock);
- return 0;
- }
tcon->status = TID_IN_TCON;
spin_unlock(&tcon->tc_lock);
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index df88b8c04d03..051283386e22 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -4942,9 +4942,13 @@ oplock_break_ack:
* disconnected since oplock already released by the server
*/
if (!oplock_break_cancelled) {
- rc = tcon->ses->server->ops->oplock_response(tcon, persistent_fid,
+ /* check for server null since can race with kill_sb calling tree disconnect */
+ if (tcon->ses && tcon->ses->server) {
+ rc = tcon->ses->server->ops->oplock_response(tcon, persistent_fid,
volatile_fid, net_fid, cinode);
- cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
+ cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
+ } else
+ pr_warn_once("lease break not sent for unmounted share\n");
}
cifs_done_oplock_break(cinode);
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 6e3be58cfe49..a8bb9d00d33a 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -34,6 +34,8 @@ static int
change_conf(struct TCP_Server_Info *server)
{
server->credits += server->echo_credits + server->oplock_credits;
+ if (server->credits > server->max_credits)
+ server->credits = server->max_credits;
server->oplock_credits = server->echo_credits = 0;
switch (server->credits) {
case 0:
@@ -91,6 +93,7 @@ smb2_add_credits(struct TCP_Server_Info *server,
server->conn_id, server->hostname, *val,
add, server->in_flight);
}
+ WARN_ON_ONCE(server->in_flight == 0);
server->in_flight--;
if (server->in_flight == 0 &&
((optype & CIFS_OP_MASK) != CIFS_NEG_OP) &&
@@ -510,6 +513,43 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
return rsize;
}
+/*
+ * compare two interfaces a and b
+ * return 0 if everything matches.
+ * return 1 if a is rdma capable, or rss capable, or has higher link speed
+ * return -1 otherwise.
+ */
+static int
+iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b)
+{
+ int cmp_ret = 0;
+
+ WARN_ON(!a || !b);
+ if (a->rdma_capable == b->rdma_capable) {
+ if (a->rss_capable == b->rss_capable) {
+ if (a->speed == b->speed) {
+ cmp_ret = cifs_ipaddr_cmp((struct sockaddr *) &a->sockaddr,
+ (struct sockaddr *) &b->sockaddr);
+ if (!cmp_ret)
+ return 0;
+ else if (cmp_ret > 0)
+ return 1;
+ else
+ return -1;
+ } else if (a->speed > b->speed)
+ return 1;
+ else
+ return -1;
+ } else if (a->rss_capable > b->rss_capable)
+ return 1;
+ else
+ return -1;
+ } else if (a->rdma_capable > b->rdma_capable)
+ return 1;
+ else
+ return -1;
+}
+
static int
parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
size_t buf_len, struct cifs_ses *ses, bool in_mount)
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 7063b395d22f..17fe212ab895 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -1305,7 +1305,12 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
}
/* enough to enable echos and oplocks and one max size write */
- req->hdr.CreditRequest = cpu_to_le16(130);
+ if (server->credits >= server->max_credits)
+ req->hdr.CreditRequest = cpu_to_le16(0);
+ else
+ req->hdr.CreditRequest = cpu_to_le16(
+ min_t(int, server->max_credits -
+ server->credits, 130));
/* only one of SMB2 signing flags may be set in SMB2 request */
if (server->sign)
@@ -1899,7 +1904,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
rqst.rq_nvec = 2;
/* Need 64 for max size write so ask for more in case not there yet */
- req->hdr.CreditRequest = cpu_to_le16(64);
+ if (server->credits >= server->max_credits)
+ req->hdr.CreditRequest = cpu_to_le16(0);
+ else
+ req->hdr.CreditRequest = cpu_to_le16(
+ min_t(int, server->max_credits -
+ server->credits, 64));
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
@@ -4227,6 +4237,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
struct TCP_Server_Info *server;
struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
unsigned int total_len;
+ int credit_request;
cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
__func__, rdata->offset, rdata->bytes);
@@ -4258,7 +4269,13 @@ smb2_async_readv(struct cifs_readdata *rdata)
if (rdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
SMB2_MAX_BUFFER_SIZE));
- shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8);
+ credit_request = le16_to_cpu(shdr->CreditCharge) + 8;
+ if (server->credits >= server->max_credits)
+ shdr->CreditRequest = cpu_to_le16(0);
+ else
+ shdr->CreditRequest = cpu_to_le16(
+ min_t(int, server->max_credits -
+ server->credits, credit_request));
rc = adjust_credits(server, &rdata->credits, rdata->bytes);
if (rc)
@@ -4468,6 +4485,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
unsigned int total_len;
struct cifs_io_parms _io_parms;
struct cifs_io_parms *io_parms = NULL;
+ int credit_request;
if (!wdata->server)
server = wdata->server = cifs_pick_channel(tcon->ses);
@@ -4572,7 +4590,13 @@ smb2_async_writev(struct cifs_writedata *wdata,
if (wdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
SMB2_MAX_BUFFER_SIZE));
- shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8);
+ credit_request = le16_to_cpu(shdr->CreditCharge) + 8;
+ if (server->credits >= server->max_credits)
+ shdr->CreditRequest = cpu_to_le16(0);
+ else
+ shdr->CreditRequest = cpu_to_le16(
+ min_t(int, server->max_credits -
+ server->credits, credit_request));
rc = adjust_credits(server, &wdata->credits, io_parms->length);
if (rc)
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 24bdd5f4d3bc..0474d0bba0a2 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -55,7 +55,7 @@ alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
temp->pid = current->pid;
temp->command = cpu_to_le16(smb_buffer->Command);
cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
- /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
+ /* easier to use jiffies */
/* when mid allocated can be before when sent */
temp->when_alloc = jiffies;
temp->server = server;
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 4882a812ea86..2a717d158f02 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -294,6 +294,9 @@ bool ksmbd_conn_alive(struct ksmbd_conn *conn)
return true;
}
+#define SMB1_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb_hdr))
+#define SMB2_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb2_hdr) + 4)
+
/**
* ksmbd_conn_handler_loop() - session thread to listen on new smb requests
* @p: connection instance
@@ -350,6 +353,9 @@ int ksmbd_conn_handler_loop(void *p)
if (pdu_size > MAX_STREAM_PROT_LEN)
break;
+ if (pdu_size < SMB1_MIN_SUPPORTED_HEADER_SIZE)
+ break;
+
/* 4 for rfc1002 length field */
/* 1 for implied bcc[0] */
size = pdu_size + 4 + 1;
@@ -358,8 +364,6 @@ int ksmbd_conn_handler_loop(void *p)
break;
memcpy(conn->request_buf, hdr_buf, sizeof(hdr_buf));
- if (!ksmbd_smb_request(conn))
- break;
/*
* We already read 4 bytes to find out PDU size, now
@@ -377,6 +381,15 @@ int ksmbd_conn_handler_loop(void *p)
continue;
}
+ if (!ksmbd_smb_request(conn))
+ break;
+
+ if (((struct smb2_hdr *)smb2_get_msg(conn->request_buf))->ProtocolId ==
+ SMB2_PROTO_NUMBER) {
+ if (pdu_size < SMB2_MIN_SUPPORTED_HEADER_SIZE)
+ break;
+ }
+
if (!default_conn_ops.process_fn) {
pr_err("No connection request callback\n");
break;
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index db181bdad73a..844b303baf29 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -1415,56 +1415,38 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
*/
struct lease_ctx_info *parse_lease_state(void *open_req)
{
- char *data_offset;
struct create_context *cc;
- unsigned int next = 0;
- char *name;
- bool found = false;
struct smb2_create_req *req = (struct smb2_create_req *)open_req;
- struct lease_ctx_info *lreq = kzalloc(sizeof(struct lease_ctx_info),
- GFP_KERNEL);
+ struct lease_ctx_info *lreq;
+
+ cc = smb2_find_context_vals(req, SMB2_CREATE_REQUEST_LEASE, 4);
+ if (IS_ERR_OR_NULL(cc))
+ return NULL;
+
+ lreq = kzalloc(sizeof(struct lease_ctx_info), GFP_KERNEL);
if (!lreq)
return NULL;
- data_offset = (char *)req + le32_to_cpu(req->CreateContextsOffset);
- cc = (struct create_context *)data_offset;
- do {
- cc = (struct create_context *)((char *)cc + next);
- name = le16_to_cpu(cc->NameOffset) + (char *)cc;
- if (le16_to_cpu(cc->NameLength) != 4 ||
- strncmp(name, SMB2_CREATE_REQUEST_LEASE, 4)) {
- next = le32_to_cpu(cc->Next);
- continue;
- }
- found = true;
- break;
- } while (next != 0);
+ if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) {
+ struct create_lease_v2 *lc = (struct create_lease_v2 *)cc;
- if (found) {
- if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) {
- struct create_lease_v2 *lc = (struct create_lease_v2 *)cc;
-
- memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
- lreq->req_state = lc->lcontext.LeaseState;
- lreq->flags = lc->lcontext.LeaseFlags;
- lreq->duration = lc->lcontext.LeaseDuration;
- memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey,
- SMB2_LEASE_KEY_SIZE);
- lreq->version = 2;
- } else {
- struct create_lease *lc = (struct create_lease *)cc;
+ memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
+ lreq->req_state = lc->lcontext.LeaseState;
+ lreq->flags = lc->lcontext.LeaseFlags;
+ lreq->duration = lc->lcontext.LeaseDuration;
+ memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey,
+ SMB2_LEASE_KEY_SIZE);
+ lreq->version = 2;
+ } else {
+ struct create_lease *lc = (struct create_lease *)cc;
- memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
- lreq->req_state = lc->lcontext.LeaseState;
- lreq->flags = lc->lcontext.LeaseFlags;
- lreq->duration = lc->lcontext.LeaseDuration;
- lreq->version = 1;
- }
- return lreq;
+ memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
+ lreq->req_state = lc->lcontext.LeaseState;
+ lreq->flags = lc->lcontext.LeaseFlags;
+ lreq->duration = lc->lcontext.LeaseDuration;
+ lreq->version = 1;
}
-
- kfree(lreq);
- return NULL;
+ return lreq;
}
/**
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index f9b2e0f19b03..ced7a9e916f0 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -185,24 +185,31 @@ static void __handle_ksmbd_work(struct ksmbd_work *work,
goto send;
}
- if (conn->ops->check_user_session) {
- rc = conn->ops->check_user_session(work);
- if (rc < 0) {
- command = conn->ops->get_cmd_val(work);
- conn->ops->set_rsp_status(work,
- STATUS_USER_SESSION_DELETED);
- goto send;
- } else if (rc > 0) {
- rc = conn->ops->get_ksmbd_tcon(work);
+ do {
+ if (conn->ops->check_user_session) {
+ rc = conn->ops->check_user_session(work);
if (rc < 0) {
- conn->ops->set_rsp_status(work,
- STATUS_NETWORK_NAME_DELETED);
+ if (rc == -EINVAL)
+ conn->ops->set_rsp_status(work,
+ STATUS_INVALID_PARAMETER);
+ else
+ conn->ops->set_rsp_status(work,
+ STATUS_USER_SESSION_DELETED);
goto send;
+ } else if (rc > 0) {
+ rc = conn->ops->get_ksmbd_tcon(work);
+ if (rc < 0) {
+ if (rc == -EINVAL)
+ conn->ops->set_rsp_status(work,
+ STATUS_INVALID_PARAMETER);
+ else
+ conn->ops->set_rsp_status(work,
+ STATUS_NETWORK_NAME_DELETED);
+ goto send;
+ }
}
}
- }
- do {
rc = __process_request(work, conn, &command);
if (rc == SERVER_HANDLER_ABORT)
break;
diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c
index 0ffe663b7590..33b7e6c4ceff 100644
--- a/fs/smb/server/smb2misc.c
+++ b/fs/smb/server/smb2misc.c
@@ -351,9 +351,16 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
int command;
__u32 clc_len; /* calculated length */
__u32 len = get_rfc1002_len(work->request_buf);
+ __u32 req_struct_size, next_cmd = le32_to_cpu(hdr->NextCommand);
- if (le32_to_cpu(hdr->NextCommand) > 0)
- len = le32_to_cpu(hdr->NextCommand);
+ if ((u64)work->next_smb2_rcv_hdr_off + next_cmd > len) {
+ pr_err("next command(%u) offset exceeds smb msg size\n",
+ next_cmd);
+ return 1;
+ }
+
+ if (next_cmd > 0)
+ len = next_cmd;
else if (work->next_smb2_rcv_hdr_off)
len -= work->next_smb2_rcv_hdr_off;
@@ -373,17 +380,9 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
}
if (smb2_req_struct_sizes[command] != pdu->StructureSize2) {
- if (command != SMB2_OPLOCK_BREAK_HE &&
- (hdr->Status == 0 || pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) {
- /* error packets have 9 byte structure size */
- ksmbd_debug(SMB,
- "Illegal request size %u for command %d\n",
- le16_to_cpu(pdu->StructureSize2), command);
- return 1;
- } else if (command == SMB2_OPLOCK_BREAK_HE &&
- hdr->Status == 0 &&
- le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 &&
- le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) {
+ if (command == SMB2_OPLOCK_BREAK_HE &&
+ le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 &&
+ le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) {
/* special case for SMB2.1 lease break message */
ksmbd_debug(SMB,
"Illegal request size %d for oplock break\n",
@@ -392,6 +391,14 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
}
}
+ req_struct_size = le16_to_cpu(pdu->StructureSize2) +
+ __SMB2_HEADER_STRUCTURE_SIZE;
+ if (command == SMB2_LOCK_HE)
+ req_struct_size -= sizeof(struct smb2_lock_element);
+
+ if (req_struct_size > len + 1)
+ return 1;
+
if (smb2_calc_size(hdr, &clc_len))
return 1;
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 7a81541de602..da1787c68ba0 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -91,7 +91,6 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work)
unsigned int cmd = le16_to_cpu(req_hdr->Command);
int tree_id;
- work->tcon = NULL;
if (cmd == SMB2_TREE_CONNECT_HE ||
cmd == SMB2_CANCEL_HE ||
cmd == SMB2_LOGOFF_HE) {
@@ -105,10 +104,28 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work)
}
tree_id = le32_to_cpu(req_hdr->Id.SyncId.TreeId);
+
+ /*
+ * If request is not the first in Compound request,
+ * Just validate tree id in header with work->tcon->id.
+ */
+ if (work->next_smb2_rcv_hdr_off) {
+ if (!work->tcon) {
+ pr_err("The first operation in the compound does not have tcon\n");
+ return -EINVAL;
+ }
+ if (work->tcon->id != tree_id) {
+ pr_err("tree id(%u) is different with id(%u) in first operation\n",
+ tree_id, work->tcon->id);
+ return -EINVAL;
+ }
+ return 1;
+ }
+
work->tcon = ksmbd_tree_conn_lookup(work->sess, tree_id);
if (!work->tcon) {
pr_err("Invalid tid %d\n", tree_id);
- return -EINVAL;
+ return -ENOENT;
}
return 1;
@@ -547,7 +564,6 @@ int smb2_check_user_session(struct ksmbd_work *work)
unsigned int cmd = conn->ops->get_cmd_val(work);
unsigned long long sess_id;
- work->sess = NULL;
/*
* SMB2_ECHO, SMB2_NEGOTIATE, SMB2_SESSION_SETUP command do not
* require a session id, so no need to validate user session's for
@@ -558,15 +574,33 @@ int smb2_check_user_session(struct ksmbd_work *work)
return 0;
if (!ksmbd_conn_good(conn))
- return -EINVAL;
+ return -EIO;
sess_id = le64_to_cpu(req_hdr->SessionId);
+
+ /*
+ * If request is not the first in Compound request,
+ * Just validate session id in header with work->sess->id.
+ */
+ if (work->next_smb2_rcv_hdr_off) {
+ if (!work->sess) {
+ pr_err("The first operation in the compound does not have sess\n");
+ return -EINVAL;
+ }
+ if (work->sess->id != sess_id) {
+ pr_err("session id(%llu) is different with the first operation(%lld)\n",
+ sess_id, work->sess->id);
+ return -EINVAL;
+ }
+ return 1;
+ }
+
/* Check for validity of user session */
work->sess = ksmbd_session_lookup_all(conn, sess_id);
if (work->sess)
return 1;
ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id);
- return -EINVAL;
+ return -ENOENT;
}
static void destroy_previous_session(struct ksmbd_conn *conn,
@@ -963,13 +997,13 @@ static void decode_sign_cap_ctxt(struct ksmbd_conn *conn,
static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn,
struct smb2_negotiate_req *req,
- int len_of_smb)
+ unsigned int len_of_smb)
{
/* +4 is to account for the RFC1001 len field */
struct smb2_neg_context *pctx = (struct smb2_neg_context *)req;
int i = 0, len_of_ctxts;
- int offset = le32_to_cpu(req->NegotiateContextOffset);
- int neg_ctxt_cnt = le16_to_cpu(req->NegotiateContextCount);
+ unsigned int offset = le32_to_cpu(req->NegotiateContextOffset);
+ unsigned int neg_ctxt_cnt = le16_to_cpu(req->NegotiateContextCount);
__le32 status = STATUS_INVALID_PARAMETER;
ksmbd_debug(SMB, "decoding %d negotiate contexts\n", neg_ctxt_cnt);
@@ -983,7 +1017,7 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn,
while (i++ < neg_ctxt_cnt) {
int clen, ctxt_len;
- if (len_of_ctxts < sizeof(struct smb2_neg_context))
+ if (len_of_ctxts < (int)sizeof(struct smb2_neg_context))
break;
pctx = (struct smb2_neg_context *)((char *)pctx + offset);
@@ -1038,9 +1072,8 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn,
}
/* offsets must be 8 byte aligned */
- clen = (clen + 7) & ~0x7;
- offset = clen + sizeof(struct smb2_neg_context);
- len_of_ctxts -= clen + sizeof(struct smb2_neg_context);
+ offset = (ctxt_len + 7) & ~0x7;
+ len_of_ctxts -= offset;
}
return status;
}
@@ -2250,7 +2283,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
/* delete the EA only when it exits */
if (rc > 0) {
rc = ksmbd_vfs_remove_xattr(idmap,
- path->dentry,
+ path,
attr_name);
if (rc < 0) {
@@ -2264,8 +2297,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
/* if the EA doesn't exist, just do nothing. */
rc = 0;
} else {
- rc = ksmbd_vfs_setxattr(idmap,
- path->dentry, attr_name, value,
+ rc = ksmbd_vfs_setxattr(idmap, path, attr_name, value,
le16_to_cpu(eabuf->EaValueLength), 0);
if (rc < 0) {
ksmbd_debug(SMB,
@@ -2322,8 +2354,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path,
return -EBADF;
}
- rc = ksmbd_vfs_setxattr(idmap, path->dentry,
- xattr_stream_name, NULL, 0, 0);
+ rc = ksmbd_vfs_setxattr(idmap, path, xattr_stream_name, NULL, 0, 0);
if (rc < 0)
pr_err("Failed to store XATTR stream name :%d\n", rc);
return 0;
@@ -2351,7 +2382,7 @@ static int smb2_remove_smb_xattrs(const struct path *path)
if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
!strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX,
STREAM_PREFIX_LEN)) {
- err = ksmbd_vfs_remove_xattr(idmap, path->dentry,
+ err = ksmbd_vfs_remove_xattr(idmap, path,
name);
if (err)
ksmbd_debug(SMB, "remove xattr failed : %s\n",
@@ -2398,8 +2429,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path *
da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME |
XATTR_DOSINFO_ITIME;
- rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt),
- path->dentry, &da);
+ rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path, &da);
if (rc)
ksmbd_debug(SMB, "failed to store file attribute into xattr\n");
}
@@ -2973,7 +3003,7 @@ int smb2_open(struct ksmbd_work *work)
struct inode *inode = d_inode(path.dentry);
posix_acl_rc = ksmbd_vfs_inherit_posix_acl(idmap,
- path.dentry,
+ &path,
d_inode(path.dentry->d_parent));
if (posix_acl_rc)
ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc);
@@ -2989,7 +3019,7 @@ int smb2_open(struct ksmbd_work *work)
if (rc) {
if (posix_acl_rc)
ksmbd_vfs_set_init_posix_acl(idmap,
- path.dentry);
+ &path);
if (test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_ACL_XATTR)) {
@@ -3029,7 +3059,7 @@ int smb2_open(struct ksmbd_work *work)
rc = ksmbd_vfs_set_sd_xattr(conn,
idmap,
- path.dentry,
+ &path,
pntsd,
pntsd_size);
kfree(pntsd);
@@ -5465,7 +5495,7 @@ static int smb2_rename(struct ksmbd_work *work,
goto out;
rc = ksmbd_vfs_setxattr(file_mnt_idmap(fp->filp),
- fp->filp->f_path.dentry,
+ &fp->filp->f_path,
xattr_stream_name,
NULL, 0, 0);
if (rc < 0) {
@@ -5630,8 +5660,7 @@ static int set_file_basic_info(struct ksmbd_file *fp,
da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME |
XATTR_DOSINFO_ITIME;
- rc = ksmbd_vfs_set_dos_attrib_xattr(idmap,
- filp->f_path.dentry, &da);
+ rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, &filp->f_path, &da);
if (rc)
ksmbd_debug(SMB,
"failed to restore file attribute in EA\n");
@@ -7486,7 +7515,7 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id,
da.attr = le32_to_cpu(fp->f_ci->m_fattr);
ret = ksmbd_vfs_set_dos_attrib_xattr(idmap,
- fp->filp->f_path.dentry, &da);
+ &fp->filp->f_path, &da);
if (ret)
fp->f_ci->m_fattr = old_fattr;
}
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index af0c2a9b8529..569e5eecdf3d 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -158,7 +158,19 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work)
*/
bool ksmbd_smb_request(struct ksmbd_conn *conn)
{
- return conn->request_buf[0] == 0;
+ __le32 *proto = (__le32 *)smb2_get_msg(conn->request_buf);
+
+ if (*proto == SMB2_COMPRESSION_TRANSFORM_ID) {
+ pr_err_ratelimited("smb2 compression not support yet");
+ return false;
+ }
+
+ if (*proto != SMB1_PROTO_NUMBER &&
+ *proto != SMB2_PROTO_NUMBER &&
+ *proto != SMB2_TRANSFORM_PROTO_NUM)
+ return false;
+
+ return true;
}
static bool supported_protocol(int idx)
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index 6d6cfb6957a9..ad919a4239d0 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -1162,8 +1162,7 @@ pass:
pntsd_size += sizeof(struct smb_acl) + nt_size;
}
- ksmbd_vfs_set_sd_xattr(conn, idmap,
- path->dentry, pntsd, pntsd_size);
+ ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, pntsd_size);
kfree(pntsd);
}
@@ -1290,7 +1289,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) {
posix_acls = get_inode_acl(d_inode(path->dentry), ACL_TYPE_ACCESS);
- if (posix_acls && !found) {
+ if (!IS_ERR_OR_NULL(posix_acls) && !found) {
unsigned int id = -1;
pa_entry = posix_acls->a_entries;
@@ -1314,7 +1313,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
}
}
}
- if (posix_acls)
+ if (!IS_ERR_OR_NULL(posix_acls))
posix_acl_release(posix_acls);
}
@@ -1383,7 +1382,7 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
newattrs.ia_valid |= ATTR_MODE;
newattrs.ia_mode = (inode->i_mode & ~0777) | (fattr.cf_mode & 0777);
- ksmbd_vfs_remove_acl_xattrs(idmap, path->dentry);
+ ksmbd_vfs_remove_acl_xattrs(idmap, path);
/* Update posix acls */
if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) {
rc = set_posix_acl(idmap, path->dentry,
@@ -1414,9 +1413,8 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
if (test_share_config_flag(tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) {
/* Update WinACL in xattr */
- ksmbd_vfs_remove_sd_xattrs(idmap, path->dentry);
- ksmbd_vfs_set_sd_xattr(conn, idmap,
- path->dentry, pntsd, ntsd_len);
+ ksmbd_vfs_remove_sd_xattrs(idmap, path);
+ ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, ntsd_len);
}
out:
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 6f302919e9f7..81489fdedd8e 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -170,6 +170,10 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
return err;
}
+ err = mnt_want_write(path.mnt);
+ if (err)
+ goto out_err;
+
mode |= S_IFREG;
err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry),
dentry, mode, true);
@@ -179,6 +183,9 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
} else {
pr_err("File(%s): creation failed (err:%d)\n", name, err);
}
+ mnt_drop_write(path.mnt);
+
+out_err:
done_path_create(&path, dentry);
return err;
}
@@ -209,30 +216,35 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
return err;
}
+ err = mnt_want_write(path.mnt);
+ if (err)
+ goto out_err2;
+
idmap = mnt_idmap(path.mnt);
mode |= S_IFDIR;
err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode);
- if (err) {
- goto out;
- } else if (d_unhashed(dentry)) {
+ if (!err && d_unhashed(dentry)) {
struct dentry *d;
d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent,
dentry->d_name.len);
if (IS_ERR(d)) {
err = PTR_ERR(d);
- goto out;
+ goto out_err1;
}
if (unlikely(d_is_negative(d))) {
dput(d);
err = -ENOENT;
- goto out;
+ goto out_err1;
}
ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d));
dput(d);
}
-out:
+
+out_err1:
+ mnt_drop_write(path.mnt);
+out_err2:
done_path_create(&path, dentry);
if (err)
pr_err("mkdir(%s): creation failed (err:%d)\n", name, err);
@@ -443,7 +455,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos,
memcpy(&stream_buf[*pos], buf, count);
err = ksmbd_vfs_setxattr(idmap,
- fp->filp->f_path.dentry,
+ &fp->filp->f_path,
fp->stream.name,
(void *)stream_buf,
size,
@@ -589,6 +601,10 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path)
goto out_err;
}
+ err = mnt_want_write(path->mnt);
+ if (err)
+ goto out_err;
+
idmap = mnt_idmap(path->mnt);
if (S_ISDIR(d_inode(path->dentry)->i_mode)) {
err = vfs_rmdir(idmap, d_inode(parent), path->dentry);
@@ -599,6 +615,7 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path)
if (err)
ksmbd_debug(VFS, "unlink failed, err %d\n", err);
}
+ mnt_drop_write(path->mnt);
out_err:
ksmbd_revert_fsids(work);
@@ -644,11 +661,16 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname,
goto out3;
}
+ err = mnt_want_write(newpath.mnt);
+ if (err)
+ goto out3;
+
err = vfs_link(oldpath.dentry, mnt_idmap(newpath.mnt),
d_inode(newpath.dentry),
dentry, NULL);
if (err)
ksmbd_debug(VFS, "vfs_link failed err %d\n", err);
+ mnt_drop_write(newpath.mnt);
out3:
done_path_create(&newpath, dentry);
@@ -694,6 +716,10 @@ retry:
goto out2;
}
+ err = mnt_want_write(old_path->mnt);
+ if (err)
+ goto out2;
+
trap = lock_rename_child(old_child, new_path.dentry);
old_parent = dget(old_child->d_parent);
@@ -757,6 +783,7 @@ out4:
out3:
dput(old_parent);
unlock_rename(old_parent, new_path.dentry);
+ mnt_drop_write(old_path->mnt);
out2:
path_put(&new_path);
@@ -897,19 +924,24 @@ ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap,
* Return: 0 on success, otherwise error
*/
int ksmbd_vfs_setxattr(struct mnt_idmap *idmap,
- struct dentry *dentry, const char *attr_name,
+ const struct path *path, const char *attr_name,
void *attr_value, size_t attr_size, int flags)
{
int err;
+ err = mnt_want_write(path->mnt);
+ if (err)
+ return err;
+
err = vfs_setxattr(idmap,
- dentry,
+ path->dentry,
attr_name,
attr_value,
attr_size,
flags);
if (err)
ksmbd_debug(VFS, "setxattr failed, err %d\n", err);
+ mnt_drop_write(path->mnt);
return err;
}
@@ -1013,9 +1045,18 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
}
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
- struct dentry *dentry, char *attr_name)
+ const struct path *path, char *attr_name)
{
- return vfs_removexattr(idmap, dentry, attr_name);
+ int err;
+
+ err = mnt_want_write(path->mnt);
+ if (err)
+ return err;
+
+ err = vfs_removexattr(idmap, path->dentry, attr_name);
+ mnt_drop_write(path->mnt);
+
+ return err;
}
int ksmbd_vfs_unlink(struct file *filp)
@@ -1024,6 +1065,10 @@ int ksmbd_vfs_unlink(struct file *filp)
struct dentry *dir, *dentry = filp->f_path.dentry;
struct mnt_idmap *idmap = file_mnt_idmap(filp);
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+
dir = dget_parent(dentry);
err = ksmbd_vfs_lock_parent(dir, dentry);
if (err)
@@ -1041,6 +1086,7 @@ int ksmbd_vfs_unlink(struct file *filp)
ksmbd_debug(VFS, "failed to delete, err %d\n", err);
out:
dput(dir);
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
@@ -1244,13 +1290,13 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
}
int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap,
- struct dentry *dentry)
+ const struct path *path)
{
char *name, *xattr_list = NULL;
ssize_t xattr_list_len;
int err = 0;
- xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list);
+ xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list);
if (xattr_list_len < 0) {
goto out;
} else if (!xattr_list_len) {
@@ -1258,6 +1304,10 @@ int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap,
goto out;
}
+ err = mnt_want_write(path->mnt);
+ if (err)
+ goto out;
+
for (name = xattr_list; name - xattr_list < xattr_list_len;
name += strlen(name) + 1) {
ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name));
@@ -1266,25 +1316,26 @@ int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap,
sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) ||
!strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) {
- err = vfs_remove_acl(idmap, dentry, name);
+ err = vfs_remove_acl(idmap, path->dentry, name);
if (err)
ksmbd_debug(SMB,
"remove acl xattr failed : %s\n", name);
}
}
+ mnt_drop_write(path->mnt);
+
out:
kvfree(xattr_list);
return err;
}
-int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap,
- struct dentry *dentry)
+int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path)
{
char *name, *xattr_list = NULL;
ssize_t xattr_list_len;
int err = 0;
- xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list);
+ xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list);
if (xattr_list_len < 0) {
goto out;
} else if (!xattr_list_len) {
@@ -1297,7 +1348,7 @@ int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap,
ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name));
if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) {
- err = ksmbd_vfs_remove_xattr(idmap, dentry, name);
+ err = ksmbd_vfs_remove_xattr(idmap, path, name);
if (err)
ksmbd_debug(SMB, "remove xattr failed : %s\n", name);
}
@@ -1321,7 +1372,7 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct mnt_idmap *id
return NULL;
posix_acls = get_inode_acl(inode, acl_type);
- if (!posix_acls)
+ if (IS_ERR_OR_NULL(posix_acls))
return NULL;
smb_acl = kzalloc(sizeof(struct xattr_smb_acl) +
@@ -1374,13 +1425,14 @@ out:
int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
struct mnt_idmap *idmap,
- struct dentry *dentry,
+ const struct path *path,
struct smb_ntsd *pntsd, int len)
{
int rc;
struct ndr sd_ndr = {0}, acl_ndr = {0};
struct xattr_ntacl acl = {0};
struct xattr_smb_acl *smb_acl, *def_smb_acl = NULL;
+ struct dentry *dentry = path->dentry;
struct inode *inode = d_inode(dentry);
acl.version = 4;
@@ -1432,7 +1484,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
goto out;
}
- rc = ksmbd_vfs_setxattr(idmap, dentry,
+ rc = ksmbd_vfs_setxattr(idmap, path,
XATTR_NAME_SD, sd_ndr.data,
sd_ndr.offset, 0);
if (rc < 0)
@@ -1522,7 +1574,7 @@ free_n_data:
}
int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap,
- struct dentry *dentry,
+ const struct path *path,
struct xattr_dos_attrib *da)
{
struct ndr n;
@@ -1532,7 +1584,7 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap,
if (err)
return err;
- err = ksmbd_vfs_setxattr(idmap, dentry, XATTR_NAME_DOS_ATTRIBUTE,
+ err = ksmbd_vfs_setxattr(idmap, path, XATTR_NAME_DOS_ATTRIBUTE,
(void *)n.data, n.offset, 0);
if (err)
ksmbd_debug(SMB, "failed to store dos attribute in xattr\n");
@@ -1769,10 +1821,11 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock)
}
int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap,
- struct dentry *dentry)
+ struct path *path)
{
struct posix_acl_state acl_state;
struct posix_acl *acls;
+ struct dentry *dentry = path->dentry;
struct inode *inode = d_inode(dentry);
int rc;
@@ -1802,6 +1855,11 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap,
return -ENOMEM;
}
posix_state_to_acl(&acl_state, acls->a_entries);
+
+ rc = mnt_want_write(path->mnt);
+ if (rc)
+ goto out_err;
+
rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
@@ -1813,16 +1871,20 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap,
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
rc);
}
+ mnt_drop_write(path->mnt);
+
+out_err:
free_acl_state(&acl_state);
posix_acl_release(acls);
return rc;
}
int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *parent_inode)
+ struct path *path, struct inode *parent_inode)
{
struct posix_acl *acls;
struct posix_acl_entry *pace;
+ struct dentry *dentry = path->dentry;
struct inode *inode = d_inode(dentry);
int rc, i;
@@ -1830,7 +1892,7 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap,
return -EOPNOTSUPP;
acls = get_inode_acl(parent_inode, ACL_TYPE_DEFAULT);
- if (!acls)
+ if (IS_ERR_OR_NULL(acls))
return -ENOENT;
pace = acls->a_entries;
@@ -1841,6 +1903,10 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap,
}
}
+ rc = mnt_want_write(path->mnt);
+ if (rc)
+ goto out_err;
+
rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
@@ -1852,6 +1918,9 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap,
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
rc);
}
+ mnt_drop_write(path->mnt);
+
+out_err:
posix_acl_release(acls);
return rc;
}
diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h
index a4ae89f3230d..8c0931d4d531 100644
--- a/fs/smb/server/vfs.h
+++ b/fs/smb/server/vfs.h
@@ -108,12 +108,12 @@ ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap,
struct dentry *dentry, char *attr_name,
int attr_name_len);
int ksmbd_vfs_setxattr(struct mnt_idmap *idmap,
- struct dentry *dentry, const char *attr_name,
+ const struct path *path, const char *attr_name,
void *attr_value, size_t attr_size, int flags);
int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
size_t *xattr_stream_name_size, int s_type);
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
- struct dentry *dentry, char *attr_name);
+ const struct path *path, char *attr_name);
int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
unsigned int flags, struct path *path,
bool caseless);
@@ -139,26 +139,25 @@ void ksmbd_vfs_posix_lock_wait(struct file_lock *flock);
int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout);
void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock);
int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap,
- struct dentry *dentry);
-int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap,
- struct dentry *dentry);
+ const struct path *path);
+int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path);
int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
struct mnt_idmap *idmap,
- struct dentry *dentry,
+ const struct path *path,
struct smb_ntsd *pntsd, int len);
int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn,
struct mnt_idmap *idmap,
struct dentry *dentry,
struct smb_ntsd **pntsd);
int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap,
- struct dentry *dentry,
+ const struct path *path,
struct xattr_dos_attrib *da);
int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap,
struct dentry *dentry,
struct xattr_dos_attrib *da);
int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap,
- struct dentry *dentry);
+ struct path *path);
int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap,
- struct dentry *dentry,
+ struct path *path,
struct inode *parent_inode);
#endif /* __KSMBD_VFS_H__ */
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 2d0138e72d78..f41f8d6108ce 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -252,7 +252,7 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp)
if (ksmbd_stream_fd(fp) && (ci->m_flags & S_DEL_ON_CLS_STREAM)) {
ci->m_flags &= ~S_DEL_ON_CLS_STREAM;
err = ksmbd_vfs_remove_xattr(file_mnt_idmap(filp),
- filp->f_path.dentry,
+ &filp->f_path,
fp->stream.name);
if (err)
pr_err("remove xattr failed : %s\n",
diff --git a/fs/super.c b/fs/super.c
index 34afe411cf2b..48c29954d487 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
* If that happens we could trigger unregistering the shrinker from within the
- * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we
+ * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
* take a passive reference to the superblock to avoid this from occurring.
*/
static unsigned long super_cache_scan(struct shrinker *shrink,
@@ -595,7 +595,7 @@ retry:
fc->s_fs_info = NULL;
s->s_type = fc->fs_type;
s->s_iflags |= fc->s_iflags;
- strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id));
+ strscpy(s->s_id, s->s_type->name, sizeof(s->s_id));
list_add_tail(&s->s_list, &super_blocks);
hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
spin_unlock(&sb_lock);
@@ -674,7 +674,7 @@ retry:
return ERR_PTR(err);
}
s->s_type = type;
- strlcpy(s->s_id, type->name, sizeof(s->s_id));
+ strscpy(s->s_id, type->name, sizeof(s->s_id));
list_add_tail(&s->s_list, &super_blocks);
hlist_add_head(&s->s_instances, &type->fs_supers);
spin_unlock(&sb_lock);
@@ -903,6 +903,7 @@ int reconfigure_super(struct fs_context *fc)
struct super_block *sb = fc->root->d_sb;
int retval;
bool remount_ro = false;
+ bool remount_rw = false;
bool force = fc->sb_flags & SB_FORCE;
if (fc->sb_flags_mask & ~MS_RMT_MASK)
@@ -920,7 +921,7 @@ int reconfigure_super(struct fs_context *fc)
bdev_read_only(sb->s_bdev))
return -EACCES;
#endif
-
+ remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb);
remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb);
}
@@ -943,13 +944,18 @@ int reconfigure_super(struct fs_context *fc)
*/
if (remount_ro) {
if (force) {
- sb->s_readonly_remount = 1;
- smp_wmb();
+ sb_start_ro_state_change(sb);
} else {
retval = sb_prepare_remount_readonly(sb);
if (retval)
return retval;
}
+ } else if (remount_rw) {
+ /*
+ * Protect filesystem's reconfigure code from writes from
+ * userspace until reconfigure finishes.
+ */
+ sb_start_ro_state_change(sb);
}
if (fc->ops->reconfigure) {
@@ -965,9 +971,7 @@ int reconfigure_super(struct fs_context *fc)
WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) |
(fc->sb_flags & fc->sb_flags_mask)));
- /* Needs to be ordered wrt mnt_is_readonly() */
- smp_wmb();
- sb->s_readonly_remount = 0;
+ sb_end_ro_state_change(sb);
/*
* Some filesystems modify their metadata via some other path than the
@@ -982,7 +986,7 @@ int reconfigure_super(struct fs_context *fc)
return 0;
cancel_readonly:
- sb->s_readonly_remount = 0;
+ sb_end_ro_state_change(sb);
return retval;
}
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index cdb3d632c63d..0140010aa0c3 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -52,7 +52,7 @@ static int sysv_handle_dirsync(struct inode *dir)
}
/*
- * Calls to dir_get_page()/put_and_unmap_page() must be nested according to the
+ * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the
* rules documented in mm/highmem.rst.
*
* NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_page()
@@ -103,11 +103,11 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
fs16_to_cpu(SYSV_SB(sb), de->inode),
DT_UNKNOWN)) {
- put_and_unmap_page(page, kaddr);
+ unmap_and_put_page(page, kaddr);
return 0;
}
}
- put_and_unmap_page(page, kaddr);
+ unmap_and_put_page(page, kaddr);
}
return 0;
}
@@ -131,7 +131,7 @@ static inline int namecompare(int len, int maxlen,
* itself (as a parameter - res_dir). It does NOT read the inode of the
* entry - you'll have to do that yourself if you want to.
*
- * On Success put_and_unmap_page() should be called on *res_page.
+ * On Success unmap_and_put_page() should be called on *res_page.
*
* sysv_find_entry() acts as a call to dir_get_page() and must be treated
* accordingly for nesting purposes.
@@ -166,7 +166,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
name, de->name))
goto found;
}
- put_and_unmap_page(page, kaddr);
+ unmap_and_put_page(page, kaddr);
}
if (++n >= npages)
@@ -209,7 +209,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode)
goto out_page;
de++;
}
- put_and_unmap_page(page, kaddr);
+ unmap_and_put_page(page, kaddr);
}
BUG();
return -EINVAL;
@@ -228,7 +228,7 @@ got_it:
mark_inode_dirty(dir);
err = sysv_handle_dirsync(dir);
out_page:
- put_and_unmap_page(page, kaddr);
+ unmap_and_put_page(page, kaddr);
return err;
out_unlock:
unlock_page(page);
@@ -321,12 +321,12 @@ int sysv_empty_dir(struct inode * inode)
if (de->name[1] != '.' || de->name[2])
goto not_empty;
}
- put_and_unmap_page(page, kaddr);
+ unmap_and_put_page(page, kaddr);
}
return 1;
not_empty:
- put_and_unmap_page(page, kaddr);
+ unmap_and_put_page(page, kaddr);
return 0;
}
@@ -352,7 +352,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page,
}
/*
- * Calls to dir_get_page()/put_and_unmap_page() must be nested according to the
+ * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the
* rules documented in mm/highmem.rst.
*
* sysv_dotdot() acts as a call to dir_get_page() and must be treated
@@ -376,7 +376,7 @@ ino_t sysv_inode_by_name(struct dentry *dentry)
if (de) {
res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode);
- put_and_unmap_page(page, de);
+ unmap_and_put_page(page, de);
}
return res;
}
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index b22764fe669c..58d7f43a1371 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -145,6 +145,10 @@ static int alloc_branch(struct inode *inode,
*/
parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key);
bh = sb_getblk(inode->i_sb, parent);
+ if (!bh) {
+ sysv_free_block(inode->i_sb, branch[n].key);
+ break;
+ }
lock_buffer(bh);
memset(bh->b_data, 0, blocksize);
branch[n].bh = bh;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 2b2dba4c4f56..fcf163fea3ad 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -164,7 +164,7 @@ static int sysv_unlink(struct inode * dir, struct dentry * dentry)
inode->i_ctime = dir->i_ctime;
inode_dec_link_count(inode);
}
- put_and_unmap_page(page, de);
+ unmap_and_put_page(page, de);
return err;
}
@@ -227,7 +227,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (!new_de)
goto out_dir;
err = sysv_set_link(new_de, new_page, old_inode);
- put_and_unmap_page(new_page, new_de);
+ unmap_and_put_page(new_page, new_de);
if (err)
goto out_dir;
new_inode->i_ctime = current_time(new_inode);
@@ -256,9 +256,9 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir,
out_dir:
if (dir_de)
- put_and_unmap_page(dir_page, dir_de);
+ unmap_and_put_page(dir_page, dir_de);
out_old:
- put_and_unmap_page(old_page, old_de);
+ unmap_and_put_page(old_page, old_de);
out:
return err;
}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index fd20423d3ed2..fd29a66e7241 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -793,11 +793,6 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (!empty_dir(new_inode))
goto out_oiter;
}
- /*
- * We need to protect against old_inode getting converted from
- * ICB to normal directory.
- */
- inode_lock_nested(old_inode, I_MUTEX_NONDIR2);
retval = udf_fiiter_find_entry(old_inode, &dotdot_name,
&diriter);
if (retval == -ENOENT) {
@@ -806,10 +801,8 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
old_inode->i_ino);
retval = -EFSCORRUPTED;
}
- if (retval) {
- inode_unlock(old_inode);
+ if (retval)
goto out_oiter;
- }
has_diriter = true;
tloc = lelb_to_cpu(diriter.fi.icb.extLocation);
if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=
@@ -889,7 +882,6 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
udf_dir_entry_len(&diriter.fi));
udf_fiiter_write_fi(&diriter, NULL);
udf_fiiter_release(&diriter);
- inode_unlock(old_inode);
inode_dec_link_count(old_dir);
if (new_inode)
@@ -901,10 +893,8 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
return 0;
out_oiter:
- if (has_diriter) {
+ if (has_diriter)
udf_fiiter_release(&diriter);
- inode_unlock(old_inode);
- }
udf_fiiter_release(&oiter);
return retval;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0fd96d6e39ce..4e800bb7d2ab 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1332,6 +1332,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
bool basic_ioctls;
unsigned long start, end, vma_end;
struct vma_iterator vmi;
+ pgoff_t pgoff;
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1459,6 +1460,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vma_iter_set(&vmi, start);
prev = vma_prev(&vmi);
+ if (vma->vm_start < start)
+ prev = vma;
ret = 0;
for_each_vma_range(vmi, vma, end) {
@@ -1482,8 +1485,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vma_end = min(end, vma->vm_end);
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ vma->anon_vma, vma->vm_file, pgoff,
vma_policy(vma),
((struct vm_userfaultfd_ctx){ ctx }),
anon_vma_name(vma));
@@ -1563,6 +1567,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
struct vma_iterator vmi;
+ pgoff_t pgoff;
ret = -EFAULT;
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@ -1625,6 +1630,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
vma_iter_set(&vmi, start);
prev = vma_prev(&vmi);
+ if (vma->vm_start < start)
+ prev = vma;
+
ret = 0;
for_each_vma_range(vmi, vma, end) {
cond_resched();
@@ -1662,8 +1670,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
uffd_wp_range(vma, start, vma_end - start, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ vma->anon_vma, vma->vm_file, pgoff,
vma_policy(vma),
NULL_VM_UFFD_CTX, anon_vma_name(vma));
if (prev) {
diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig
index a7ffd718f171..e1036e535352 100644
--- a/fs/verity/Kconfig
+++ b/fs/verity/Kconfig
@@ -39,14 +39,14 @@ config FS_VERITY_BUILTIN_SIGNATURES
depends on FS_VERITY
select SYSTEM_DATA_VERIFICATION
help
- Support verifying signatures of verity files against the X.509
- certificates that have been loaded into the ".fs-verity"
- kernel keyring.
+ This option adds support for in-kernel verification of
+ fs-verity builtin signatures.
- This is meant as a relatively simple mechanism that can be
- used to provide an authenticity guarantee for verity files, as
- an alternative to IMA appraisal. Userspace programs still
- need to check that the verity bit is set in order to get an
- authenticity guarantee.
+ Please take great care before using this feature. It is not
+ the only way to do signatures with fs-verity, and the
+ alternatives (such as userspace signature verification, and
+ IMA appraisal) can be much better. For details about the
+ limitations of this feature, see
+ Documentation/filesystems/fsverity.rst.
If unsure, say N.
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index fc4c50e5219d..c284f46d1b53 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -7,6 +7,7 @@
#include "fsverity_private.h"
+#include <crypto/hash.h>
#include <linux/mount.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
@@ -20,7 +21,7 @@ struct block_buffer {
/* Hash a block, writing the result to the next level's pending block buffer. */
static int hash_one_block(struct inode *inode,
const struct merkle_tree_params *params,
- struct ahash_request *req, struct block_buffer *cur)
+ struct block_buffer *cur)
{
struct block_buffer *next = cur + 1;
int err;
@@ -36,8 +37,7 @@ static int hash_one_block(struct inode *inode,
/* Zero-pad the block if it's shorter than the block size. */
memset(&cur->data[cur->filled], 0, params->block_size - cur->filled);
- err = fsverity_hash_block(params, inode, req, virt_to_page(cur->data),
- offset_in_page(cur->data),
+ err = fsverity_hash_block(params, inode, cur->data,
&next->data[next->filled]);
if (err)
return err;
@@ -76,7 +76,6 @@ static int build_merkle_tree(struct file *filp,
struct inode *inode = file_inode(filp);
const u64 data_size = inode->i_size;
const int num_levels = params->num_levels;
- struct ahash_request *req;
struct block_buffer _buffers[1 + FS_VERITY_MAX_LEVELS + 1] = {};
struct block_buffer *buffers = &_buffers[1];
unsigned long level_offset[FS_VERITY_MAX_LEVELS];
@@ -90,9 +89,6 @@ static int build_merkle_tree(struct file *filp,
return 0;
}
- /* This allocation never fails, since it's mempool-backed. */
- req = fsverity_alloc_hash_request(params->hash_alg, GFP_KERNEL);
-
/*
* Allocate the block buffers. Buffer "-1" is for data blocks.
* Buffers 0 <= level < num_levels are for the actual tree levels.
@@ -130,7 +126,7 @@ static int build_merkle_tree(struct file *filp,
fsverity_err(inode, "Short read of file data");
goto out;
}
- err = hash_one_block(inode, params, req, &buffers[-1]);
+ err = hash_one_block(inode, params, &buffers[-1]);
if (err)
goto out;
for (level = 0; level < num_levels; level++) {
@@ -141,8 +137,7 @@ static int build_merkle_tree(struct file *filp,
}
/* Next block at @level is full */
- err = hash_one_block(inode, params, req,
- &buffers[level]);
+ err = hash_one_block(inode, params, &buffers[level]);
if (err)
goto out;
err = write_merkle_tree_block(inode,
@@ -162,8 +157,7 @@ static int build_merkle_tree(struct file *filp,
/* Finish all nonempty pending tree blocks. */
for (level = 0; level < num_levels; level++) {
if (buffers[level].filled != 0) {
- err = hash_one_block(inode, params, req,
- &buffers[level]);
+ err = hash_one_block(inode, params, &buffers[level]);
if (err)
goto out;
err = write_merkle_tree_block(inode,
@@ -183,7 +177,6 @@ static int build_merkle_tree(struct file *filp,
out:
for (level = -1; level < num_levels; level++)
kfree(buffers[level].data);
- fsverity_free_hash_request(params->hash_alg, req);
return err;
}
@@ -215,7 +208,7 @@ static int enable_verity(struct file *filp,
}
desc->salt_size = arg->salt_size;
- /* Get the signature if the user provided one */
+ /* Get the builtin signature if the user provided one */
if (arg->sig_size &&
copy_from_user(desc->signature, u64_to_user_ptr(arg->sig_ptr),
arg->sig_size)) {
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index d34dcc033d72..49bf3a1eb2a0 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -11,9 +11,6 @@
#define pr_fmt(fmt) "fs-verity: " fmt
#include <linux/fsverity.h>
-#include <linux/mempool.h>
-
-struct ahash_request;
/*
* Implementation limit: maximum depth of the Merkle tree. For now 8 is plenty;
@@ -23,11 +20,10 @@ struct ahash_request;
/* A hash algorithm supported by fs-verity */
struct fsverity_hash_alg {
- struct crypto_ahash *tfm; /* hash tfm, allocated on demand */
+ struct crypto_shash *tfm; /* hash tfm, allocated on demand */
const char *name; /* crypto API name, e.g. sha256 */
unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */
unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */
- mempool_t req_pool; /* mempool with a preallocated hash request */
/*
* The HASH_ALGO_* constant for this algorithm. This is different from
* FS_VERITY_HASH_ALG_*, which uses a different numbering scheme.
@@ -37,7 +33,7 @@ struct fsverity_hash_alg {
/* Merkle tree parameters: hash algorithm, initial hash state, and topology */
struct merkle_tree_params {
- struct fsverity_hash_alg *hash_alg; /* the hash algorithm */
+ const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */
const u8 *hashstate; /* initial hash state or NULL */
unsigned int digest_size; /* same as hash_alg->digest_size */
unsigned int block_size; /* size of data and tree blocks */
@@ -83,18 +79,13 @@ struct fsverity_info {
extern struct fsverity_hash_alg fsverity_hash_algs[];
-struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
- unsigned int num);
-struct ahash_request *fsverity_alloc_hash_request(struct fsverity_hash_alg *alg,
- gfp_t gfp_flags);
-void fsverity_free_hash_request(struct fsverity_hash_alg *alg,
- struct ahash_request *req);
-const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg,
+const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
+ unsigned int num);
+const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
const u8 *salt, size_t salt_size);
int fsverity_hash_block(const struct merkle_tree_params *params,
- const struct inode *inode, struct ahash_request *req,
- struct page *page, unsigned int offset, u8 *out);
-int fsverity_hash_buffer(struct fsverity_hash_alg *alg,
+ const struct inode *inode, const void *data, u8 *out);
+int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
const void *data, size_t size, u8 *out);
void __init fsverity_check_hash_algs(void);
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index ea00dbedf756..c598d2035476 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -8,7 +8,6 @@
#include "fsverity_private.h"
#include <crypto/hash.h>
-#include <linux/scatterlist.h>
/* The hash algorithms supported by fs-verity */
struct fsverity_hash_alg fsverity_hash_algs[] = {
@@ -40,11 +39,11 @@ static DEFINE_MUTEX(fsverity_hash_alg_init_mutex);
*
* Return: pointer to the hash alg on success, else an ERR_PTR()
*/
-struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
- unsigned int num)
+const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
+ unsigned int num)
{
struct fsverity_hash_alg *alg;
- struct crypto_ahash *tfm;
+ struct crypto_shash *tfm;
int err;
if (num >= ARRAY_SIZE(fsverity_hash_algs) ||
@@ -63,11 +62,7 @@ struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
if (alg->tfm != NULL)
goto out_unlock;
- /*
- * Using the shash API would make things a bit simpler, but the ahash
- * API is preferable as it allows the use of crypto accelerators.
- */
- tfm = crypto_alloc_ahash(alg->name, 0, 0);
+ tfm = crypto_alloc_shash(alg->name, 0, 0);
if (IS_ERR(tfm)) {
if (PTR_ERR(tfm) == -ENOENT) {
fsverity_warn(inode,
@@ -84,26 +79,20 @@ struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
}
err = -EINVAL;
- if (WARN_ON_ONCE(alg->digest_size != crypto_ahash_digestsize(tfm)))
+ if (WARN_ON_ONCE(alg->digest_size != crypto_shash_digestsize(tfm)))
goto err_free_tfm;
- if (WARN_ON_ONCE(alg->block_size != crypto_ahash_blocksize(tfm)))
- goto err_free_tfm;
-
- err = mempool_init_kmalloc_pool(&alg->req_pool, 1,
- sizeof(struct ahash_request) +
- crypto_ahash_reqsize(tfm));
- if (err)
+ if (WARN_ON_ONCE(alg->block_size != crypto_shash_blocksize(tfm)))
goto err_free_tfm;
pr_info("%s using implementation \"%s\"\n",
- alg->name, crypto_ahash_driver_name(tfm));
+ alg->name, crypto_shash_driver_name(tfm));
/* pairs with smp_load_acquire() above */
smp_store_release(&alg->tfm, tfm);
goto out_unlock;
err_free_tfm:
- crypto_free_ahash(tfm);
+ crypto_free_shash(tfm);
alg = ERR_PTR(err);
out_unlock:
mutex_unlock(&fsverity_hash_alg_init_mutex);
@@ -111,42 +100,6 @@ out_unlock:
}
/**
- * fsverity_alloc_hash_request() - allocate a hash request object
- * @alg: the hash algorithm for which to allocate the request
- * @gfp_flags: memory allocation flags
- *
- * This is mempool-backed, so this never fails if __GFP_DIRECT_RECLAIM is set in
- * @gfp_flags. However, in that case this might need to wait for all
- * previously-allocated requests to be freed. So to avoid deadlocks, callers
- * must never need multiple requests at a time to make forward progress.
- *
- * Return: the request object on success; NULL on failure (but see above)
- */
-struct ahash_request *fsverity_alloc_hash_request(struct fsverity_hash_alg *alg,
- gfp_t gfp_flags)
-{
- struct ahash_request *req = mempool_alloc(&alg->req_pool, gfp_flags);
-
- if (req)
- ahash_request_set_tfm(req, alg->tfm);
- return req;
-}
-
-/**
- * fsverity_free_hash_request() - free a hash request object
- * @alg: the hash algorithm
- * @req: the hash request object to free
- */
-void fsverity_free_hash_request(struct fsverity_hash_alg *alg,
- struct ahash_request *req)
-{
- if (req) {
- ahash_request_zero(req);
- mempool_free(req, &alg->req_pool);
- }
-}
-
-/**
* fsverity_prepare_hash_state() - precompute the initial hash state
* @alg: hash algorithm
* @salt: a salt which is to be prepended to all data to be hashed
@@ -155,27 +108,24 @@ void fsverity_free_hash_request(struct fsverity_hash_alg *alg,
* Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed
* initial hash state on success or an ERR_PTR() on failure.
*/
-const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg,
+const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
const u8 *salt, size_t salt_size)
{
u8 *hashstate = NULL;
- struct ahash_request *req = NULL;
+ SHASH_DESC_ON_STACK(desc, alg->tfm);
u8 *padded_salt = NULL;
size_t padded_salt_size;
- struct scatterlist sg;
- DECLARE_CRYPTO_WAIT(wait);
int err;
+ desc->tfm = alg->tfm;
+
if (salt_size == 0)
return NULL;
- hashstate = kmalloc(crypto_ahash_statesize(alg->tfm), GFP_KERNEL);
+ hashstate = kmalloc(crypto_shash_statesize(alg->tfm), GFP_KERNEL);
if (!hashstate)
return ERR_PTR(-ENOMEM);
- /* This allocation never fails, since it's mempool-backed. */
- req = fsverity_alloc_hash_request(alg, GFP_KERNEL);
-
/*
* Zero-pad the salt to the next multiple of the input size of the hash
* algorithm's compression function, e.g. 64 bytes for SHA-256 or 128
@@ -190,26 +140,18 @@ const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg,
goto err_free;
}
memcpy(padded_salt, salt, salt_size);
-
- sg_init_one(&sg, padded_salt, padded_salt_size);
- ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
- CRYPTO_TFM_REQ_MAY_BACKLOG,
- crypto_req_done, &wait);
- ahash_request_set_crypt(req, &sg, NULL, padded_salt_size);
-
- err = crypto_wait_req(crypto_ahash_init(req), &wait);
+ err = crypto_shash_init(desc);
if (err)
goto err_free;
- err = crypto_wait_req(crypto_ahash_update(req), &wait);
+ err = crypto_shash_update(desc, padded_salt, padded_salt_size);
if (err)
goto err_free;
- err = crypto_ahash_export(req, hashstate);
+ err = crypto_shash_export(desc, hashstate);
if (err)
goto err_free;
out:
- fsverity_free_hash_request(alg, req);
kfree(padded_salt);
return hashstate;
@@ -223,9 +165,7 @@ err_free:
* fsverity_hash_block() - hash a single data or hash block
* @params: the Merkle tree's parameters
* @inode: inode for which the hashing is being done
- * @req: preallocated hash request
- * @page: the page containing the block to hash
- * @offset: the offset of the block within @page
+ * @data: virtual address of a buffer containing the block to hash
* @out: output digest, size 'params->digest_size' bytes
*
* Hash a single data or hash block. The hash is salted if a salt is specified
@@ -234,33 +174,24 @@ err_free:
* Return: 0 on success, -errno on failure
*/
int fsverity_hash_block(const struct merkle_tree_params *params,
- const struct inode *inode, struct ahash_request *req,
- struct page *page, unsigned int offset, u8 *out)
+ const struct inode *inode, const void *data, u8 *out)
{
- struct scatterlist sg;
- DECLARE_CRYPTO_WAIT(wait);
+ SHASH_DESC_ON_STACK(desc, params->hash_alg->tfm);
int err;
- sg_init_table(&sg, 1);
- sg_set_page(&sg, page, params->block_size, offset);
- ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
- CRYPTO_TFM_REQ_MAY_BACKLOG,
- crypto_req_done, &wait);
- ahash_request_set_crypt(req, &sg, out, params->block_size);
+ desc->tfm = params->hash_alg->tfm;
if (params->hashstate) {
- err = crypto_ahash_import(req, params->hashstate);
+ err = crypto_shash_import(desc, params->hashstate);
if (err) {
fsverity_err(inode,
"Error %d importing hash state", err);
return err;
}
- err = crypto_ahash_finup(req);
+ err = crypto_shash_finup(desc, data, params->block_size, out);
} else {
- err = crypto_ahash_digest(req);
+ err = crypto_shash_digest(desc, data, params->block_size, out);
}
-
- err = crypto_wait_req(err, &wait);
if (err)
fsverity_err(inode, "Error %d computing block hash", err);
return err;
@@ -273,32 +204,12 @@ int fsverity_hash_block(const struct merkle_tree_params *params,
* @size: size of data to hash, in bytes
* @out: output digest, size 'alg->digest_size' bytes
*
- * Hash some data which is located in physically contiguous memory (i.e. memory
- * allocated by kmalloc(), not by vmalloc()). No salt is used.
- *
* Return: 0 on success, -errno on failure
*/
-int fsverity_hash_buffer(struct fsverity_hash_alg *alg,
+int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
const void *data, size_t size, u8 *out)
{
- struct ahash_request *req;
- struct scatterlist sg;
- DECLARE_CRYPTO_WAIT(wait);
- int err;
-
- /* This allocation never fails, since it's mempool-backed. */
- req = fsverity_alloc_hash_request(alg, GFP_KERNEL);
-
- sg_init_one(&sg, data, size);
- ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
- CRYPTO_TFM_REQ_MAY_BACKLOG,
- crypto_req_done, &wait);
- ahash_request_set_crypt(req, &sg, out, size);
-
- err = crypto_wait_req(crypto_ahash_digest(req), &wait);
-
- fsverity_free_hash_request(alg, req);
- return err;
+ return crypto_shash_tfm_digest(alg->tfm, data, size, out);
}
void __init fsverity_check_hash_algs(void)
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index 5c79ea1b2468..eec5956141da 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -61,27 +61,42 @@ EXPORT_SYMBOL_GPL(fsverity_ioctl_measure);
/**
* fsverity_get_digest() - get a verity file's digest
* @inode: inode to get digest of
- * @digest: (out) pointer to the digest
- * @alg: (out) pointer to the hash algorithm enumeration
+ * @raw_digest: (out) the raw file digest
+ * @alg: (out) the digest's algorithm, as a FS_VERITY_HASH_ALG_* value
+ * @halg: (out) the digest's algorithm, as a HASH_ALGO_* value
*
- * Return the file hash algorithm and digest of an fsverity protected file.
- * Assumption: before calling this, the file must have been opened.
+ * Retrieves the fsverity digest of the given file. The file must have been
+ * opened at least once since the inode was last loaded into the inode cache;
+ * otherwise this function will not recognize when fsverity is enabled.
*
- * Return: 0 on success, -errno on failure
+ * The file's fsverity digest consists of @raw_digest in combination with either
+ * @alg or @halg. (The caller can choose which one of @alg or @halg to use.)
+ *
+ * IMPORTANT: Callers *must* make use of one of the two algorithm IDs, since
+ * @raw_digest is meaningless without knowing which algorithm it uses! fsverity
+ * provides no security guarantee for users who ignore the algorithm ID, even if
+ * they use the digest size (since algorithms can share the same digest size).
+ *
+ * Return: The size of the raw digest in bytes, or 0 if the file doesn't have
+ * fsverity enabled.
*/
int fsverity_get_digest(struct inode *inode,
- u8 digest[FS_VERITY_MAX_DIGEST_SIZE],
- enum hash_algo *alg)
+ u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE],
+ u8 *alg, enum hash_algo *halg)
{
const struct fsverity_info *vi;
const struct fsverity_hash_alg *hash_alg;
vi = fsverity_get_info(inode);
if (!vi)
- return -ENODATA; /* not a verity file */
+ return 0; /* not a verity file */
hash_alg = vi->tree_params.hash_alg;
- memcpy(digest, vi->file_digest, hash_alg->digest_size);
- *alg = hash_alg->algo_id;
- return 0;
+ memcpy(raw_digest, vi->file_digest, hash_alg->digest_size);
+ if (alg)
+ *alg = hash_alg - fsverity_hash_algs;
+ if (halg)
+ *halg = hash_alg->algo_id;
+ return hash_alg->digest_size;
}
+EXPORT_SYMBOL_GPL(fsverity_get_digest);
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 52048b7630dc..1db5106a9c38 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -32,7 +32,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
unsigned int log_blocksize,
const u8 *salt, size_t salt_size)
{
- struct fsverity_hash_alg *hash_alg;
+ const struct fsverity_hash_alg *hash_alg;
int err;
u64 blocks;
u64 blocks_in_level[FS_VERITY_MAX_LEVELS];
@@ -156,9 +156,9 @@ out_err:
/*
* Compute the file digest by hashing the fsverity_descriptor excluding the
- * signature and with the sig_size field set to 0.
+ * builtin signature and with the sig_size field set to 0.
*/
-static int compute_file_digest(struct fsverity_hash_alg *hash_alg,
+static int compute_file_digest(const struct fsverity_hash_alg *hash_alg,
struct fsverity_descriptor *desc,
u8 *file_digest)
{
@@ -174,7 +174,7 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg,
/*
* Create a new fsverity_info from the given fsverity_descriptor (with optional
- * appended signature), and check the signature if present. The
+ * appended builtin signature), and check the signature if present. The
* fsverity_descriptor must have already undergone basic validation.
*/
struct fsverity_info *fsverity_create_info(const struct inode *inode,
@@ -319,8 +319,8 @@ static bool validate_fsverity_descriptor(struct inode *inode,
}
/*
- * Read the inode's fsverity_descriptor (with optional appended signature) from
- * the filesystem, and do basic validation of it.
+ * Read the inode's fsverity_descriptor (with optional appended builtin
+ * signature) from the filesystem, and do basic validation of it.
*/
int fsverity_get_descriptor(struct inode *inode,
struct fsverity_descriptor **desc_ret)
diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
index 2aefc5565152..f58432772d9e 100644
--- a/fs/verity/read_metadata.c
+++ b/fs/verity/read_metadata.c
@@ -105,7 +105,7 @@ static int fsverity_read_descriptor(struct inode *inode,
if (res)
return res;
- /* don't include the signature */
+ /* don't include the builtin signature */
desc_size = offsetof(struct fsverity_descriptor, signature);
desc->sig_size = 0;
@@ -131,7 +131,7 @@ static int fsverity_read_signature(struct inode *inode,
}
/*
- * Include only the signature. Note that fsverity_get_descriptor()
+ * Include only the builtin signature. fsverity_get_descriptor()
* already verified that sig_size is in-bounds.
*/
res = fsverity_read_buffer(buf, offset, length, desc->signature,
diff --git a/fs/verity/signature.c b/fs/verity/signature.c
index b8c51ad40d3a..72034bc71c9d 100644
--- a/fs/verity/signature.c
+++ b/fs/verity/signature.c
@@ -5,6 +5,14 @@
* Copyright 2019 Google LLC
*/
+/*
+ * This file implements verification of fs-verity builtin signatures. Please
+ * take great care before using this feature. It is not the only way to do
+ * signatures with fs-verity, and the alternatives (such as userspace signature
+ * verification, and IMA appraisal) can be much better. For details about the
+ * limitations of this feature, see Documentation/filesystems/fsverity.rst.
+ */
+
#include "fsverity_private.h"
#include <linux/cred.h>
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index e2508222750b..433cef51f5f6 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -12,38 +12,6 @@
static struct workqueue_struct *fsverity_read_workqueue;
-static inline int cmp_hashes(const struct fsverity_info *vi,
- const u8 *want_hash, const u8 *real_hash,
- u64 data_pos, int level)
-{
- const unsigned int hsize = vi->tree_params.digest_size;
-
- if (memcmp(want_hash, real_hash, hsize) == 0)
- return 0;
-
- fsverity_err(vi->inode,
- "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
- data_pos, level,
- vi->tree_params.hash_alg->name, hsize, want_hash,
- vi->tree_params.hash_alg->name, hsize, real_hash);
- return -EBADMSG;
-}
-
-static bool data_is_zeroed(struct inode *inode, struct page *page,
- unsigned int len, unsigned int offset)
-{
- void *virt = kmap_local_page(page);
-
- if (memchr_inv(virt + offset, 0, len)) {
- kunmap_local(virt);
- fsverity_err(inode,
- "FILE CORRUPTED! Data past EOF is not zeroed");
- return false;
- }
- kunmap_local(virt);
- return true;
-}
-
/*
* Returns true if the hash block with index @hblock_idx in the tree, located in
* @hpage, has already been verified.
@@ -122,9 +90,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
*/
static bool
verify_data_block(struct inode *inode, struct fsverity_info *vi,
- struct ahash_request *req, struct page *data_page,
- u64 data_pos, unsigned int dblock_offset_in_page,
- unsigned long max_ra_pages)
+ const void *data, u64 data_pos, unsigned long max_ra_pages)
{
const struct merkle_tree_params *params = &vi->tree_params;
const unsigned int hsize = params->digest_size;
@@ -136,11 +102,11 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
struct {
/* Page containing the hash block */
struct page *page;
+ /* Mapped address of the hash block (will be within @page) */
+ const void *addr;
/* Index of the hash block in the tree overall */
unsigned long index;
- /* Byte offset of the hash block within @page */
- unsigned int offset_in_page;
- /* Byte offset of the wanted hash within @page */
+ /* Byte offset of the wanted hash relative to @addr */
unsigned int hoffset;
} hblocks[FS_VERITY_MAX_LEVELS];
/*
@@ -148,7 +114,9 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
* index of that block's hash within the current level.
*/
u64 hidx = data_pos >> params->log_blocksize;
- int err;
+
+ /* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */
+ BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX);
if (unlikely(data_pos >= inode->i_size)) {
/*
@@ -159,8 +127,12 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
* any part past EOF should be all zeroes. Therefore, we need
* to verify that any data blocks fully past EOF are all zeroes.
*/
- return data_is_zeroed(inode, data_page, params->block_size,
- dblock_offset_in_page);
+ if (memchr_inv(data, 0, params->block_size)) {
+ fsverity_err(inode,
+ "FILE CORRUPTED! Data past EOF is not zeroed");
+ return false;
+ }
+ return true;
}
/*
@@ -175,6 +147,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
unsigned int hblock_offset_in_page;
unsigned int hoffset;
struct page *hpage;
+ const void *haddr;
/*
* The index of the block in the current level; also the index
@@ -192,30 +165,30 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
hblock_offset_in_page =
(hblock_idx << params->log_blocksize) & ~PAGE_MASK;
- /* Byte offset of the hash within the page */
- hoffset = hblock_offset_in_page +
- ((hidx << params->log_digestsize) &
- (params->block_size - 1));
+ /* Byte offset of the hash within the block */
+ hoffset = (hidx << params->log_digestsize) &
+ (params->block_size - 1);
hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode,
hpage_idx, level == 0 ? min(max_ra_pages,
params->tree_pages - hpage_idx) : 0);
if (IS_ERR(hpage)) {
- err = PTR_ERR(hpage);
fsverity_err(inode,
- "Error %d reading Merkle tree page %lu",
- err, hpage_idx);
- goto out;
+ "Error %ld reading Merkle tree page %lu",
+ PTR_ERR(hpage), hpage_idx);
+ goto error;
}
+ haddr = kmap_local_page(hpage) + hblock_offset_in_page;
if (is_hash_block_verified(vi, hpage, hblock_idx)) {
- memcpy_from_page(_want_hash, hpage, hoffset, hsize);
+ memcpy(_want_hash, haddr + hoffset, hsize);
want_hash = _want_hash;
+ kunmap_local(haddr);
put_page(hpage);
goto descend;
}
hblocks[level].page = hpage;
+ hblocks[level].addr = haddr;
hblocks[level].index = hblock_idx;
- hblocks[level].offset_in_page = hblock_offset_in_page;
hblocks[level].hoffset = hoffset;
hidx = next_hidx;
}
@@ -225,18 +198,14 @@ descend:
/* Descend the tree verifying hash blocks. */
for (; level > 0; level--) {
struct page *hpage = hblocks[level - 1].page;
+ const void *haddr = hblocks[level - 1].addr;
unsigned long hblock_idx = hblocks[level - 1].index;
- unsigned int hblock_offset_in_page =
- hblocks[level - 1].offset_in_page;
unsigned int hoffset = hblocks[level - 1].hoffset;
- err = fsverity_hash_block(params, inode, req, hpage,
- hblock_offset_in_page, real_hash);
- if (err)
- goto out;
- err = cmp_hashes(vi, want_hash, real_hash, data_pos, level - 1);
- if (err)
- goto out;
+ if (fsverity_hash_block(params, inode, haddr, real_hash) != 0)
+ goto error;
+ if (memcmp(want_hash, real_hash, hsize) != 0)
+ goto corrupted;
/*
* Mark the hash block as verified. This must be atomic and
* idempotent, as the same hash block might be verified by
@@ -246,29 +215,39 @@ descend:
set_bit(hblock_idx, vi->hash_block_verified);
else
SetPageChecked(hpage);
- memcpy_from_page(_want_hash, hpage, hoffset, hsize);
+ memcpy(_want_hash, haddr + hoffset, hsize);
want_hash = _want_hash;
+ kunmap_local(haddr);
put_page(hpage);
}
/* Finally, verify the data block. */
- err = fsverity_hash_block(params, inode, req, data_page,
- dblock_offset_in_page, real_hash);
- if (err)
- goto out;
- err = cmp_hashes(vi, want_hash, real_hash, data_pos, -1);
-out:
- for (; level > 0; level--)
- put_page(hblocks[level - 1].page);
+ if (fsverity_hash_block(params, inode, data, real_hash) != 0)
+ goto error;
+ if (memcmp(want_hash, real_hash, hsize) != 0)
+ goto corrupted;
+ return true;
- return err == 0;
+corrupted:
+ fsverity_err(inode,
+ "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
+ data_pos, level - 1,
+ params->hash_alg->name, hsize, want_hash,
+ params->hash_alg->name, hsize, real_hash);
+error:
+ for (; level > 0; level--) {
+ kunmap_local(hblocks[level - 1].addr);
+ put_page(hblocks[level - 1].page);
+ }
+ return false;
}
static bool
-verify_data_blocks(struct inode *inode, struct fsverity_info *vi,
- struct ahash_request *req, struct folio *data_folio,
- size_t len, size_t offset, unsigned long max_ra_pages)
+verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
+ unsigned long max_ra_pages)
{
+ struct inode *inode = data_folio->mapping->host;
+ struct fsverity_info *vi = inode->i_verity_info;
const unsigned int block_size = vi->tree_params.block_size;
u64 pos = (u64)data_folio->index << PAGE_SHIFT;
@@ -278,11 +257,14 @@ verify_data_blocks(struct inode *inode, struct fsverity_info *vi,
folio_test_uptodate(data_folio)))
return false;
do {
- struct page *data_page =
- folio_page(data_folio, offset >> PAGE_SHIFT);
-
- if (!verify_data_block(inode, vi, req, data_page, pos + offset,
- offset & ~PAGE_MASK, max_ra_pages))
+ void *data;
+ bool valid;
+
+ data = kmap_local_folio(data_folio, offset);
+ valid = verify_data_block(inode, vi, data, pos + offset,
+ max_ra_pages);
+ kunmap_local(data);
+ if (!valid)
return false;
offset += block_size;
len -= block_size;
@@ -304,19 +286,7 @@ verify_data_blocks(struct inode *inode, struct fsverity_info *vi,
*/
bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset)
{
- struct inode *inode = folio->mapping->host;
- struct fsverity_info *vi = inode->i_verity_info;
- struct ahash_request *req;
- bool valid;
-
- /* This allocation never fails, since it's mempool-backed. */
- req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS);
-
- valid = verify_data_blocks(inode, vi, req, folio, len, offset, 0);
-
- fsverity_free_hash_request(vi->tree_params.hash_alg, req);
-
- return valid;
+ return verify_data_blocks(folio, len, offset, 0);
}
EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
@@ -337,15 +307,9 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
*/
void fsverity_verify_bio(struct bio *bio)
{
- struct inode *inode = bio_first_page_all(bio)->mapping->host;
- struct fsverity_info *vi = inode->i_verity_info;
- struct ahash_request *req;
struct folio_iter fi;
unsigned long max_ra_pages = 0;
- /* This allocation never fails, since it's mempool-backed. */
- req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS);
-
if (bio->bi_opf & REQ_RAHEAD) {
/*
* If this bio is for data readahead, then we also do readahead
@@ -360,14 +324,12 @@ void fsverity_verify_bio(struct bio *bio)
}
bio_for_each_folio_all(fi, bio) {
- if (!verify_data_blocks(inode, vi, req, fi.folio, fi.length,
- fi.offset, max_ra_pages)) {
+ if (!verify_data_blocks(fi.folio, fi.length, fi.offset,
+ max_ra_pages)) {
bio->bi_status = BLK_STS_IOERR;
break;
}
}
-
- fsverity_free_hash_request(vi->tree_params.hash_alg, req);
}
EXPORT_SYMBOL_GPL(fsverity_verify_bio);
#endif /* CONFIG_BLOCK */
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 9b373a0c7aaf..ee84835ebc66 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -984,7 +984,10 @@ xfs_ag_shrink_space(
if (err2 != -ENOSPC)
goto resv_err;
- __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true);
+ err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
+ true);
+ if (err2)
+ goto resv_err;
/*
* Roll the transaction before trying to re-init the per-ag
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index fdfa08cbf4db..c20fe99405d8 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -628,6 +628,25 @@ xfs_alloc_fixup_trees(
return 0;
}
+/*
+ * We do not verify the AGFL contents against AGF-based index counters here,
+ * even though we may have access to the perag that contains shadow copies. We
+ * don't know if the AGF based counters have been checked, and if they have they
+ * still may be inconsistent because they haven't yet been reset on the first
+ * allocation after the AGF has been read in.
+ *
+ * This means we can only check that all agfl entries contain valid or null
+ * values because we can't reliably determine the active range to exclude
+ * NULLAGBNO as a valid value.
+ *
+ * However, we can't even do that for v4 format filesystems because there are
+ * old versions of mkfs out there that does not initialise the AGFL to known,
+ * verifiable values. HEnce we can't tell the difference between a AGFL block
+ * allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
+ *
+ * As a result, we can only fully validate AGFL block numbers when we pull them
+ * from the freelist in xfs_alloc_get_freelist().
+ */
static xfs_failaddr_t
xfs_agfl_verify(
struct xfs_buf *bp)
@@ -637,12 +656,6 @@ xfs_agfl_verify(
__be32 *agfl_bno = xfs_buf_to_agfl_bno(bp);
int i;
- /*
- * There is no verification of non-crc AGFLs because mkfs does not
- * initialise the AGFL to zero or NULL. Hence the only valid part of the
- * AGFL is what the AGF says is active. We can't get to the AGF, so we
- * can't verify just those entries are valid.
- */
if (!xfs_has_crc(mp))
return NULL;
@@ -2321,12 +2334,16 @@ xfs_free_agfl_block(
}
/*
- * Check the agfl fields of the agf for inconsistency or corruption. The purpose
- * is to detect an agfl header padding mismatch between current and early v5
- * kernels. This problem manifests as a 1-slot size difference between the
- * on-disk flcount and the active [first, last] range of a wrapped agfl. This
- * may also catch variants of agfl count corruption unrelated to padding. Either
- * way, we'll reset the agfl and warn the user.
+ * Check the agfl fields of the agf for inconsistency or corruption.
+ *
+ * The original purpose was to detect an agfl header padding mismatch between
+ * current and early v5 kernels. This problem manifests as a 1-slot size
+ * difference between the on-disk flcount and the active [first, last] range of
+ * a wrapped agfl.
+ *
+ * However, we need to use these same checks to catch agfl count corruptions
+ * unrelated to padding. This could occur on any v4 or v5 filesystem, so either
+ * way, we need to reset the agfl and warn the user.
*
* Return true if a reset is required before the agfl can be used, false
* otherwise.
@@ -2342,10 +2359,6 @@ xfs_agfl_needs_reset(
int agfl_size = xfs_agfl_size(mp);
int active;
- /* no agfl header on v4 supers */
- if (!xfs_has_crc(mp))
- return false;
-
/*
* The agf read verifier catches severe corruption of these fields.
* Repeat some sanity checks to cover a packed -> unpacked mismatch if
@@ -2418,7 +2431,7 @@ xfs_agfl_reset(
* the real allocation can proceed. Deferring the free disconnects freeing up
* the AGFL slot from freeing the block.
*/
-STATIC void
+static int
xfs_defer_agfl_block(
struct xfs_trans *tp,
xfs_agnumber_t agno,
@@ -2437,17 +2450,21 @@ xfs_defer_agfl_block(
xefi->xefi_blockcount = 1;
xefi->xefi_owner = oinfo->oi_owner;
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, xefi->xefi_startblock)))
+ return -EFSCORRUPTED;
+
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
xfs_extent_free_get_group(mp, xefi);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
+ return 0;
}
/*
* Add the extent to the list of extents to be free at transaction end.
* The list is maintained sorted (by block number).
*/
-void
+int
__xfs_free_extent_later(
struct xfs_trans *tp,
xfs_fsblock_t bno,
@@ -2474,6 +2491,9 @@ __xfs_free_extent_later(
#endif
ASSERT(xfs_extfree_item_cache != NULL);
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
+ return -EFSCORRUPTED;
+
xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
GFP_KERNEL | __GFP_NOFAIL);
xefi->xefi_startblock = bno;
@@ -2497,6 +2517,7 @@ __xfs_free_extent_later(
xfs_extent_free_get_group(mp, xefi);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
+ return 0;
}
#ifdef DEBUG
@@ -2657,7 +2678,9 @@ xfs_alloc_fix_freelist(
goto out_agbp_relse;
/* defer agfl frees */
- xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+ error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+ if (error)
+ goto out_agbp_relse;
}
targs.tp = tp;
@@ -2767,6 +2790,9 @@ xfs_alloc_get_freelist(
*/
agfl_bno = xfs_buf_to_agfl_bno(agflbp);
bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+ if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno)))
+ return -EFSCORRUPTED;
+
be32_add_cpu(&agf->agf_flfirst, 1);
xfs_trans_brelse(tp, agflbp);
if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
@@ -2889,6 +2915,19 @@ xfs_alloc_put_freelist(
return 0;
}
+/*
+ * Verify the AGF is consistent.
+ *
+ * We do not verify the AGFL indexes in the AGF are fully consistent here
+ * because of issues with variable on-disk structure sizes. Instead, we check
+ * the agfl indexes for consistency when we initialise the perag from the AGF
+ * information after a read completes.
+ *
+ * If the index is inconsistent, then we mark the perag as needing an AGFL
+ * reset. The first AGFL update performed then resets the AGFL indexes and
+ * refills the AGFL with known good free blocks, allowing the filesystem to
+ * continue operating normally at the cost of a few leaked free space blocks.
+ */
static xfs_failaddr_t
xfs_agf_verify(
struct xfs_buf *bp)
@@ -2962,7 +3001,6 @@ xfs_agf_verify(
return __this_address;
return NULL;
-
}
static void
@@ -3187,7 +3225,8 @@ xfs_alloc_vextent_check_args(
*/
static int
xfs_alloc_vextent_prepare_ag(
- struct xfs_alloc_arg *args)
+ struct xfs_alloc_arg *args,
+ uint32_t flags)
{
bool need_pag = !args->pag;
int error;
@@ -3196,7 +3235,7 @@ xfs_alloc_vextent_prepare_ag(
args->pag = xfs_perag_get(args->mp, args->agno);
args->agbp = NULL;
- error = xfs_alloc_fix_freelist(args, 0);
+ error = xfs_alloc_fix_freelist(args, flags);
if (error) {
trace_xfs_alloc_vextent_nofix(args);
if (need_pag)
@@ -3336,7 +3375,7 @@ xfs_alloc_vextent_this_ag(
return error;
}
- error = xfs_alloc_vextent_prepare_ag(args);
+ error = xfs_alloc_vextent_prepare_ag(args, 0);
if (!error && args->agbp)
error = xfs_alloc_ag_vextent_size(args);
@@ -3380,7 +3419,7 @@ restart:
for_each_perag_wrap_range(mp, start_agno, restart_agno,
mp->m_sb.sb_agcount, agno, args->pag) {
args->agno = agno;
- error = xfs_alloc_vextent_prepare_ag(args);
+ error = xfs_alloc_vextent_prepare_ag(args, flags);
if (error)
break;
if (!args->agbp) {
@@ -3546,7 +3585,7 @@ xfs_alloc_vextent_exact_bno(
return error;
}
- error = xfs_alloc_vextent_prepare_ag(args);
+ error = xfs_alloc_vextent_prepare_ag(args, 0);
if (!error && args->agbp)
error = xfs_alloc_ag_vextent_exact(args);
@@ -3587,7 +3626,7 @@ xfs_alloc_vextent_near_bno(
if (needs_perag)
args->pag = xfs_perag_grab(mp, args->agno);
- error = xfs_alloc_vextent_prepare_ag(args);
+ error = xfs_alloc_vextent_prepare_ag(args, 0);
if (!error && args->agbp)
error = xfs_alloc_ag_vextent_near(args);
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 5dbb25546d0b..85ac470be0da 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -230,7 +230,7 @@ xfs_buf_to_agfl_bno(
return bp->b_addr;
}
-void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
+int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
xfs_filblks_t len, const struct xfs_owner_info *oinfo,
bool skip_discard);
@@ -254,14 +254,14 @@ void xfs_extent_free_get_group(struct xfs_mount *mp,
#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
-static inline void
+static inline int
xfs_free_extent_later(
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_filblks_t len,
const struct xfs_owner_info *oinfo)
{
- __xfs_free_extent_later(tp, bno, len, oinfo, false);
+ return __xfs_free_extent_later(tp, bno, len, oinfo, false);
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index cd8870a16fd1..fef35696adb7 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -572,8 +572,12 @@ xfs_bmap_btree_to_extents(
cblock = XFS_BUF_TO_BLOCK(cbp);
if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
return error;
+
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
- xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
+ error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
+ if (error)
+ return error;
+
ip->i_nblocks--;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
xfs_trans_binval(tp, cbp);
@@ -5230,10 +5234,12 @@ xfs_bmap_del_extent_real(
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
} else {
- __xfs_free_extent_later(tp, del->br_startblock,
+ error = __xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
(bflags & XFS_BMAPI_NODISCARD) ||
del->br_state == XFS_EXT_UNWRITTEN);
+ if (error)
+ goto done;
}
}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 1b40e5f8b1ec..36564ae3084f 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -268,11 +268,14 @@ xfs_bmbt_free_block(
struct xfs_trans *tp = cur->bc_tp;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
struct xfs_owner_info oinfo;
+ int error;
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
- xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
- ip->i_nblocks--;
+ error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
+ if (error)
+ return error;
+ ip->i_nblocks--;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
return 0;
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index a16d5de16933..34600f94c2f4 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1834,7 +1834,7 @@ retry:
* might be sparse and only free the regions that are allocated as part of the
* chunk.
*/
-STATIC void
+static int
xfs_difree_inode_chunk(
struct xfs_trans *tp,
xfs_agnumber_t agno,
@@ -1851,10 +1851,10 @@ xfs_difree_inode_chunk(
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
- xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
- M_IGEO(mp)->ialloc_blks,
- &XFS_RMAP_OINFO_INODES);
- return;
+ return xfs_free_extent_later(tp,
+ XFS_AGB_TO_FSB(mp, agno, sagbno),
+ M_IGEO(mp)->ialloc_blks,
+ &XFS_RMAP_OINFO_INODES);
}
/* holemask is only 16-bits (fits in an unsigned long) */
@@ -1871,6 +1871,8 @@ xfs_difree_inode_chunk(
XFS_INOBT_HOLEMASK_BITS);
nextbit = startidx + 1;
while (startidx < XFS_INOBT_HOLEMASK_BITS) {
+ int error;
+
nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
nextbit);
/*
@@ -1896,8 +1898,11 @@ xfs_difree_inode_chunk(
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
- xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
- contigblk, &XFS_RMAP_OINFO_INODES);
+ error = xfs_free_extent_later(tp,
+ XFS_AGB_TO_FSB(mp, agno, agbno),
+ contigblk, &XFS_RMAP_OINFO_INODES);
+ if (error)
+ return error;
/* reset range to current bit and carry on... */
startidx = endidx = nextbit;
@@ -1905,6 +1910,7 @@ xfs_difree_inode_chunk(
next:
nextbit++;
}
+ return 0;
}
STATIC int
@@ -2003,7 +2009,9 @@ xfs_difree_inobt(
goto error0;
}
- xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
+ error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
+ if (error)
+ goto error0;
} else {
xic->deleted = false;
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index f13e0809dc63..269573c82808 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -324,7 +324,6 @@ struct xfs_inode_log_format_32 {
#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */
#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */
-
/*
* The timestamps are dirty, but not necessarily anything else in the inode
* core. Unlike the other fields above this one must never make it to disk
@@ -333,6 +332,14 @@ struct xfs_inode_log_format_32 {
*/
#define XFS_ILOG_TIMESTAMP 0x4000
+/*
+ * The version field has been changed, but not necessarily anything else of
+ * interest. This must never make it to disk - it is used purely to ensure that
+ * the inode item ->precommit operation can update the fsync flag triggers
+ * in the inode item correctly.
+ */
+#define XFS_ILOG_IVERSION 0x8000
+
#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index c1c65774dcc2..b6e21433925c 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1151,8 +1151,10 @@ xfs_refcount_adjust_extents(
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
tmp.rc_startblock);
- xfs_free_extent_later(cur->bc_tp, fsbno,
+ error = xfs_free_extent_later(cur->bc_tp, fsbno,
tmp.rc_blockcount, NULL);
+ if (error)
+ goto out_error;
}
(*agbno) += tmp.rc_blockcount;
@@ -1210,8 +1212,10 @@ xfs_refcount_adjust_extents(
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
ext.rc_startblock);
- xfs_free_extent_later(cur->bc_tp, fsbno,
+ error = xfs_free_extent_later(cur->bc_tp, fsbno,
ext.rc_blockcount, NULL);
+ if (error)
+ goto out_error;
}
skip:
@@ -1976,7 +1980,10 @@ xfs_refcount_recover_cow_leftovers(
rr->rr_rrec.rc_blockcount);
/* Free the block. */
- xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
+ error = xfs_free_extent_later(tp, fsb,
+ rr->rr_rrec.rc_blockcount, NULL);
+ if (error)
+ goto out_trans;
error = xfs_trans_commit(tp);
if (error)
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 8b5547073379..cb4796b6e693 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -40,9 +40,8 @@ xfs_trans_ijoin(
iip->ili_lock_flags = lock_flags;
ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
- /*
- * Get a log_item_desc to point at the new item.
- */
+ /* Reset the per-tx dirty context and add the item to the tx. */
+ iip->ili_dirty_flags = 0;
xfs_trans_add_item(tp, &iip->ili_item);
}
@@ -76,17 +75,10 @@ xfs_trans_ichgtime(
/*
* This is called to mark the fields indicated in fieldmask as needing to be
* logged when the transaction is committed. The inode must already be
- * associated with the given transaction.
- *
- * The values for fieldmask are defined in xfs_inode_item.h. We always log all
- * of the core inode if any of it has changed, and we always log all of the
- * inline data/extents/b-tree root if any of them has changed.
- *
- * Grab and pin the cluster buffer associated with this inode to avoid RMW
- * cycles at inode writeback time. Avoid the need to add error handling to every
- * xfs_trans_log_inode() call by shutting down on read error. This will cause
- * transactions to fail and everything to error out, just like if we return a
- * read error in a dirty transaction and cancel it.
+ * associated with the given transaction. All we do here is record where the
+ * inode was dirtied and mark the transaction and inode log item dirty;
+ * everything else is done in the ->precommit log item operation after the
+ * changes in the transaction have been completed.
*/
void
xfs_trans_log_inode(
@@ -96,7 +88,6 @@ xfs_trans_log_inode(
{
struct xfs_inode_log_item *iip = ip->i_itemp;
struct inode *inode = VFS_I(ip);
- uint iversion_flags = 0;
ASSERT(iip);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -105,18 +96,6 @@ xfs_trans_log_inode(
tp->t_flags |= XFS_TRANS_DIRTY;
/*
- * Don't bother with i_lock for the I_DIRTY_TIME check here, as races
- * don't matter - we either will need an extra transaction in 24 hours
- * to log the timestamps, or will clear already cleared fields in the
- * worst case.
- */
- if (inode->i_state & I_DIRTY_TIME) {
- spin_lock(&inode->i_lock);
- inode->i_state &= ~I_DIRTY_TIME;
- spin_unlock(&inode->i_lock);
- }
-
- /*
* First time we log the inode in a transaction, bump the inode change
* counter if it is configured for this to occur. While we have the
* inode locked exclusively for metadata modification, we can usually
@@ -128,86 +107,10 @@ xfs_trans_log_inode(
if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) {
if (IS_I_VERSION(inode) &&
inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE))
- iversion_flags = XFS_ILOG_CORE;
- }
-
- /*
- * If we're updating the inode core or the timestamps and it's possible
- * to upgrade this inode to bigtime format, do so now.
- */
- if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
- xfs_has_bigtime(ip->i_mount) &&
- !xfs_inode_has_bigtime(ip)) {
- ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
- flags |= XFS_ILOG_CORE;
- }
-
- /*
- * Inode verifiers do not check that the extent size hint is an integer
- * multiple of the rt extent size on a directory with both rtinherit
- * and extszinherit flags set. If we're logging a directory that is
- * misconfigured in this way, clear the hint.
- */
- if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
- (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
- ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
- XFS_DIFLAG_EXTSZINHERIT);
- ip->i_extsize = 0;
- flags |= XFS_ILOG_CORE;
+ flags |= XFS_ILOG_IVERSION;
}
- /*
- * Record the specific change for fdatasync optimisation. This allows
- * fdatasync to skip log forces for inodes that are only timestamp
- * dirty.
- */
- spin_lock(&iip->ili_lock);
- iip->ili_fsync_fields |= flags;
-
- if (!iip->ili_item.li_buf) {
- struct xfs_buf *bp;
- int error;
-
- /*
- * We hold the ILOCK here, so this inode is not going to be
- * flushed while we are here. Further, because there is no
- * buffer attached to the item, we know that there is no IO in
- * progress, so nothing will clear the ili_fields while we read
- * in the buffer. Hence we can safely drop the spin lock and
- * read the buffer knowing that the state will not change from
- * here.
- */
- spin_unlock(&iip->ili_lock);
- error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp);
- if (error) {
- xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR);
- return;
- }
-
- /*
- * We need an explicit buffer reference for the log item but
- * don't want the buffer to remain attached to the transaction.
- * Hold the buffer but release the transaction reference once
- * we've attached the inode log item to the buffer log item
- * list.
- */
- xfs_buf_hold(bp);
- spin_lock(&iip->ili_lock);
- iip->ili_item.li_buf = bp;
- bp->b_flags |= _XBF_INODES;
- list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
- xfs_trans_brelse(tp, bp);
- }
-
- /*
- * Always OR in the bits from the ili_last_fields field. This is to
- * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
- * in the eventual clearing of the ili_fields bits. See the big comment
- * in xfs_iflush() for an explanation of this coordination mechanism.
- */
- iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags);
- spin_unlock(&iip->ili_lock);
+ iip->ili_dirty_flags |= flags;
}
int
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 69bc89d0fc68..5bf4326e9783 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -769,14 +769,14 @@ xchk_are_bmaps_contiguous(
* mapping or false if there are no more mappings. Caller must ensure that
* @info.icur is zeroed before the first call.
*/
-static int
+static bool
xchk_bmap_iext_iter(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
struct xfs_bmbt_irec got;
struct xfs_ifork *ifp;
- xfs_filblks_t prev_len;
+ unsigned int nr = 0;
ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
@@ -790,12 +790,12 @@ xchk_bmap_iext_iter(
irec->br_startoff);
return false;
}
+ nr++;
/*
* Iterate subsequent iextent records and merge them with the one
* that we just read, if possible.
*/
- prev_len = irec->br_blockcount;
while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) {
if (!xchk_are_bmaps_contiguous(irec, &got))
break;
@@ -805,20 +805,21 @@ xchk_bmap_iext_iter(
got.br_startoff);
return false;
}
-
- /*
- * Notify the user of mergeable records in the data or attr
- * forks. CoW forks only exist in memory so we ignore them.
- */
- if (info->whichfork != XFS_COW_FORK &&
- prev_len + got.br_blockcount > BMBT_BLOCKCOUNT_MASK)
- xchk_ino_set_preen(info->sc, info->sc->ip->i_ino);
+ nr++;
irec->br_blockcount += got.br_blockcount;
- prev_len = got.br_blockcount;
xfs_iext_next(ifp, &info->icur);
}
+ /*
+ * If the merged mapping could be expressed with fewer bmbt records
+ * than we actually found, notify the user that this fork could be
+ * optimized. CoW forks only exist in memory so we ignore them.
+ */
+ if (nr > 1 && info->whichfork != XFS_COW_FORK &&
+ howmany_64(irec->br_blockcount, XFS_MAX_BMBT_EXTLEN) < nr)
+ xchk_ino_set_preen(info->sc, info->sc->ip->i_ino);
+
return true;
}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index b38e93830dde..e113f2f5c254 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -105,10 +105,10 @@ struct xfs_scrub {
};
/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
-#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */
-#define XCHK_FSGATES_DRAIN (1 << 2) /* defer ops draining enabled */
-#define XCHK_NEED_DRAIN (1 << 3) /* scrub needs to drain defer ops */
-#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */
+#define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */
+#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
+#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
+#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */
/*
* The XCHK_FSGATES* flags reflect functionality in the main filesystem that
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index df7322ed73fa..023d4e0385dd 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -452,10 +452,18 @@ xfs_buf_item_format(
* This is called to pin the buffer associated with the buf log item in memory
* so it cannot be written out.
*
- * We also always take a reference to the buffer log item here so that the bli
- * is held while the item is pinned in memory. This means that we can
- * unconditionally drop the reference count a transaction holds when the
- * transaction is completed.
+ * We take a reference to the buffer log item here so that the BLI life cycle
+ * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
+ * inserted into the AIL.
+ *
+ * We also need to take a reference to the buffer itself as the BLI unpin
+ * processing requires accessing the buffer after the BLI has dropped the final
+ * BLI reference. See xfs_buf_item_unpin() for an explanation.
+ * If unpins race to drop the final BLI reference and only the
+ * BLI owns a reference to the buffer, then the loser of the race can have the
+ * buffer fgreed from under it (e.g. on shutdown). Taking a buffer reference per
+ * pin count ensures the life cycle of the buffer extends for as
+ * long as we hold the buffer pin reference in xfs_buf_item_unpin().
*/
STATIC void
xfs_buf_item_pin(
@@ -470,13 +478,30 @@ xfs_buf_item_pin(
trace_xfs_buf_item_pin(bip);
+ xfs_buf_hold(bip->bli_buf);
atomic_inc(&bip->bli_refcount);
atomic_inc(&bip->bli_buf->b_pin_count);
}
/*
- * This is called to unpin the buffer associated with the buf log item which
- * was previously pinned with a call to xfs_buf_item_pin().
+ * This is called to unpin the buffer associated with the buf log item which was
+ * previously pinned with a call to xfs_buf_item_pin(). We enter this function
+ * with a buffer pin count, a buffer reference and a BLI reference.
+ *
+ * We must drop the BLI reference before we unpin the buffer because the AIL
+ * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
+ * refcount drops to zero, the bli could still be AIL resident and the buffer
+ * submitted for I/O at any point before we return. This can result in IO
+ * completion freeing the buffer while we are still trying to access it here.
+ * This race condition can also occur in shutdown situations where we abort and
+ * unpin buffers from contexts other that journal IO completion.
+ *
+ * Hence we have to hold a buffer reference per pin count to ensure that the
+ * buffer cannot be freed until we have finished processing the unpin operation.
+ * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
+ * are done processing the buffer state. In the case of an abort (remove =
+ * true) then we re-use the current pin reference as the IO reference we hand
+ * off to IO failure handling.
*/
STATIC void
xfs_buf_item_unpin(
@@ -493,24 +518,18 @@ xfs_buf_item_unpin(
trace_xfs_buf_item_unpin(bip);
- /*
- * Drop the bli ref associated with the pin and grab the hold required
- * for the I/O simulation failure in the abort case. We have to do this
- * before the pin count drops because the AIL doesn't acquire a bli
- * reference. Therefore if the refcount drops to zero, the bli could
- * still be AIL resident and the buffer submitted for I/O (and freed on
- * completion) at any point before we return. This can be removed once
- * the AIL properly holds a reference on the bli.
- */
freed = atomic_dec_and_test(&bip->bli_refcount);
- if (freed && !stale && remove)
- xfs_buf_hold(bp);
if (atomic_dec_and_test(&bp->b_pin_count))
wake_up_all(&bp->b_waiters);
- /* nothing to do but drop the pin count if the bli is active */
- if (!freed)
+ /*
+ * Nothing to do but drop the buffer pin reference if the BLI is
+ * still active.
+ */
+ if (!freed) {
+ xfs_buf_rele(bp);
return;
+ }
if (stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
@@ -523,6 +542,15 @@ xfs_buf_item_unpin(
trace_xfs_buf_item_unpin_stale(bip);
/*
+ * The buffer has been locked and referenced since it was marked
+ * stale so we own both lock and reference exclusively here. We
+ * do not need the pin reference any more, so drop it now so
+ * that we only have one reference to drop once item completion
+ * processing is complete.
+ */
+ xfs_buf_rele(bp);
+
+ /*
* If we get called here because of an IO error, we may or may
* not have the item on the AIL. xfs_trans_ail_delete() will
* take care of that situation. xfs_trans_ail_delete() drops
@@ -538,16 +566,30 @@ xfs_buf_item_unpin(
ASSERT(bp->b_log_item == NULL);
}
xfs_buf_relse(bp);
- } else if (remove) {
+ return;
+ }
+
+ if (remove) {
/*
- * The buffer must be locked and held by the caller to simulate
- * an async I/O failure. We acquired the hold for this case
- * before the buffer was unpinned.
+ * We need to simulate an async IO failures here to ensure that
+ * the correct error completion is run on this buffer. This
+ * requires a reference to the buffer and for the buffer to be
+ * locked. We can safely pass ownership of the pin reference to
+ * the IO to ensure that nothing can free the buffer while we
+ * wait for the lock and then run the IO failure completion.
*/
xfs_buf_lock(bp);
bp->b_flags |= XBF_ASYNC;
xfs_buf_ioend_fail(bp);
+ return;
}
+
+ /*
+ * BLI has no more active references - it will be moved to the AIL to
+ * manage the remaining BLI/buffer life cycle. There is nothing left for
+ * us to do here so drop the pin reference to the buffer.
+ */
+ xfs_buf_rele(bp);
}
STATIC uint
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 22c13933c8f8..2fc98d313708 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -78,7 +78,6 @@ restart:
*longest = 0;
err = xfs_bmap_longest_free_extent(pag, NULL, longest);
if (err) {
- xfs_perag_rele(pag);
if (err != -EAGAIN)
break;
/* Couldn't lock the AGF, skip this AG. */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0f60e301eb1f..453890942d9f 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -454,6 +454,27 @@ xfs_inodegc_queue_all(
return ret;
}
+/* Wait for all queued work and collect errors */
+static int
+xfs_inodegc_wait_all(
+ struct xfs_mount *mp)
+{
+ int cpu;
+ int error = 0;
+
+ flush_workqueue(mp->m_inodegc_wq);
+ for_each_online_cpu(cpu) {
+ struct xfs_inodegc *gc;
+
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ if (gc->error && !error)
+ error = gc->error;
+ gc->error = 0;
+ }
+
+ return error;
+}
+
/*
* Check the validity of the inode we just found it the cache
*/
@@ -1491,15 +1512,14 @@ xfs_blockgc_free_space(
if (error)
return error;
- xfs_inodegc_flush(mp);
- return 0;
+ return xfs_inodegc_flush(mp);
}
/*
* Reclaim all the free space that we can by scheduling the background blockgc
* and inodegc workers immediately and waiting for them all to clear.
*/
-void
+int
xfs_blockgc_flush_all(
struct xfs_mount *mp)
{
@@ -1520,7 +1540,7 @@ xfs_blockgc_flush_all(
for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
flush_delayed_work(&pag->pag_blockgc_work);
- xfs_inodegc_flush(mp);
+ return xfs_inodegc_flush(mp);
}
/*
@@ -1842,13 +1862,17 @@ xfs_inodegc_set_reclaimable(
* This is the last chance to make changes to an otherwise unreferenced file
* before incore reclamation happens.
*/
-static void
+static int
xfs_inodegc_inactivate(
struct xfs_inode *ip)
{
+ int error;
+
trace_xfs_inode_inactivating(ip);
- xfs_inactive(ip);
+ error = xfs_inactive(ip);
xfs_inodegc_set_reclaimable(ip);
+ return error;
+
}
void
@@ -1880,8 +1904,12 @@ xfs_inodegc_worker(
WRITE_ONCE(gc->shrinker_hits, 0);
llist_for_each_entry_safe(ip, n, node, i_gclist) {
+ int error;
+
xfs_iflags_set(ip, XFS_INACTIVATING);
- xfs_inodegc_inactivate(ip);
+ error = xfs_inodegc_inactivate(ip);
+ if (error && !gc->error)
+ gc->error = error;
}
memalloc_nofs_restore(nofs_flag);
@@ -1905,13 +1933,13 @@ xfs_inodegc_push(
* Force all currently queued inode inactivation work to run immediately and
* wait for the work to finish.
*/
-void
+int
xfs_inodegc_flush(
struct xfs_mount *mp)
{
xfs_inodegc_push(mp);
trace_xfs_inodegc_flush(mp, __return_address);
- flush_workqueue(mp->m_inodegc_wq);
+ return xfs_inodegc_wait_all(mp);
}
/*
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 87910191a9dd..1dcdcb23796e 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -62,7 +62,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp,
unsigned int iwalk_flags);
int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags);
int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm);
-void xfs_blockgc_flush_all(struct xfs_mount *mp);
+int xfs_blockgc_flush_all(struct xfs_mount *mp);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
@@ -80,7 +80,7 @@ void xfs_blockgc_start(struct xfs_mount *mp);
void xfs_inodegc_worker(struct work_struct *work);
void xfs_inodegc_push(struct xfs_mount *mp);
-void xfs_inodegc_flush(struct xfs_mount *mp);
+int xfs_inodegc_flush(struct xfs_mount *mp);
void xfs_inodegc_stop(struct xfs_mount *mp);
void xfs_inodegc_start(struct xfs_mount *mp);
void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5808abab786c..9e62cc500140 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1620,16 +1620,7 @@ xfs_inactive_ifree(
*/
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
- /*
- * Just ignore errors at this point. There is nothing we can do except
- * to try to keep going. Make sure it's not a silent error.
- */
- error = xfs_trans_commit(tp);
- if (error)
- xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
- __func__, error);
-
- return 0;
+ return xfs_trans_commit(tp);
}
/*
@@ -1693,12 +1684,12 @@ xfs_inode_needs_inactive(
* now be truncated. Also, we clear all of the read-ahead state
* kept for the inode here since the file is now closed.
*/
-void
+int
xfs_inactive(
xfs_inode_t *ip)
{
struct xfs_mount *mp;
- int error;
+ int error = 0;
int truncate = 0;
/*
@@ -1736,7 +1727,7 @@ xfs_inactive(
* reference to the inode at this point anyways.
*/
if (xfs_can_free_eofblocks(ip, true))
- xfs_free_eofblocks(ip);
+ error = xfs_free_eofblocks(ip);
goto out;
}
@@ -1773,7 +1764,7 @@ xfs_inactive(
/*
* Free the inode.
*/
- xfs_inactive_ifree(ip);
+ error = xfs_inactive_ifree(ip);
out:
/*
@@ -1781,6 +1772,7 @@ out:
* the attached dquots.
*/
xfs_qm_dqdetach(ip);
+ return error;
}
/*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 69d21e42c10a..7547caf2f2ab 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -470,7 +470,7 @@ enum layout_break_reason {
(xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID))
int xfs_release(struct xfs_inode *ip);
-void xfs_inactive(struct xfs_inode *ip);
+int xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
int xfs_create(struct mnt_idmap *idmap,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index ca2941ab6cbc..91c847a84e10 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -29,6 +29,153 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
return container_of(lip, struct xfs_inode_log_item, ili_item);
}
+static uint64_t
+xfs_inode_item_sort(
+ struct xfs_log_item *lip)
+{
+ return INODE_ITEM(lip)->ili_inode->i_ino;
+}
+
+/*
+ * Prior to finally logging the inode, we have to ensure that all the
+ * per-modification inode state changes are applied. This includes VFS inode
+ * state updates, format conversions, verifier state synchronisation and
+ * ensuring the inode buffer remains in memory whilst the inode is dirty.
+ *
+ * We have to be careful when we grab the inode cluster buffer due to lock
+ * ordering constraints. The unlinked inode modifications (xfs_iunlink_item)
+ * require AGI -> inode cluster buffer lock order. The inode cluster buffer is
+ * not locked until ->precommit, so it happens after everything else has been
+ * modified.
+ *
+ * Further, we have AGI -> AGF lock ordering, and with O_TMPFILE handling we
+ * have AGI -> AGF -> iunlink item -> inode cluster buffer lock order. Hence we
+ * cannot safely lock the inode cluster buffer in xfs_trans_log_inode() because
+ * it can be called on a inode (e.g. via bumplink/droplink) before we take the
+ * AGF lock modifying directory blocks.
+ *
+ * Rather than force a complete rework of all the transactions to call
+ * xfs_trans_log_inode() once and once only at the end of every transaction, we
+ * move the pinning of the inode cluster buffer to a ->precommit operation. This
+ * matches how the xfs_iunlink_item locks the inode cluster buffer, and it
+ * ensures that the inode cluster buffer locking is always done last in a
+ * transaction. i.e. we ensure the lock order is always AGI -> AGF -> inode
+ * cluster buffer.
+ *
+ * If we return the inode number as the precommit sort key then we'll also
+ * guarantee that the order all inode cluster buffer locking is the same all the
+ * inodes and unlink items in the transaction.
+ */
+static int
+xfs_inode_item_precommit(
+ struct xfs_trans *tp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+ struct inode *inode = VFS_I(ip);
+ unsigned int flags = iip->ili_dirty_flags;
+
+ /*
+ * Don't bother with i_lock for the I_DIRTY_TIME check here, as races
+ * don't matter - we either will need an extra transaction in 24 hours
+ * to log the timestamps, or will clear already cleared fields in the
+ * worst case.
+ */
+ if (inode->i_state & I_DIRTY_TIME) {
+ spin_lock(&inode->i_lock);
+ inode->i_state &= ~I_DIRTY_TIME;
+ spin_unlock(&inode->i_lock);
+ }
+
+ /*
+ * If we're updating the inode core or the timestamps and it's possible
+ * to upgrade this inode to bigtime format, do so now.
+ */
+ if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
+ xfs_has_bigtime(ip->i_mount) &&
+ !xfs_inode_has_bigtime(ip)) {
+ ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
+ flags |= XFS_ILOG_CORE;
+ }
+
+ /*
+ * Inode verifiers do not check that the extent size hint is an integer
+ * multiple of the rt extent size on a directory with both rtinherit
+ * and extszinherit flags set. If we're logging a directory that is
+ * misconfigured in this way, clear the hint.
+ */
+ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+ (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
+ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ ip->i_extsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
+
+ /*
+ * Record the specific change for fdatasync optimisation. This allows
+ * fdatasync to skip log forces for inodes that are only timestamp
+ * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it
+ * to XFS_ILOG_CORE so that the actual on-disk dirty tracking
+ * (ili_fields) correctly tracks that the version has changed.
+ */
+ spin_lock(&iip->ili_lock);
+ iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION);
+ if (flags & XFS_ILOG_IVERSION)
+ flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
+
+ if (!iip->ili_item.li_buf) {
+ struct xfs_buf *bp;
+ int error;
+
+ /*
+ * We hold the ILOCK here, so this inode is not going to be
+ * flushed while we are here. Further, because there is no
+ * buffer attached to the item, we know that there is no IO in
+ * progress, so nothing will clear the ili_fields while we read
+ * in the buffer. Hence we can safely drop the spin lock and
+ * read the buffer knowing that the state will not change from
+ * here.
+ */
+ spin_unlock(&iip->ili_lock);
+ error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp);
+ if (error)
+ return error;
+
+ /*
+ * We need an explicit buffer reference for the log item but
+ * don't want the buffer to remain attached to the transaction.
+ * Hold the buffer but release the transaction reference once
+ * we've attached the inode log item to the buffer log item
+ * list.
+ */
+ xfs_buf_hold(bp);
+ spin_lock(&iip->ili_lock);
+ iip->ili_item.li_buf = bp;
+ bp->b_flags |= _XBF_INODES;
+ list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
+ xfs_trans_brelse(tp, bp);
+ }
+
+ /*
+ * Always OR in the bits from the ili_last_fields field. This is to
+ * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
+ * in the eventual clearing of the ili_fields bits. See the big comment
+ * in xfs_iflush() for an explanation of this coordination mechanism.
+ */
+ iip->ili_fields |= (flags | iip->ili_last_fields);
+ spin_unlock(&iip->ili_lock);
+
+ /*
+ * We are done with the log item transaction dirty state, so clear it so
+ * that it doesn't pollute future transactions.
+ */
+ iip->ili_dirty_flags = 0;
+ return 0;
+}
+
/*
* The logged size of an inode fork is always the current size of the inode
* fork. This means that when an inode fork is relogged, the size of the logged
@@ -662,6 +809,8 @@ xfs_inode_item_committing(
}
static const struct xfs_item_ops xfs_inode_item_ops = {
+ .iop_sort = xfs_inode_item_sort,
+ .iop_precommit = xfs_inode_item_precommit,
.iop_size = xfs_inode_item_size,
.iop_format = xfs_inode_item_format,
.iop_pin = xfs_inode_item_pin,
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index bbd836a44ff0..377e06007804 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -17,6 +17,7 @@ struct xfs_inode_log_item {
struct xfs_log_item ili_item; /* common portion */
struct xfs_inode *ili_inode; /* inode ptr */
unsigned short ili_lock_flags; /* inode lock flags */
+ unsigned int ili_dirty_flags; /* dirty in current tx */
/*
* The ili_lock protects the interactions between the dirty state and
* the flush state of the inode log item. This allows us to do atomic
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 322eb2ee6c55..82c81d20459d 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2711,7 +2711,9 @@ xlog_recover_iunlink_bucket(
* just to flush the inodegc queue and wait for it to
* complete.
*/
- xfs_inodegc_flush(mp);
+ error = xfs_inodegc_flush(mp);
+ if (error)
+ break;
}
prev_agino = agino;
@@ -2719,10 +2721,15 @@ xlog_recover_iunlink_bucket(
}
if (prev_ip) {
+ int error2;
+
ip->i_prev_unlinked = prev_agino;
xfs_irele(prev_ip);
+
+ error2 = xfs_inodegc_flush(mp);
+ if (error2 && !error)
+ return error2;
}
- xfs_inodegc_flush(mp);
return error;
}
@@ -2789,7 +2796,6 @@ xlog_recover_iunlink_ag(
* bucket and remaining inodes on it unreferenced and
* unfreeable.
*/
- xfs_inodegc_flush(pag->pag_mount);
xlog_recover_clear_agi_bucket(pag, bucket);
}
}
@@ -2806,13 +2812,6 @@ xlog_recover_process_iunlinks(
for_each_perag(log->l_mp, agno, pag)
xlog_recover_iunlink_ag(pag);
-
- /*
- * Flush the pending unlinked inodes to ensure that the inactivations
- * are fully completed on disk and the incore inodes can be reclaimed
- * before we signal that recovery is complete.
- */
- xfs_inodegc_flush(log->l_mp);
}
STATIC void
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index aaaf5ec13492..6c09f89534d3 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -62,6 +62,7 @@ struct xfs_error_cfg {
struct xfs_inodegc {
struct llist_head list;
struct delayed_work work;
+ int error;
/* approximate count of inodes in the list */
unsigned int items;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index f5dc46ce9803..abcc559f3c64 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -616,8 +616,10 @@ xfs_reflink_cancel_cow_blocks(
xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
del.br_blockcount);
- xfs_free_extent_later(*tpp, del.br_startblock,
+ error = xfs_free_extent_later(*tpp, del.br_startblock,
del.br_blockcount, NULL);
+ if (error)
+ break;
/* Roll the transaction */
error = xfs_defer_finish(tpp);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 7e706255f165..4120bd1cba90 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1100,6 +1100,7 @@ xfs_inodegc_init_percpu(
#endif
init_llist_head(&gc->list);
gc->items = 0;
+ gc->error = 0;
INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
}
return 0;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8afc0c080861..8c0bfc9a33b1 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -290,7 +290,9 @@ retry:
* Do not perform a synchronous scan because callers can hold
* other locks.
*/
- xfs_blockgc_flush_all(mp);
+ error = xfs_blockgc_flush_all(mp);
+ if (error)
+ return error;
want_retry = false;
goto retry;
}
@@ -970,6 +972,11 @@ __xfs_trans_commit(
error = xfs_defer_finish_noroll(&tp);
if (error)
goto out_unreserve;
+
+ /* Run precommits from final tx in defer chain. */
+ error = xfs_trans_run_precommits(tp);
+ if (error)
+ goto out_unreserve;
}
/*