From 9597152d98840c2517230740952df97cfcc07e2f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 May 2022 16:37:56 -0400 Subject: Revert "pNFS: nfs3_set_ds_client should set NFS_CS_NOPING" This reverts commit c6eb58435b98bd843d3179664a0195ff25adb2c3. If a transport is down, then we want to fail over to other transports if they are listed in the GETDEVICEINFO reply. Fixes: c6eb58435b98 ("pNFS: nfs3_set_ds_client should set NFS_CS_NOPING") Cc: stable@vger.kernel.org # 5.11.x Signed-off-by: Trond Myklebust --- fs/nfs/nfs3client.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 5601e47360c2..b49359afac88 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -108,7 +108,6 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); - __set_bit(NFS_CS_NOPING, &cl_init.init_flags); __set_bit(NFS_CS_DS, &cl_init.init_flags); /* Use the MDS nfs_client cl_ipaddr. */ -- cgit v1.2.3 From 7836d75467e9d214bdf5c693b32721de729a6e38 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 May 2022 16:09:06 -0400 Subject: pNFS/flexfiles: Report RDMA connection errors to the server The RPC/RDMA driver will return -EPROTO and -ENODEV as connection errors under certain circumstances. Make sure that we handle them and report them to the server. If not, we can end up cycling forever in a LAYOUTGET/LAYOUTRETURN loop. Fixes: a12f996d3413 ("NFSv4/pNFS: Use connections to a DS that are all of the same protocol family") Cc: stable@vger.kernel.org # 5.11.x Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 604be402ae13..7d285561e59f 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1131,6 +1131,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, case -EIO: case -ETIMEDOUT: case -EPIPE: + case -EPROTO: + case -ENODEV: dprintk("%s DS connection error %d\n", __func__, task->tk_status); nfs4_delete_deviceid(devid->ld, devid->nfs_client, @@ -1236,6 +1238,8 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -ENOBUFS: case -EPIPE: case -EPERM: + case -EPROTO: + case -ENODEV: *op_status = status = NFS4ERR_NXIO; break; case -EACCES: -- cgit v1.2.3 From 431794e67e238e6fd170499a14fd2abf0a16b5bd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 May 2022 17:08:58 -0400 Subject: pNFS/files: Handle RDMA connection errors correctly The RPC/RDMA driver will return -EPROTO and -ENODEV as connection errors under certain circumstances. Make sure that we handle them correctly and avoid cycling forever in a LAYOUTGET/LAYOUTRETURN loop. Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 2b2661582bbe..ad34a33b0737 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -181,6 +181,8 @@ static int filelayout_async_handle_error(struct rpc_task *task, case -EIO: case -ETIMEDOUT: case -EPIPE: + case -EPROTO: + case -ENODEV: dprintk("%s DS connection error %d\n", __func__, task->tk_status); nfs4_mark_deviceid_unavailable(devid); -- cgit v1.2.3 From 6ca0a6f834ed06b2b4c6d1f7f162f2b0d3e196cf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 27 Jun 2022 16:04:02 -0400 Subject: NFS: Fix case insensitive renames For filesystems that are case insensitive and case preserving, we need to be able to rename from one case folded variant of the filename to another. Currently, if we have looked up the target filename before the call to rename, then we may have a hashed dentry with that target name in the dcache, causing the vfs to optimise away the rename. To avoid that, let's drop the target dentry, and leave it to the server to optimise away the rename if that is the correct thing to do. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 0c4e8dd6aa96..d9d277d7fa84 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1739,6 +1739,10 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, goto out_bad; } + if ((flags & LOOKUP_RENAME_TARGET) && d_count(dentry) < 2 && + nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + goto out_bad; + if (nfs_verifier_is_delegated(dentry)) return nfs_lookup_revalidate_delegated(dir, dentry, inode); -- cgit v1.2.3 From f07a5d2427fc113dc50c5c818eba8929bc27b8ca Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Jul 2022 09:16:04 -0400 Subject: NFSv4.1: Don't decrease the value of seq_nr_highest_sent When we're trying to figure out what the server may or may not have seen in terms of request numbers, do not assume that requests with a larger number were missed, just because we saw a reply to a request with a smaller number. Fixes: 3453d5708b33 ("NFSv4.1: Avoid false retries when RPC calls are interrupted") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bb0e84a46d61..628471d06947 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -784,10 +784,9 @@ static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot, if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0) slot->seq_nr_highest_sent = seqnr; } -static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, - u32 seqnr) +static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, u32 seqnr) { - slot->seq_nr_highest_sent = seqnr; + nfs4_slot_sequence_record_sent(slot, seqnr); slot->seq_nr_last_acked = seqnr; } -- cgit v1.2.3 From 7ccafd4b2b9f34e6d8185f796f151c47424e273e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Jul 2022 09:22:40 -0400 Subject: NFSv4.1: Handle NFS4ERR_DELAY replies to OP_SEQUENCE correctly Don't assume that the NFS4ERR_DELAY means that the server is processing this slot id. Fixes: 3453d5708b33 ("NFSv4.1: Avoid false retries when RPC calls are interrupted") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 628471d06947..4e0dcc19ca71 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -853,7 +853,6 @@ static int nfs41_sequence_process(struct rpc_task *task, __func__, slot->slot_nr, slot->seq_nr); - nfs4_slot_sequence_acked(slot, slot->seq_nr); goto out_retry; case -NFS4ERR_RETRY_UNCACHED_REP: case -NFS4ERR_SEQ_FALSE_RETRY: -- cgit v1.2.3 From f931d8374cad3dc09d0f6e3f76689fdb3f104c1a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 22 Jun 2022 15:58:22 +0200 Subject: nfs/blocklayout: refactor block device opening Deduplicate the helpers to open a device node by passing a name prefix argument and using the same helper for both kinds of paths. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/dev.c | 42 +++++++++++------------------------------- 1 file changed, 11 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 5e56da748b2a..fea5f8821da5 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -301,18 +301,14 @@ bl_validate_designator(struct pnfs_block_volume *v) } } -/* - * Try to open the udev path for the WWN. At least on Debian the udev - * by-id path will always point to the dm-multipath device if one exists. - */ static struct block_device * -bl_open_udev_path(struct pnfs_block_volume *v) +bl_open_path(struct pnfs_block_volume *v, const char *prefix) { struct block_device *bdev; const char *devname; - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN", - v->scsi.designator_len, v->scsi.designator); + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN", + prefix, v->scsi.designator_len, v->scsi.designator); if (!devname) return ERR_PTR(-ENOMEM); @@ -326,28 +322,6 @@ bl_open_udev_path(struct pnfs_block_volume *v) return bdev; } -/* - * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the - * wwn- links will only point to the first discovered SCSI device there. - */ -static struct block_device * -bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v) -{ - struct block_device *bdev; - const char *devname; - - devname = kasprintf(GFP_KERNEL, - "/dev/disk/by-id/dm-uuid-mpath-%d%*phN", - v->scsi.designator_type, - v->scsi.designator_len, v->scsi.designator); - if (!devname) - return ERR_PTR(-ENOMEM); - - bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); - kfree(devname); - return bdev; -} - static int bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) @@ -360,9 +334,15 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, if (!bl_validate_designator(v)) return -EINVAL; - bdev = bl_open_dm_mpath_udev_path(v); + /* + * Try to open the RH/Fedora specific dm-mpath udev path first, as the + * wwn- links will only point to the first discovered SCSI device there. + * On other distributions like Debian, the default SCSI by-id path will + * point to the dm-multipath device if one exists. + */ + bdev = bl_open_path(v, "dm-uuid-mpath-0x"); if (IS_ERR(bdev)) - bdev = bl_open_udev_path(v); + bdev = bl_open_path(v, "wwn-0x"); if (IS_ERR(bdev)) return PTR_ERR(bdev); d->bdev = bdev; -- cgit v1.2.3 From 064109db53ecc5d88621d02f36da9f33ca0d64bd Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Thu, 23 Jun 2022 09:58:58 +0800 Subject: NFS: remove redundant code in nfs_file_write() filemap_fdatawait_range() will always return 0, after patch 6c984083ec24 ("NFS: Use of mapping_set_error() results in spurious errors"), it will not save the wb err in struct address_space->flags: result = filemap_fdatawait_range(file->f_mapping, ...) = 0 filemap_check_errors(mapping) = 0 test_bit(..., &mapping->flags) // flags is 0 Signed-off-by: ChenXiaoSong Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2d72b1b7ed74..54237a231687 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -663,8 +663,6 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) result = filemap_fdatawait_range(file->f_mapping, iocb->ki_pos - written, iocb->ki_pos - 1); - if (result < 0) - goto out; } result = generic_write_sync(iocb, written); if (result < 0) -- cgit v1.2.3 From c77c738c37d0fa8380a671613630298d71099180 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 28 Jun 2022 20:24:26 +0200 Subject: nfs: Replace kmap() with kmap_local_page() The use of kmap() is being deprecated in favor of kmap_local_page(). With kmap_local_page(), the mapping is per thread, CPU local and not globally visible. Furthermore, the mapping can be acquired from any context (including interrupts). Therefore, use kmap_local_page() in nfs_do_filldir() because this mapping is per thread, CPU local, and not globally visible. Suggested-by: Ira Weiny Signed-off-by: Fabio M. De Francesco Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d9d277d7fa84..7b2230297f6b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1084,7 +1084,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, struct nfs_cache_array *array; unsigned int i; - array = kmap(desc->page); + array = kmap_local_page(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; @@ -1110,7 +1110,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, if (array->page_is_eof) desc->eof = !desc->eob; - kunmap(desc->page); + kunmap_local(array); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n", (unsigned long long)desc->dir_cookie); } -- cgit v1.2.3 From 8b4e87a1d68f5ae440c42c15c238fd964fd381d0 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Sat, 2 Jul 2022 08:23:47 +0800 Subject: nfs: fix port value parsing The valid values of nfs options port and mountport are 0 to USHRT_MAX. The fs parser will return a fail for port values that are negative and the sloppy option handling then returns success. But the sloppy option handling is meant to return success for invalid options not valid options with invalid values. Restricting the sloppy option override to handle failure returns for invalid options only is sufficient to resolve this problem. Changes: v2: utilize the return value from fs_parse() to resolve this problem instead of changing the parameter definitions. Suggested-by: Trond Myklebust Signed-off-by: Ian Kent Signed-off-by: Trond Myklebust --- fs/nfs/fs_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 9a16897e8dc6..8f1f9b4af89d 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -484,7 +484,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, opt = fs_parse(fc, nfs_fs_parameters, param, &result); if (opt < 0) - return ctx->sloppy ? 1 : opt; + return (opt == -ENOPARAM && ctx->sloppy) ? 1 : opt; if (fc->security) ctx->has_sec_mnt_opts = 1; -- cgit v1.2.3 From 940261a195080cf1cdcd56948d363fe363b69da1 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 17 Jun 2022 16:23:36 -0400 Subject: NFS: Allow setting rsize / wsize to a multiple of PAGE_SIZE Previously, we required this to value to be a power of 2 for UDP related reasons. This patch keeps the power of 2 rule for UDP but allows more flexibility for TCP and RDMA. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 13 +++++++------ fs/nfs/flexfilelayout/flexfilelayoutdev.c | 6 ++++-- fs/nfs/internal.h | 18 ++++++++++++++++++ fs/nfs/nfs4client.c | 4 ++-- 4 files changed, 31 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index e828504cc396..da8da5cdbbc1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -708,9 +708,9 @@ static int nfs_init_server(struct nfs_server *server, } if (ctx->rsize) - server->rsize = nfs_block_size(ctx->rsize, NULL); + server->rsize = nfs_io_size(ctx->rsize, clp->cl_proto); if (ctx->wsize) - server->wsize = nfs_block_size(ctx->wsize, NULL); + server->wsize = nfs_io_size(ctx->wsize, clp->cl_proto); server->acregmin = ctx->acregmin * HZ; server->acregmax = ctx->acregmax * HZ; @@ -755,18 +755,19 @@ error: static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) { + struct nfs_client *clp = server->nfs_client; unsigned long max_rpc_payload, raw_max_rpc_payload; /* Work out a lot of parameters */ if (server->rsize == 0) - server->rsize = nfs_block_size(fsinfo->rtpref, NULL); + server->rsize = nfs_io_size(fsinfo->rtpref, clp->cl_proto); if (server->wsize == 0) - server->wsize = nfs_block_size(fsinfo->wtpref, NULL); + server->wsize = nfs_io_size(fsinfo->wtpref, clp->cl_proto); if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax) - server->rsize = nfs_block_size(fsinfo->rtmax, NULL); + server->rsize = nfs_io_size(fsinfo->rtmax, clp->cl_proto); if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) - server->wsize = nfs_block_size(fsinfo->wtmax, NULL); + server->wsize = nfs_io_size(fsinfo->wtmax, clp->cl_proto); raw_max_rpc_payload = rpc_max_payload(server->client); max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index bfa7202ca7be..e028f5a0ef5f 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -113,8 +113,10 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_drain_dsaddrs; ds_versions[i].version = be32_to_cpup(p++); ds_versions[i].minor_version = be32_to_cpup(p++); - ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL); - ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL); + ds_versions[i].rsize = nfs_io_size(be32_to_cpup(p++), + server->nfs_client->cl_proto); + ds_versions[i].wsize = nfs_io_size(be32_to_cpup(p++), + server->nfs_client->cl_proto); ds_versions[i].tightly_coupled = be32_to_cpup(p); if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 8f8cd6e2d4db..af6d261241ff 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -704,6 +704,24 @@ unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) return nfs_block_bits(bsize, nrbitsp); } +/* + * Compute and set NFS server rsize / wsize + */ +static inline +unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto) +{ + if (iosize < NFS_MIN_FILE_IO_SIZE) + iosize = NFS_DEF_FILE_IO_SIZE; + else if (iosize >= NFS_MAX_FILE_IO_SIZE) + iosize = NFS_MAX_FILE_IO_SIZE; + else + iosize = iosize & PAGE_MASK; + + if (proto == XPRT_TRANSPORT_UDP) + return nfs_block_bits(iosize, NULL); + return iosize; +} + /* * Determine the maximum file size for a superblock */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 47a6cf892c95..3c5678aec006 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1161,9 +1161,9 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc) return error; if (ctx->rsize) - server->rsize = nfs_block_size(ctx->rsize, NULL); + server->rsize = nfs_io_size(ctx->rsize, server->nfs_client->cl_proto); if (ctx->wsize) - server->wsize = nfs_block_size(ctx->wsize, NULL); + server->wsize = nfs_io_size(ctx->wsize, server->nfs_client->cl_proto); server->acregmin = ctx->acregmin * HZ; server->acregmax = ctx->acregmax * HZ; -- cgit v1.2.3 From 51fd2eb52c0ca8275a906eed81878ef50ae94eb0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 13 Jul 2022 17:46:52 -0400 Subject: NFSv4: Fix races in the legacy idmapper upcall nfs_idmap_instantiate() will cause the process that is waiting in request_key_with_auxdata() to wake up and exit. If there is a second process waiting for the idmap->idmap_mutex, then it may wake up and start a new call to request_key_with_auxdata(). If the call to idmap_pipe_downcall() from the first process has not yet finished calling nfs_idmap_complete_pipe_upcall_locked(), then we may end up triggering the WARN_ON_ONCE() in nfs_idmap_prepare_pipe_upcall(). The fix is to ensure that we clear idmap->idmap_upcall_data before calling nfs_idmap_instantiate(). Fixes: e9ab41b620e4 ("NFSv4: Clean up the legacy idmapper upcall") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4idmap.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index f331866dd418..ec6afd3c4bca 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -561,22 +561,20 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, return true; } -static void -nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) +static void nfs_idmap_complete_pipe_upcall(struct idmap_legacy_upcalldata *data, + int ret) { - struct key *authkey = idmap->idmap_upcall_data->authkey; - - kfree(idmap->idmap_upcall_data); - idmap->idmap_upcall_data = NULL; - complete_request_key(authkey, ret); - key_put(authkey); + complete_request_key(data->authkey, ret); + key_put(data->authkey); + kfree(data); } -static void -nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) +static void nfs_idmap_abort_pipe_upcall(struct idmap *idmap, + struct idmap_legacy_upcalldata *data, + int ret) { - if (idmap->idmap_upcall_data != NULL) - nfs_idmap_complete_pipe_upcall_locked(idmap, ret); + if (cmpxchg(&idmap->idmap_upcall_data, data, NULL) == data) + nfs_idmap_complete_pipe_upcall(data, ret); } static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) @@ -613,7 +611,7 @@ static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) ret = rpc_queue_upcall(idmap->idmap_pipe, msg); if (ret < 0) - nfs_idmap_abort_pipe_upcall(idmap, ret); + nfs_idmap_abort_pipe_upcall(idmap, data, ret); return ret; out2: @@ -669,6 +667,7 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) struct request_key_auth *rka; struct rpc_inode *rpci = RPC_I(file_inode(filp)); struct idmap *idmap = (struct idmap *)rpci->private; + struct idmap_legacy_upcalldata *data; struct key *authkey; struct idmap_msg im; size_t namelen_in; @@ -678,10 +677,11 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) * will have been woken up and someone else may now have used * idmap_key_cons - so after this point we may no longer touch it. */ - if (idmap->idmap_upcall_data == NULL) + data = xchg(&idmap->idmap_upcall_data, NULL); + if (data == NULL) goto out_noupcall; - authkey = idmap->idmap_upcall_data->authkey; + authkey = data->authkey; rka = get_request_key_auth(authkey); if (mlen != sizeof(im)) { @@ -703,18 +703,17 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { ret = -EINVAL; goto out; -} + } - ret = nfs_idmap_read_and_verify_message(&im, - &idmap->idmap_upcall_data->idmap_msg, - rka->target_key, authkey); + ret = nfs_idmap_read_and_verify_message(&im, &data->idmap_msg, + rka->target_key, authkey); if (ret >= 0) { key_set_timeout(rka->target_key, nfs_idmap_cache_timeout); ret = mlen; } out: - nfs_idmap_complete_pipe_upcall_locked(idmap, ret); + nfs_idmap_complete_pipe_upcall(data, ret); out_noupcall: return ret; } @@ -728,7 +727,7 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) struct idmap *idmap = data->idmap; if (msg->errno) - nfs_idmap_abort_pipe_upcall(idmap, msg->errno); + nfs_idmap_abort_pipe_upcall(idmap, data, msg->errno); } static void @@ -736,8 +735,11 @@ idmap_release_pipe(struct inode *inode) { struct rpc_inode *rpci = RPC_I(inode); struct idmap *idmap = (struct idmap *)rpci->private; + struct idmap_legacy_upcalldata *data; - nfs_idmap_abort_pipe_upcall(idmap, -EPIPE); + data = xchg(&idmap->idmap_upcall_data, NULL); + if (data) + nfs_idmap_complete_pipe_upcall(data, -EPIPE); } int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid) -- cgit v1.2.3 From 8efc4bbe84a8bdd26e848ed93a8900fad1b44ca2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Jul 2022 14:12:18 -0400 Subject: nfs: add new nfs_direct_req tracepoint events Add some new tracepoints to the DIO write code. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 45 ++++++++++-------------------------- fs/nfs/internal.h | 33 ++++++++++++++++++++++++++ fs/nfs/nfstrace.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4eb2a8380a28..ad40e81857ee 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -60,44 +60,12 @@ #include "iostat.h" #include "pnfs.h" #include "fscache.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_VFS static struct kmem_cache *nfs_direct_cachep; -struct nfs_direct_req { - struct kref kref; /* release manager */ - - /* I/O parameters */ - struct nfs_open_context *ctx; /* file open context info */ - struct nfs_lock_context *l_ctx; /* Lock context info */ - struct kiocb * iocb; /* controlling i/o request */ - struct inode * inode; /* target file of i/o */ - - /* completion state */ - atomic_t io_count; /* i/os we're waiting for */ - spinlock_t lock; /* protect completion state */ - - loff_t io_start; /* Start offset for I/O */ - ssize_t count, /* bytes actually processed */ - max_count, /* max expected count */ - bytes_left, /* bytes left to be sent */ - error; /* any reported error */ - struct completion completion; /* wait for i/o completion */ - - /* commit state */ - struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ - struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ - struct work_struct work; - int flags; - /* for write */ -#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ -#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ - /* for read */ -#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */ -#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */ -}; - static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops; static void nfs_direct_write_complete(struct nfs_direct_req *dreq); @@ -595,6 +563,8 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) struct nfs_page *req; int status = data->task.tk_status; + trace_nfs_direct_commit_complete(dreq); + if (status < 0) { /* Errors in commit are fatal */ dreq->error = status; @@ -631,6 +601,8 @@ static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, { struct nfs_direct_req *dreq = cinfo->dreq; + trace_nfs_direct_resched_write(dreq); + spin_lock(&dreq->lock); if (dreq->flags != NFS_ODIRECT_DONE) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; @@ -695,6 +667,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) static void nfs_direct_write_complete(struct nfs_direct_req *dreq) { + trace_nfs_direct_write_complete(dreq); queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */ } @@ -705,6 +678,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) struct nfs_page *req = nfs_list_entry(hdr->pages.next); int flags = NFS_ODIRECT_DONE; + trace_nfs_direct_write_completion(dreq); + nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); @@ -759,6 +734,8 @@ static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; + trace_nfs_direct_write_reschedule_io(dreq); + spin_lock(&dreq->lock); if (dreq->error == 0) { dreq->flags = NFS_ODIRECT_RESCHED_WRITES; @@ -799,6 +776,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); + trace_nfs_direct_write_schedule_iovec(dreq); + nfs_pageio_init_write(&desc, inode, ioflags, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index af6d261241ff..0cabc7f07d9a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -877,3 +877,36 @@ static inline void nfs_set_port(struct sockaddr *sap, int *port, rpc_set_port(sap, *port); } + +struct nfs_direct_req { + struct kref kref; /* release manager */ + + /* I/O parameters */ + struct nfs_open_context *ctx; /* file open context info */ + struct nfs_lock_context *l_ctx; /* Lock context info */ + struct kiocb * iocb; /* controlling i/o request */ + struct inode * inode; /* target file of i/o */ + + /* completion state */ + atomic_t io_count; /* i/os we're waiting for */ + spinlock_t lock; /* protect completion state */ + + loff_t io_start; /* Start offset for I/O */ + ssize_t count, /* bytes actually processed */ + max_count, /* max expected count */ + bytes_left, /* bytes left to be sent */ + error; /* any reported error */ + struct completion completion; /* wait for i/o completion */ + + /* commit state */ + struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ + struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ + struct work_struct work; + int flags; + /* for write */ +#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ +#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ + /* for read */ +#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */ +#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */ +}; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 012bd7339862..65388e4a0cd7 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1576,6 +1576,75 @@ TRACE_EVENT(nfs_commit_done, ) ); +#define nfs_show_direct_req_flags(v) \ + __print_flags(v, "|", \ + { NFS_ODIRECT_DO_COMMIT, "DO_COMMIT" }, \ + { NFS_ODIRECT_RESCHED_WRITES, "RESCHED_WRITES" }, \ + { NFS_ODIRECT_SHOULD_DIRTY, "SHOULD DIRTY" }, \ + { NFS_ODIRECT_DONE, "DONE" } ) + +DECLARE_EVENT_CLASS(nfs_direct_req_class, + TP_PROTO( + const struct nfs_direct_req *dreq + ), + + TP_ARGS(dreq), + + TP_STRUCT__entry( + __field(const struct nfs_direct_req *, dreq) + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(int, ref) + __field(loff_t, io_start) + __field(ssize_t, count) + __field(ssize_t, bytes_left) + __field(ssize_t, error) + __field(int, flags) + ), + + TP_fast_assign( + const struct inode *inode = dreq->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = &nfsi->fh; + + __entry->dreq = dreq; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->ref = kref_read(&dreq->kref); + __entry->io_start = dreq->io_start; + __entry->count = dreq->count; + __entry->bytes_left = dreq->bytes_left; + __entry->error = dreq->error; + __entry->flags = dreq->flags; + ), + + TP_printk( + "dreq=%p fileid=%02x:%02x:%llu fhandle=0x%08x ref=%d " + "io_start=%lld count=%zd bytes_left=%zd error=%zd flags=%s", + __entry->dreq, MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->ref, + __entry->io_start, __entry->count, __entry->bytes_left, + __entry->error, nfs_show_direct_req_flags(__entry->flags) + ) +); + +#define DEFINE_NFS_DIRECT_REQ_EVENT(name) \ + DEFINE_EVENT(nfs_direct_req_class, name, \ + TP_PROTO( \ + const struct nfs_direct_req *dreq \ + ), \ + TP_ARGS(dreq)) + +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_commit_complete); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_resched_write); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_complete); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io); + TRACE_EVENT(nfs_fh_to_dentry, TP_PROTO( const struct super_block *sb, -- cgit v1.2.3 From 55051c0ced7d322a169f8603d306ee6ec079f8ae Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Jul 2022 14:12:19 -0400 Subject: nfs: always check dreq->error after a commit When the client gets back a short DIO write, it will then attempt to issue another write to finish the DIO request. If that write then fails (as is often the case in an -ENOSPC situation), then we still may need to issue a COMMIT if the earlier short write was unstable. If that COMMIT then succeeds, then we don't want the client to reschedule the write requests, and to instead just return a short write. Otherwise, we can end up looping over the same DIO write forever. Always consult dreq->error after a successful RPC, even when the flag state is not NFS_ODIRECT_DONE. Link: https://bugzilla.redhat.com/show_bug.cgi?id=2028370 Reported-by: Boyang Xue Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index ad40e81857ee..a47d13296194 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -571,8 +571,9 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) dreq->max_count = 0; dreq->count = 0; dreq->flags = NFS_ODIRECT_DONE; - } else if (dreq->flags == NFS_ODIRECT_DONE) + } else { status = dreq->error; + } nfs_init_cinfo_from_dreq(&cinfo, dreq); -- cgit v1.2.3 From 69d966510d9f5de81588b37d23a9ee8ccc477b23 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Jul 2022 14:12:20 -0400 Subject: nfs: only issue commit in DIO codepath if we have uncommitted data Currently, we try to determine whether to issue a commit based on nfs_write_need_commit which looks at the current verifier. In the case where we got a short write and then tried to follow it up with one that failed, the verifier can't be trusted. What we really want to know is whether the pgio request had any successful writes that came back as UNSTABLE. Add a new flag to the pgio request, and use that to indicate that we've had a successful unstable write. Only issue a commit if that flag is set. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 2 +- fs/nfs/write.c | 48 ++++++++++++++++++++++++++++++------------------ include/linux/nfs_xdr.h | 1 + 3 files changed, 32 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index a47d13296194..86df66bb14c5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -690,7 +690,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) } nfs_direct_count_bytes(dreq, hdr); - if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) { + if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) { if (!dreq->flags) dreq->flags = NFS_ODIRECT_DO_COMMIT; flags = dreq->flags; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1c706465d090..16d166bc4099 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1576,25 +1576,37 @@ static int nfs_writeback_done(struct rpc_task *task, nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count); trace_nfs_writeback_done(task, hdr); - if (hdr->res.verf->committed < hdr->args.stable && - task->tk_status >= 0) { - /* We tried a write call, but the server did not - * commit data to stable storage even though we - * requested it. - * Note: There is a known bug in Tru64 < 5.0 in which - * the server reports NFS_DATA_SYNC, but performs - * NFS_FILE_SYNC. We therefore implement this checking - * as a dprintk() in order to avoid filling syslog. - */ - static unsigned long complain; + if (task->tk_status >= 0) { + enum nfs3_stable_how committed = hdr->res.verf->committed; + + if (committed == NFS_UNSTABLE) { + /* + * We have some uncommitted data on the server at + * this point, so ensure that we keep track of that + * fact irrespective of what later writes do. + */ + set_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags); + } - /* Note this will print the MDS for a DS write */ - if (time_before(complain, jiffies)) { - dprintk("NFS: faulty NFS server %s:" - " (committed = %d) != (stable = %d)\n", - NFS_SERVER(inode)->nfs_client->cl_hostname, - hdr->res.verf->committed, hdr->args.stable); - complain = jiffies + 300 * HZ; + if (committed < hdr->args.stable) { + /* We tried a write call, but the server did not + * commit data to stable storage even though we + * requested it. + * Note: There is a known bug in Tru64 < 5.0 in which + * the server reports NFS_DATA_SYNC, but performs + * NFS_FILE_SYNC. We therefore implement this checking + * as a dprintk() in order to avoid filling syslog. + */ + static unsigned long complain; + + /* Note this will print the MDS for a DS write */ + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", + NFS_SERVER(inode)->nfs_client->cl_hostname, + committed, hdr->args.stable); + complain = jiffies + 300 * HZ; + } } } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 0e3aa0f5f324..e86cf6642d21 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1600,6 +1600,7 @@ enum { NFS_IOHDR_STAT, NFS_IOHDR_RESEND_PNFS, NFS_IOHDR_RESEND_MDS, + NFS_IOHDR_UNSTABLE_WRITES, }; struct nfs_io_completion; -- cgit v1.2.3 From 33ce83ef0bb048be259ff8ae92ad212918f1ef35 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 22 Jul 2022 15:08:24 -0400 Subject: NFS: Replace fs_context-related dprintk() call sites with tracepoints Contributed as part of the long patch series that converts NFS from using dprintk to tracepoints for observability. Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/fs_context.c | 24 +++++++++++++--------- fs/nfs/nfstrace.h | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 8f1f9b4af89d..4da701fd1424 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -21,6 +21,8 @@ #include "nfs.h" #include "internal.h" +#include "nfstrace.h" + #define NFSDBG_FACILITY NFSDBG_MOUNT #if IS_ENABLED(CONFIG_NFS_V3) @@ -284,7 +286,6 @@ static int nfs_verify_server_address(struct sockaddr *addr) } } - dfprintk(MOUNT, "NFS: Invalid IP address specified\n"); return 0; } @@ -378,7 +379,7 @@ static int nfs_parse_security_flavors(struct fs_context *fc, char *string = param->string, *p; int ret; - dfprintk(MOUNT, "NFS: parsing %s=%s option\n", param->key, param->string); + trace_nfs_mount_assign(param->key, string); while ((p = strsep(&string, ":")) != NULL) { if (!*p) @@ -480,7 +481,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, unsigned int len; int ret, opt; - dfprintk(MOUNT, "NFS: parsing nfs mount option '%s'\n", param->key); + trace_nfs_mount_option(param); opt = fs_parse(fc, nfs_fs_parameters, param, &result); if (opt < 0) @@ -683,6 +684,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, return ret; break; case Opt_vers: + trace_nfs_mount_assign(param->key, param->string); ret = nfs_parse_version_string(fc, param->string); if (ret < 0) return ret; @@ -694,6 +696,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_proto: + trace_nfs_mount_assign(param->key, param->string); protofamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { case Opt_xprt_udp6: @@ -729,6 +732,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_mountproto: + trace_nfs_mount_assign(param->key, param->string); mountfamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { case Opt_xprt_udp6: @@ -751,6 +755,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_addr: + trace_nfs_mount_assign(param->key, param->string); len = rpc_pton(fc->net_ns, param->string, param->size, &ctx->nfs_server.address, sizeof(ctx->nfs_server._address)); @@ -759,16 +764,19 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, ctx->nfs_server.addrlen = len; break; case Opt_clientaddr: + trace_nfs_mount_assign(param->key, param->string); kfree(ctx->client_address); ctx->client_address = param->string; param->string = NULL; break; case Opt_mounthost: + trace_nfs_mount_assign(param->key, param->string); kfree(ctx->mount_server.hostname); ctx->mount_server.hostname = param->string; param->string = NULL; break; case Opt_mountaddr: + trace_nfs_mount_assign(param->key, param->string); len = rpc_pton(fc->net_ns, param->string, param->size, &ctx->mount_server.address, sizeof(ctx->mount_server._address)); @@ -846,7 +854,6 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, */ case Opt_sloppy: ctx->sloppy = true; - dfprintk(MOUNT, "NFS: relaxing parsing rules\n"); break; } @@ -879,10 +886,8 @@ static int nfs_parse_source(struct fs_context *fc, size_t len; const char *end; - if (unlikely(!dev_name || !*dev_name)) { - dfprintk(MOUNT, "NFS: device name not specified\n"); + if (unlikely(!dev_name || !*dev_name)) return -EINVAL; - } /* Is the host name protected with square brakcets? */ if (*dev_name == '[') { @@ -922,7 +927,7 @@ static int nfs_parse_source(struct fs_context *fc, if (!ctx->nfs_server.export_path) goto out_nomem; - dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", ctx->nfs_server.export_path); + trace_nfs_mount_path(ctx->nfs_server.export_path); return 0; out_bad_devname: @@ -1116,7 +1121,6 @@ out_no_sec: return nfs_invalf(fc, "NFS: nfs_mount_data version supports only AUTH_SYS"); out_nomem: - dfprintk(MOUNT, "NFS: not enough memory to handle mount options"); return -ENOMEM; out_no_address: @@ -1248,7 +1252,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc, if (IS_ERR(c)) return PTR_ERR(c); ctx->nfs_server.export_path = c; - dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); + trace_nfs_mount_path(c); c = strndup_user(data->client_addr.data, 16); if (IS_ERR(c)) diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 65388e4a0cd7..8bd0c13a7c4b 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1678,6 +1678,65 @@ TRACE_EVENT(nfs_fh_to_dentry, ) ); +TRACE_EVENT(nfs_mount_assign, + TP_PROTO( + const char *option, + const char *value + ), + + TP_ARGS(option, value), + + TP_STRUCT__entry( + __string(option, option) + __string(value, value) + ), + + TP_fast_assign( + __assign_str(option, option); + __assign_str(value, value); + ), + + TP_printk("option %s=%s", + __get_str(option), __get_str(value) + ) +); + +TRACE_EVENT(nfs_mount_option, + TP_PROTO( + const struct fs_parameter *param + ), + + TP_ARGS(param), + + TP_STRUCT__entry( + __string(option, param->key) + ), + + TP_fast_assign( + __assign_str(option, param->key); + ), + + TP_printk("option %s", __get_str(option)) +); + +TRACE_EVENT(nfs_mount_path, + TP_PROTO( + const char *path + ), + + TP_ARGS(path), + + TP_STRUCT__entry( + __string(path, path) + ), + + TP_fast_assign( + __assign_str(path, path); + ), + + TP_printk("path='%s'", __get_str(path)) +); + DECLARE_EVENT_CLASS(nfs_xdr_event, TP_PROTO( const struct xdr_stream *xdr, -- cgit v1.2.3 From d3b00a802c845a6021148ce2e669b5a0b5729959 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Thu, 21 Jul 2022 14:21:34 -0400 Subject: NFS: Replace the READ_PLUS decoding code We now take a 2-step process that allows us to place data and hole segments directly at their final position in the xdr_stream without needing to do a bunch of redundant copies to expand holes. Due to the variable lengths of each segment, the xdr metadata might cross page boundaries which I account for by setting a small scratch buffer so xdr_inline_decode() won't fail. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 170 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 88 insertions(+), 82 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 271e5f92ed01..b56f05113d36 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1025,82 +1025,95 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re return decode_op_hdr(xdr, OP_DEALLOCATE); } -static int decode_read_plus_data(struct xdr_stream *xdr, - struct nfs_pgio_args *args, - struct nfs_pgio_res *res) -{ - uint32_t count, recvd; +struct read_plus_segment { + enum data_content4 type; uint64_t offset; - __be32 *p; - - p = xdr_inline_decode(xdr, 8 + 4); - if (!p) - return 1; + union { + struct { + uint64_t length; + } hole; + + struct { + uint32_t length; + unsigned int from; + } data; + }; +}; - p = xdr_decode_hyper(p, &offset); - count = be32_to_cpup(p); - recvd = xdr_align_data(xdr, res->count, xdr_align_size(count)); - if (recvd > count) - recvd = count; - if (res->count + recvd > args->count) { - if (args->count > res->count) - res->count += args->count - res->count; - return 1; - } - res->count += recvd; - if (count > recvd) - return 1; - return 0; +static inline uint64_t read_plus_segment_length(struct read_plus_segment *seg) +{ + return seg->type == NFS4_CONTENT_DATA ? seg->data.length : seg->hole.length; } -static int decode_read_plus_hole(struct xdr_stream *xdr, - struct nfs_pgio_args *args, - struct nfs_pgio_res *res, uint32_t *eof) +static int decode_read_plus_segment(struct xdr_stream *xdr, + struct read_plus_segment *seg) { - uint64_t offset, length, recvd; __be32 *p; - p = xdr_inline_decode(xdr, 8 + 8); + p = xdr_inline_decode(xdr, 4); if (!p) - return 1; - - p = xdr_decode_hyper(p, &offset); - p = xdr_decode_hyper(p, &length); - if (offset != args->offset + res->count) { - /* Server returned an out-of-sequence extent */ - if (offset > args->offset + res->count || - offset + length < args->offset + res->count) { - dprintk("NFS: server returned out of sequence extent: " - "offset/size = %llu/%llu != expected %llu\n", - (unsigned long long)offset, - (unsigned long long)length, - (unsigned long long)(args->offset + - res->count)); - return 1; - } - length -= args->offset + res->count - offset; - } - if (length + res->count > args->count) { - *eof = 0; - if (unlikely(res->count >= args->count)) - return 1; - length = args->count - res->count; - } - recvd = xdr_expand_hole(xdr, res->count, length); - res->count += recvd; + return -EIO; + seg->type = be32_to_cpup(p++); + + p = xdr_inline_decode(xdr, seg->type == NFS4_CONTENT_DATA ? 12 : 16); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &seg->offset); + + if (seg->type == NFS4_CONTENT_DATA) { + struct xdr_buf buf; + uint32_t len = be32_to_cpup(p); + + seg->data.length = len; + seg->data.from = xdr_stream_pos(xdr); - if (recvd < length) - return 1; + if (!xdr_stream_subsegment(xdr, &buf, xdr_align_size(len))) + return -EIO; + } else if (seg->type == NFS4_CONTENT_HOLE) { + xdr_decode_hyper(p, &seg->hole.length); + } else + return -EINVAL; return 0; } +static int process_read_plus_segment(struct xdr_stream *xdr, + struct nfs_pgio_args *args, + struct nfs_pgio_res *res, + struct read_plus_segment *seg) +{ + unsigned long offset = seg->offset; + unsigned long length = read_plus_segment_length(seg); + unsigned int bufpos; + + if (offset + length < args->offset) + return 0; + else if (offset > args->offset + args->count) { + res->eof = 0; + return 0; + } else if (offset < args->offset) { + length -= (args->offset - offset); + offset = args->offset; + } else if (offset + length > args->offset + args->count) { + length = (args->offset + args->count) - offset; + res->eof = 0; + } + + bufpos = xdr->buf->head[0].iov_len + (offset - args->offset); + if (seg->type == NFS4_CONTENT_HOLE) + return xdr_stream_zero(xdr, bufpos, length); + else + return xdr_stream_move_subsegment(xdr, seg->data.from, bufpos, length); +} + static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) { struct nfs_pgio_header *hdr = container_of(res, struct nfs_pgio_header, res); struct nfs_pgio_args *args = &hdr->args; - uint32_t eof, segments, type; + uint32_t segments; + struct read_plus_segment *segs; int status, i; + char scratch_buf[16]; __be32 *p; status = decode_op_hdr(xdr, OP_READ_PLUS); @@ -1112,38 +1125,31 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) return -EIO; res->count = 0; - eof = be32_to_cpup(p++); + res->eof = be32_to_cpup(p++); segments = be32_to_cpup(p++); if (segments == 0) - goto out; - - for (i = 0; i < segments; i++) { - p = xdr_inline_decode(xdr, 4); - if (!p) - goto early_out; + return status; - type = be32_to_cpup(p++); - if (type == NFS4_CONTENT_DATA) - status = decode_read_plus_data(xdr, args, res); - else if (type == NFS4_CONTENT_HOLE) - status = decode_read_plus_hole(xdr, args, res, &eof); - else - return -EINVAL; + segs = kmalloc_array(segments, sizeof(*segs), GFP_KERNEL); + if (!segs) + return -ENOMEM; + xdr_set_scratch_buffer(xdr, &scratch_buf, 32); + status = -EIO; + for (i = 0; i < segments; i++) { + status = decode_read_plus_segment(xdr, &segs[i]); if (status < 0) - return status; - if (status > 0) - goto early_out; + goto out; } + xdr_set_pagelen(xdr, xdr_align_size(args->count)); + for (i = segments; i > 0; i--) + res->count += process_read_plus_segment(xdr, args, res, &segs[i-1]); + status = 0; + out: - res->eof = eof; - return 0; -early_out: - if (unlikely(!i)) - return -EIO; - res->eof = 0; - return 0; + kfree(segs); + return status; } static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) -- cgit v1.2.3 From 88363d3e9db66e697fd0198cc2c1785377f2459a Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 Jul 2022 09:32:23 -0400 Subject: NFSv4.1 offline trunkable transports on DESTROY_SESSION When session is destroy, some of the transports might no longer be valid trunks for the new session. Offline existing transports. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4e0dcc19ca71..3f4e84e9646e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9291,6 +9291,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, if (status) dprintk("NFS: Got error %d from the server on DESTROY_SESSION. " "Session has been destroyed regardless...\n", status); + rpc_clnt_manage_trunked_xprts(session->clp->cl_rpcclient); return status; } -- cgit v1.2.3 From e818bd085baf18cc3271c0f5549d9f5a7069efba Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 Jul 2022 09:32:27 -0400 Subject: NFSv4.1 remove xprt from xprt_switch if session trunking test fails If we are doing a session trunking test and it fails for the transport, then remove this transport from the xprt_switch group. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3f4e84e9646e..4850e29904e6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8922,6 +8922,9 @@ void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, if (status == 0) rpc_clnt_xprt_switch_add_xprt(clnt, xprt); + else if (rpc_clnt_xprt_switch_has_addr(clnt, + (struct sockaddr *)&xprt->addr)) + rpc_clnt_xprt_switch_remove_xprt(clnt, xprt); rpc_put_task(task); } -- cgit v1.2.3 From f201bdfd7c87967480000db8974f683c14aa6eb2 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 Jul 2022 09:32:31 -0400 Subject: NFSv4.1 probe offline transports for trunking on session creation Once the session is established call into the SUNRPC layer to check if any offlined trunking connections should be re-enabled. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4850e29904e6..5f59de55ac84 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9249,6 +9249,13 @@ int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred) int status; unsigned *ptr; struct nfs4_session *session = clp->cl_session; + struct nfs4_add_xprt_data xprtdata = { + .clp = clp, + }; + struct rpc_add_xprt_test rpcdata = { + .add_xprt_test = clp->cl_mvops->session_trunk, + .data = &xprtdata, + }; dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); @@ -9265,6 +9272,7 @@ int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred) ptr = (unsigned *)&session->sess_id.data[0]; dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__, clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]); + rpc_clnt_probe_trunked_xprts(clp->cl_rpcclient, &rpcdata); out: return status; } -- cgit v1.2.3 From e35a5e782f67ed76a65ad0f23a484444a95f000f Mon Sep 17 00:00:00 2001 From: Zhang Xianwei Date: Wed, 27 Jul 2022 18:01:07 +0800 Subject: NFSv4.1: RECLAIM_COMPLETE must handle EACCES A client should be able to handle getting an EACCES error while doing a mount operation to reclaim state due to NFS4CLNT_RECLAIM_REBOOT being set. If the server returns RPC_AUTH_BADCRED because authentication failed when we execute "exportfs -au", then RECLAIM_COMPLETE will go a wrong way. After mount succeeds, all OPEN call will fail due to an NFS4ERR_GRACE error being returned. This patch is to fix it by resending a RPC request. Signed-off-by: Zhang Xianwei Signed-off-by: Yi Wang Fixes: aa5190d0ed7d ("NFSv4: Kill nfs4_async_handle_error() abuses by NFSv4.1") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5f59de55ac84..2d7c14ade193 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9487,6 +9487,9 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf rpc_delay(task, NFS4_POLL_RETRY_MAX); fallthrough; case -NFS4ERR_RETRY_UNCACHED_REP: + case -EACCES: + dprintk("%s: failed to reclaim complete error %d for server %s, retrying\n", + __func__, task->tk_status, clp->cl_hostname); return -EAGAIN; case -NFS4ERR_BADSESSION: case -NFS4ERR_DEADSESSION: -- cgit v1.2.3 From b1a28f2eb9ea7a5a1763fe53fe699aa0feae4231 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Aug 2022 14:16:51 -0400 Subject: NFS: nfs_async_write_reschedule_io must not recurse into the writeback code It is not safe to call filemap_fdatawrite_range() from nfs_async_write_reschedule_io(), since we're often calling from a page reclaim context. Just let fsync() redrive the writeback for us. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 16d166bc4099..4adf2b488da1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1444,8 +1444,6 @@ static void nfs_async_write_error(struct list_head *head, int error) static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr) { nfs_async_write_error(&hdr->pages, 0); - filemap_fdatawrite_range(hdr->inode->i_mapping, hdr->args.offset, - hdr->args.offset + hdr->args.count - 1); } static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { -- cgit v1.2.3 From 2135e5d56278ffdb1c2e6d325dc6b87f669b9dac Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Aug 2022 15:48:50 -0400 Subject: NFSv4/pnfs: Fix a use-after-free bug in open If someone cancels the open RPC call, then we must not try to free either the open slot or the layoutget operation arguments, since they are likely still in use by the hung RPC call. Fixes: 6949493884fe ("NFSv4: Don't hold the layoutget locks across multiple RPC calls") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2d7c14ade193..3ed14a2a84a4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3096,12 +3096,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, } out: - if (opendata->lgp) { - nfs4_lgopen_release(opendata->lgp); - opendata->lgp = NULL; - } - if (!opendata->cancelled) + if (!opendata->cancelled) { + if (opendata->lgp) { + nfs4_lgopen_release(opendata->lgp); + opendata->lgp = NULL; + } nfs4_sequence_free_slot(&opendata->o_res.seq_res); + } return ret; } -- cgit v1.2.3 From 3c59366c207e4c6c6569524af606baf017a55c61 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 Aug 2022 10:33:34 +1000 Subject: NFS: don't unhash dentry during unlink/rename NFS unlink() (and rename over existing target) must determine if the file is open, and must perform a "silly rename" instead of an unlink (or before rename) if it is. Otherwise the client might hold a file open which has been removed on the server. Consequently if it determines that the file isn't open, it must block any subsequent opens until the unlink/rename has been completed on the server. This is currently achieved by unhashing the dentry. This forces any open attempt to the slow-path for lookup which will block on i_rwsem on the directory until the unlink/rename completes. A future patch will change the VFS to only get a shared lock on i_rwsem for unlink, so this will no longer work. Instead we introduce an explicit interlock. A special value is stored in dentry->d_fsdata while the unlink/rename is running and ->d_revalidate blocks while that value is present. When ->d_revalidate unblocks, the dentry will be invalid. This closes the race without requiring exclusion on i_rwsem. d_fsdata is already used in two different ways. 1/ an IS_ROOT directory dentry might have a "devname" stored in d_fsdata. Such a dentry doesn't have a name and so cannot be the target of unlink or rename. For safety we check if an old devname is still stored, and remove it if it is. 2/ a dentry with DCACHE_NFSFS_RENAMED set will have a 'struct nfs_unlinkdata' stored in d_fsdata. While this is set maydelete() will fail, so an unlink or rename will never proceed on such a dentry. Neither of these can be in effect when a dentry is the target of unlink or rename. So we can expect d_fsdata to be NULL, and store a special value ((void*)1) which is given the name NFS_FSDATA_BLOCKED to indicate that any lookup will be blocked. The d_count() is incremented under d_lock() when a lookup finds the dentry, so we check d_count() is low, and set NFS_FSDATA_BLOCKED under the same lock to avoid any races. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 72 +++++++++++++++++++++++++++++++++++++------------- include/linux/nfs_fs.h | 9 +++++++ 2 files changed, 63 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7b2230297f6b..dbab3caa15ed 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1782,6 +1782,8 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, int ret; if (flags & LOOKUP_RCU) { + if (dentry->d_fsdata == NFS_FSDATA_BLOCKED) + return -ECHILD; parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) @@ -1790,6 +1792,9 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; } else { + /* Wait for unlink to complete */ + wait_var_event(&dentry->d_fsdata, + dentry->d_fsdata != NFS_FSDATA_BLOCKED); parent = dget_parent(dentry); ret = reval(d_inode(parent), dentry, flags); dput(parent); @@ -2458,7 +2463,6 @@ out: int nfs_unlink(struct inode *dir, struct dentry *dentry) { int error; - int need_rehash = 0; dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, dir->i_ino, dentry); @@ -2473,15 +2477,25 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) error = nfs_sillyrename(dir, dentry); goto out; } - if (!d_unhashed(dentry)) { - __d_drop(dentry); - need_rehash = 1; - } + /* We must prevent any concurrent open until the unlink + * completes. ->d_revalidate will wait for ->d_fsdata + * to clear. We set it here to ensure no lookup succeeds until + * the unlink is complete on the server. + */ + error = -ETXTBSY; + if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) || + WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED)) + goto out; + if (dentry->d_fsdata) + /* old devname */ + kfree(dentry->d_fsdata); + dentry->d_fsdata = NFS_FSDATA_BLOCKED; + spin_unlock(&dentry->d_lock); error = nfs_safe_remove(dentry); nfs_dentry_remove_handle_error(dir, dentry, error); - if (need_rehash) - d_rehash(dentry); + dentry->d_fsdata = NULL; + wake_up_var(&dentry->d_fsdata); out: trace_nfs_unlink_exit(dir, dentry, error); return error; @@ -2588,6 +2602,15 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL_GPL(nfs_link); +static void +nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) +{ + struct dentry *new_dentry = data->new_dentry; + + new_dentry->d_fsdata = NULL; + wake_up_var(&new_dentry->d_fsdata); +} + /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -2618,8 +2641,9 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct dentry *dentry = NULL, *rehash = NULL; + struct dentry *dentry = NULL; struct rpc_task *task; + bool must_unblock = false; int error = -EBUSY; if (flags) @@ -2637,18 +2661,27 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, * the new target. */ if (new_inode && !S_ISDIR(new_inode->i_mode)) { - /* - * To prevent any new references to the target during the - * rename, we unhash the dentry in advance. + /* We must prevent any concurrent open until the unlink + * completes. ->d_revalidate will wait for ->d_fsdata + * to clear. We set it here to ensure no lookup succeeds until + * the unlink is complete on the server. */ - if (!d_unhashed(new_dentry)) { - d_drop(new_dentry); - rehash = new_dentry; + error = -ETXTBSY; + if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) || + WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED)) + goto out; + if (new_dentry->d_fsdata) { + /* old devname */ + kfree(new_dentry->d_fsdata); + new_dentry->d_fsdata = NULL; } + spin_lock(&new_dentry->d_lock); if (d_count(new_dentry) > 2) { int err; + spin_unlock(&new_dentry->d_lock); + /* copy the target dentry's name */ dentry = d_alloc(new_dentry->d_parent, &new_dentry->d_name); @@ -2661,14 +2694,19 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, goto out; new_dentry = dentry; - rehash = NULL; new_inode = NULL; + } else { + new_dentry->d_fsdata = NFS_FSDATA_BLOCKED; + must_unblock = true; + spin_unlock(&new_dentry->d_lock); } + } if (S_ISREG(old_inode->i_mode)) nfs_sync_inode(old_inode); - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, + must_unblock ? nfs_unblock_rename : NULL); if (IS_ERR(task)) { error = PTR_ERR(task); goto out; @@ -2692,8 +2730,6 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, spin_unlock(&old_inode->i_lock); } out: - if (rehash) - d_rehash(rehash); trace_nfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, error); if (!error) { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index a17c337dbdf1..b32ed68e7dc4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -617,6 +617,15 @@ nfs_fileid_to_ino_t(u64 fileid) #define NFS_JUKEBOX_RETRY_TIME (5 * HZ) +/* We need to block new opens while a file is being unlinked. + * If it is opened *before* we decide to unlink, we will silly-rename + * instead. If it is opened *after*, then we need to create or will fail. + * If we allow the two to race, we could end up with a file that is open + * but deleted on the server resulting in ESTALE. + * So use ->d_fsdata to record when the unlink is happening + * and block dentry revalidation while it is set. + */ +#define NFS_FSDATA_BLOCKED ((void*)1) # undef ifdebug # ifdef NFS_DEBUG -- cgit v1.2.3 From af887e437bb298752b2edc5834048b8151b8aea0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 9 Aug 2022 12:50:28 -0400 Subject: NFS: Improve write error tracing Don't leak request pointers, but use the "device:inode" labelling that is used by all the other trace points. Furthermore, replace use of page indexes with an offset, again in order to align behaviour with other NFS trace points. Signed-off-by: Trond Myklebust --- fs/nfs/nfstrace.h | 36 +++++++++++++++++++++--------------- fs/nfs/write.c | 8 +++++--- include/linux/nfs_page.h | 3 +-- 3 files changed, 27 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 8bd0c13a7c4b..731eecfdf49a 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1447,44 +1447,50 @@ TRACE_EVENT(nfs_writeback_done, DECLARE_EVENT_CLASS(nfs_page_error_class, TP_PROTO( + const struct inode *inode, const struct nfs_page *req, int error ), - TP_ARGS(req, error), + TP_ARGS(inode, req, error), TP_STRUCT__entry( - __field(const void *, req) - __field(pgoff_t, index) - __field(unsigned int, offset) - __field(unsigned int, pgbase) - __field(unsigned int, bytes) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(unsigned int, count) __field(int, error) ), TP_fast_assign( - __entry->req = req; - __entry->index = req->wb_index; - __entry->offset = req->wb_offset; - __entry->pgbase = req->wb_pgbase; - __entry->bytes = req->wb_bytes; + const struct nfs_inode *nfsi = NFS_I(inode); + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->offset = req_offset(req); + __entry->count = req->wb_bytes; __entry->error = error; ), TP_printk( - "req=%p index=%lu offset=%u pgbase=%u bytes=%u error=%d", - __entry->req, __entry->index, __entry->offset, - __entry->pgbase, __entry->bytes, __entry->error + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u", __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->offset, + __entry->count ) ); #define DEFINE_NFS_PAGEERR_EVENT(name) \ DEFINE_EVENT(nfs_page_error_class, name, \ TP_PROTO( \ + const struct inode *inode, \ const struct nfs_page *req, \ int error \ ), \ - TP_ARGS(req, error)) + TP_ARGS(inode, req, error)) DEFINE_NFS_PAGEERR_EVENT(nfs_write_error); DEFINE_NFS_PAGEERR_EVENT(nfs_comp_error); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4adf2b488da1..4a3796811b4b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -592,7 +592,8 @@ nfs_lock_and_join_requests(struct page *page) static void nfs_write_error(struct nfs_page *req, int error) { - trace_nfs_write_error(req, error); + trace_nfs_write_error(page_file_mapping(req->wb_page)->host, req, + error); nfs_mapping_set_error(req->wb_page, error); nfs_inode_remove_request(req); nfs_end_page_writeback(req); @@ -1000,7 +1001,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) nfs_list_remove_request(req); if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes < bytes)) { - trace_nfs_comp_error(req, hdr->error); + trace_nfs_comp_error(hdr->inode, req, hdr->error); nfs_mapping_set_error(req->wb_page, hdr->error); goto remove_req; } @@ -1882,7 +1883,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) (long long)req_offset(req)); if (status < 0) { if (req->wb_page) { - trace_nfs_commit_error(req, status); + trace_nfs_commit_error(data->inode, req, + status); nfs_mapping_set_error(req->wb_page, status); nfs_inode_remove_request(req); } diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index f0373a6cb5fb..ba7e2e4b0926 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -202,8 +202,7 @@ nfs_list_entry(struct list_head *head) return list_entry(head, struct nfs_page, wb_list); } -static inline -loff_t req_offset(struct nfs_page *req) +static inline loff_t req_offset(const struct nfs_page *req) { return (((loff_t)req->wb_index) << PAGE_SHIFT) + req->wb_offset; } -- cgit v1.2.3 From b313eb91521872284c0e395773fc6e9827fb1446 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 9 Aug 2022 13:46:41 -0400 Subject: NFS: Improve O_DIRECT tracing Switch the formatting to match the other NFS tracepoints. Signed-off-by: Trond Myklebust --- fs/nfs/nfstrace.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 731eecfdf49a..8e87cf8e5e78 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1597,12 +1597,10 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class, TP_ARGS(dreq), TP_STRUCT__entry( - __field(const struct nfs_direct_req *, dreq) __field(dev_t, dev) __field(u64, fileid) __field(u32, fhandle) - __field(int, ref) - __field(loff_t, io_start) + __field(loff_t, offset) __field(ssize_t, count) __field(ssize_t, bytes_left) __field(ssize_t, error) @@ -1614,12 +1612,10 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class, const struct nfs_inode *nfsi = NFS_I(inode); const struct nfs_fh *fh = &nfsi->fh; - __entry->dreq = dreq; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(fh); - __entry->ref = kref_read(&dreq->kref); - __entry->io_start = dreq->io_start; + __entry->offset = dreq->io_start; __entry->count = dreq->count; __entry->bytes_left = dreq->bytes_left; __entry->error = dreq->error; @@ -1627,13 +1623,14 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class, ), TP_printk( - "dreq=%p fileid=%02x:%02x:%llu fhandle=0x%08x ref=%d " - "io_start=%lld count=%zd bytes_left=%zd error=%zd flags=%s", - __entry->dreq, MAJOR(__entry->dev), MINOR(__entry->dev), + "error=%zd fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zd bytes_left=%zd flags=%s", + __entry->error, MAJOR(__entry->dev), + MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle, __entry->ref, - __entry->io_start, __entry->count, __entry->bytes_left, - __entry->error, nfs_show_direct_req_flags(__entry->flags) + __entry->fhandle, __entry->offset, + __entry->count, __entry->bytes_left, + nfs_show_direct_req_flags(__entry->flags) ) ); -- cgit v1.2.3 From 3fa5cbdc44de190f2c5605ba7db015ae0d26f668 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 9 Aug 2022 13:59:09 -0400 Subject: NFS: Improve readpage/writepage tracing Switch formatting to better match that used by other NFS tracepoints. Signed-off-by: Trond Myklebust --- fs/nfs/nfstrace.h | 54 ++++++++++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 8e87cf8e5e78..8c6cc58679ff 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1137,7 +1137,7 @@ TRACE_EVENT(nfs_readpage_done, __field(u32, arg_count) __field(u32, res_count) __field(bool, eof) - __field(int, status) + __field(int, error) ), TP_fast_assign( @@ -1146,7 +1146,7 @@ TRACE_EVENT(nfs_readpage_done, const struct nfs_fh *fh = hdr->args.fh ? hdr->args.fh : &nfsi->fh; - __entry->status = task->tk_status; + __entry->error = task->tk_status; __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; __entry->res_count = hdr->res.count; @@ -1157,14 +1157,13 @@ TRACE_EVENT(nfs_readpage_done, ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%u res=%u status=%d%s", + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, - __entry->res_count, __entry->status, - __entry->eof ? " eof" : "" + __entry->res_count, __entry->eof ? " eof" : "" ) ); @@ -1184,7 +1183,7 @@ TRACE_EVENT(nfs_readpage_short, __field(u32, arg_count) __field(u32, res_count) __field(bool, eof) - __field(int, status) + __field(int, error) ), TP_fast_assign( @@ -1193,7 +1192,7 @@ TRACE_EVENT(nfs_readpage_short, const struct nfs_fh *fh = hdr->args.fh ? hdr->args.fh : &nfsi->fh; - __entry->status = task->tk_status; + __entry->error = task->tk_status; __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; __entry->res_count = hdr->res.count; @@ -1204,14 +1203,13 @@ TRACE_EVENT(nfs_readpage_short, ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%u res=%u status=%d%s", + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, - __entry->res_count, __entry->status, - __entry->eof ? " eof" : "" + __entry->res_count, __entry->eof ? " eof" : "" ) ); @@ -1323,7 +1321,7 @@ TRACE_EVENT(nfs_pgio_error, __field(u32, arg_count) __field(u32, res_count) __field(loff_t, pos) - __field(int, status) + __field(int, error) ), TP_fast_assign( @@ -1332,7 +1330,7 @@ TRACE_EVENT(nfs_pgio_error, const struct nfs_fh *fh = hdr->args.fh ? hdr->args.fh : &nfsi->fh; - __entry->status = error; + __entry->error = error; __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; __entry->res_count = hdr->res.count; @@ -1341,12 +1339,12 @@ TRACE_EVENT(nfs_pgio_error, __entry->fhandle = nfs_fhandle_hash(fh); ), - TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%u res=%u pos=%llu status=%d", + TP_printk("error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u pos=%llu", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, __entry->res_count, - __entry->pos, __entry->status + __entry->pos ) ); @@ -1406,7 +1404,7 @@ TRACE_EVENT(nfs_writeback_done, __field(loff_t, offset) __field(u32, arg_count) __field(u32, res_count) - __field(int, status) + __field(int, error) __field(unsigned long, stable) __array(char, verifier, NFS4_VERIFIER_SIZE) ), @@ -1418,7 +1416,7 @@ TRACE_EVENT(nfs_writeback_done, hdr->args.fh : &nfsi->fh; const struct nfs_writeverf *verf = hdr->res.verf; - __entry->status = task->tk_status; + __entry->error = task->tk_status; __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; __entry->res_count = hdr->res.count; @@ -1432,14 +1430,14 @@ TRACE_EVENT(nfs_writeback_done, ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%u res=%u status=%d stable=%s " - "verifier=%s", + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u stable=%s " + "verifier=%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, - __entry->res_count, __entry->status, + __entry->res_count, show_nfs_stable_how(__entry->stable), show_nfs4_verifier(__entry->verifier) ) @@ -1547,7 +1545,7 @@ TRACE_EVENT(nfs_commit_done, __field(u32, fhandle) __field(u64, fileid) __field(loff_t, offset) - __field(int, status) + __field(int, error) __field(unsigned long, stable) __array(char, verifier, NFS4_VERIFIER_SIZE) ), @@ -1559,7 +1557,7 @@ TRACE_EVENT(nfs_commit_done, data->args.fh : &nfsi->fh; const struct nfs_writeverf *verf = data->res.verf; - __entry->status = task->tk_status; + __entry->error = task->tk_status; __entry->offset = data->args.offset; __entry->stable = verf->committed; memcpy(__entry->verifier, @@ -1571,12 +1569,12 @@ TRACE_EVENT(nfs_commit_done, ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld status=%d stable=%s verifier=%s", + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld stable=%s verifier=%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, - (long long)__entry->offset, __entry->status, + (long long)__entry->offset, show_nfs_stable_how(__entry->stable), show_nfs4_verifier(__entry->verifier) ) -- cgit v1.2.3