From 65294c1f2c5e72b15b76e16c8c8cfd9359fc9f6f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:48 -0400 Subject: nfsd: add a new struct file caching facility to nfsd Currently, NFSv2/3 reads and writes have to open a file, do the read or write and then close it again for each RPC. This is highly inefficient, especially when the underlying filesystem has a relatively slow open routine. This patch adds a new open file cache to knfsd. Rather than doing an open for each RPC, the read/write handlers can call into this cache to see if there is one already there for the correct filehandle and NFS_MAY_READ/WRITE flags. If there isn't an entry, then we create a new one and attempt to perform the open. If there is, then we wait until the entry is fully instantiated and return it if it is at the end of the wait. If it's not, then we attempt to take over construction. Since the main goal is to speed up NFSv2/3 I/O, we don't want to close these files on last put of these objects. We need to keep them around for a little while since we never know when the next READ/WRITE will come in. Cache entries have a hardcoded 1s timeout, and we have a recurring workqueue job that walks the cache and purges any entries that have expired. Signed-off-by: Jeff Layton Signed-off-by: Weston Andros Adamson Signed-off-by: Richard Sharpe Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. 
Bruce Fields --- fs/nfsd/vfs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/nfsd/vfs.h') diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index db351247892d..31fdae34e028 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -75,8 +75,11 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, loff_t, unsigned long); #endif /* CONFIG_NFSD_V3 */ +int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); +__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, + int, struct file **); struct raparms; __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, -- cgit v1.2.3 From 501cb1849f865960501d19d54e6a5af306f9b6fd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:56 -0400 Subject: nfsd: rip out the raparms cache The raparms cache was set up in order to ensure that we carry readahead information forward from one RPC call to the next. In other words, it was set up because each RPC call was forced to open a struct file, then close it, causing the loss of readahead information that is normally cached in that struct file, and used to keep the page cache filled when a user calls read() multiple times on the same file descriptor. Now that we cache the struct file, and reuse it for all the I/O calls to a given file by a given user, we no longer have to keep a separate readahead cache. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfssvc.c | 13 +---- fs/nfsd/vfs.c | 149 ------------------------------------------------------- fs/nfsd/vfs.h | 6 --- 3 files changed, 1 insertion(+), 167 deletions(-) (limited to 'fs/nfsd/vfs.h') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a6b1eab7b722..d02712ca2685 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -317,22 +317,12 @@ static int nfsd_startup_generic(int nrservs) ret = nfsd_file_cache_init(); if (ret) goto dec_users; - /* - * Readahead param cache - will no-op if it already exists. - * (Note therefore results will be suboptimal if number of - * threads is modified after nfsd start.) - */ - ret = nfsd_racache_init(2*nrservs); - if (ret) - goto out_file_cache; ret = nfs4_state_start(); if (ret) - goto out_racache; + goto out_file_cache; return 0; -out_racache: - nfsd_racache_shutdown(); out_file_cache: nfsd_file_cache_shutdown(); dec_users: @@ -347,7 +337,6 @@ static void nfsd_shutdown_generic(void) nfs4_state_shutdown(); nfsd_file_cache_shutdown(); - nfsd_racache_shutdown(); } static bool nfsd_needs_lockd(struct nfsd_net *nn) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ec254bff1893..8e2c8f36eba3 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -49,34 +49,6 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP - -/* - * This is a cache of readahead params that help us choose the proper - * readahead strategy. Initially, we set all readahead parameters to 0 - * and let the VFS handle things. - * If you increase the number of cached files very much, you'll need to - * add a hash table here. 
- */ -struct raparms { - struct raparms *p_next; - unsigned int p_count; - ino_t p_ino; - dev_t p_dev; - int p_set; - struct file_ra_state p_ra; - unsigned int p_hindex; -}; - -struct raparm_hbucket { - struct raparms *pb_head; - spinlock_t pb_lock; -} ____cacheline_aligned_in_smp; - -#define RAPARM_HASH_BITS 4 -#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) -#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) -static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; - [... remaining context lines of this hunk and the header of the next hunk were lost in extraction ...] -struct raparms * -nfsd_init_raparms(struct file *file) -{ - struct inode *inode = file_inode(file); - dev_t dev = inode->i_sb->s_dev; - ino_t ino = inode->i_ino; - struct raparms *ra, **rap, **frap = NULL; - int depth = 0; - unsigned int hash; - struct raparm_hbucket *rab; - - hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; - rab = &raparm_hash[hash]; - - spin_lock(&rab->pb_lock); - for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { - if (ra->p_ino == ino && ra->p_dev == dev) - goto found; - depth++; - if (ra->p_count == 0) - frap = rap; - } - depth = nfsdstats.ra_size; - if (!frap) { - spin_unlock(&rab->pb_lock); - return NULL; - } - rap = frap; - ra = *frap; - ra->p_dev = dev; - ra->p_ino = ino; - ra->p_set = 0; - ra->p_hindex = hash; -found: - if (rap != &rab->pb_head) { - *rap = ra->p_next; - ra->p_next = rab->pb_head; - rab->pb_head = ra; - } - ra->p_count++; - nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; - spin_unlock(&rab->pb_lock); - - if (ra->p_set) - file->f_ra = ra->p_ra; - return ra; -} - -void nfsd_put_raparams(struct file *file, struct raparms *ra) -{ - struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; - - spin_lock(&rab->pb_lock); - ra->p_ra = file->f_ra; - ra->p_set = 1; - ra->p_count--; - spin_unlock(&rab->pb_lock); -} - /* * Grab and keep cached pages associated with a file in the svc_rqst * so that they can be passed to the network sendmsg/sendpage routines @@ -2094,63 +2005,3 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, return err? 
nfserrno(err) : 0; } - -void -nfsd_racache_shutdown(void) -{ - struct raparms *raparm, *last_raparm; - unsigned int i; - - dprintk("nfsd: freeing readahead buffers.\n"); - - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - raparm = raparm_hash[i].pb_head; - while(raparm) { - last_raparm = raparm; - raparm = raparm->p_next; - kfree(last_raparm); - } - raparm_hash[i].pb_head = NULL; - } -} -/* - * Initialize readahead param cache - */ -int -nfsd_racache_init(int cache_size) -{ - int i; - int j = 0; - int nperbucket; - struct raparms **raparm = NULL; - - - if (raparm_hash[0].pb_head) - return 0; - nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); - nperbucket = max(2, nperbucket); - cache_size = nperbucket * RAPARM_HASH_SIZE; - - dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); - - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - spin_lock_init(&raparm_hash[i].pb_lock); - - raparm = &raparm_hash[i].pb_head; - for (j = 0; j < nperbucket; j++) { - *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); - if (!*raparm) - goto out_nomem; - raparm = &(*raparm)->p_next; - } - *raparm = NULL; - } - - nfsdstats.ra_size = cache_size; - return 0; - -out_nomem: - dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); - nfsd_racache_shutdown(); - return -ENOMEM; -} diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 31fdae34e028..e0f7792165a6 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -40,8 +40,6 @@ typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned); /* nfsd/vfs.c */ -int nfsd_racache_init(int); -void nfsd_racache_shutdown(void); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, @@ -80,7 +78,6 @@ __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); __be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -struct raparms; __be32 nfsd_splice_read(struct svc_rqst *rqstp, 
struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count); @@ -118,9 +115,6 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); -struct raparms *nfsd_init_raparms(struct file *file); -void nfsd_put_raparams(struct file *file, struct raparms *ra); - static inline int fh_want_write(struct svc_fh *fh) { int ret; -- cgit v1.2.3 From 83a63072c815e8a042c60fa964dcbde2a6df0e87 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Aug 2019 13:03:11 -0400 Subject: nfsd: fix nfs read eof detection Currently, the knfsd server assumes that a short read indicates an end of file. That assumption is incorrect. The short read means that either we've hit the end of file, or we've hit a read error. In the case of a read error, the client may want to retry (as per the implementation recommendations in RFC1813 and RFC7530), but currently it is being told that it hit an eof. Move the code to detect eof from version specific code into the generic nfsd read. Report eof only in the two following cases: 1) read() returns a zero length short read with no error. 2) the offset+length of the read is >= the file size. Signed-off-by: Trond Myklebust Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs3proc.c | 9 ++------- fs/nfsd/nfs4xdr.c | 11 +++-------- fs/nfsd/nfsproc.c | 4 +++- fs/nfsd/vfs.c | 37 ++++++++++++++++++++++++++----------- fs/nfsd/vfs.h | 28 ++++++---------------------- fs/nfsd/xdr3.h | 2 +- 6 files changed, 41 insertions(+), 50 deletions(-) (limited to 'fs/nfsd/vfs.h') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9bc32af4e2da..cea68d8411ac 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -172,13 +172,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp) nfserr = nfsd_read(rqstp, &resp->fh, argp->offset, rqstp->rq_vec, argp->vlen, - &resp->count); - if (nfserr == 0) { - struct inode *inode = d_inode(resp->fh.fh_dentry); - resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset, - inode->i_size); - } - + &resp->count, + &resp->eof); RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c1fc2641e3e7..533d0fc3c96b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3472,7 +3472,7 @@ static __be32 nfsd4_encode_splice_read( len = maxcount; nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, - file, read->rd_offset, &maxcount); + file, read->rd_offset, &maxcount, &eof); read->rd_length = maxcount; if (nfserr) { /* @@ -3484,9 +3484,6 @@ static __be32 nfsd4_encode_splice_read( return nfserr; } - eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, - d_inode(read->rd_fhp->fh_dentry)->i_size); - *(p++) = htonl(eof); *(p++) = htonl(maxcount); @@ -3557,15 +3554,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, len = maxcount; nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, &maxcount); + resp->rqstp->rq_vec, read->rd_vlen, &maxcount, + &eof); read->rd_length = maxcount; if (nfserr) return nfserr; xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); - eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, - d_inode(read->rd_fhp->fh_dentry)->i_size); - tmp = htonl(eof); 
write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); tmp = htonl(maxcount); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0d20fd161225..c83ddac22f38 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -172,6 +172,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) struct nfsd_readargs *argp = rqstp->rq_argp; struct nfsd_readres *resp = rqstp->rq_resp; __be32 nfserr; + u32 eof; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -195,7 +196,8 @@ nfsd_proc_read(struct svc_rqst *rqstp) nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, rqstp->rq_vec, argp->vlen, - &resp->count); + &resp->count, + &eof); if (nfserr) return nfserr; return fh_getattr(&resp->fh, &resp->stat); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0867d5319fdb..bd0a385df3fc 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -834,12 +834,23 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } +static u32 nfsd_eof_on_read(struct file *file, loff_t offset, ssize_t len, + size_t expected) +{ + if (expected != 0 && len == 0) + return 1; + if (offset+len >= i_size_read(file_inode(file))) + return 1; + return 0; +} + static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - unsigned long *count, int host_err) + unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { nfsdstats.io_read += host_err; + *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); trace_nfsd_read_io_done(rqstp, fhp, offset, *count); @@ -851,7 +862,8 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, } __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, unsigned long *count) + struct file *file, loff_t offset, unsigned long *count, + u32 *eof) { struct splice_desc sd = { .len = 0, @@ -859,25 +871,27 @@ __be32 
nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, .pos = offset, .u.data = rqstp, }; - int host_err; + ssize_t host_err; trace_nfsd_read_splice(rqstp, fhp, offset, *count); rqstp->rq_next_page = rqstp->rq_respages + 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); - return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - struct kvec *vec, int vlen, unsigned long *count) + struct kvec *vec, int vlen, unsigned long *count, + u32 *eof) { struct iov_iter iter; - int host_err; + loff_t ppos = offset; + ssize_t host_err; trace_nfsd_read_vector(rqstp, fhp, offset, *count); iov_iter_kvec(&iter, READ, vec, vlen, *count); - host_err = vfs_iter_read(file, &iter, &offset, 0); - return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); + host_err = vfs_iter_read(file, &iter, &ppos, 0); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } /* @@ -984,7 +998,8 @@ out_nfserr: * N.B. 
After this call fhp needs an fh_put */ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) + loff_t offset, struct kvec *vec, int vlen, unsigned long *count, + u32 *eof) { struct nfsd_file *nf; struct file *file; @@ -997,9 +1012,9 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, file = nf->nf_file; if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) - err = nfsd_splice_read(rqstp, fhp, file, offset, count); + err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof); else - err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count); + err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof); nfsd_file_put(nf); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index e0f7792165a6..a13fd9d7e1f5 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -80,13 +80,16 @@ __be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - unsigned long *count); + unsigned long *count, + u32 *eof); __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long *count); + unsigned long *count, + u32 *eof); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, - loff_t, struct kvec *, int, unsigned long *); + loff_t, struct kvec *, int, unsigned long *, + u32 *eof); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *, int); __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -149,23 +152,4 @@ static inline int nfsd_create_is_exclusive(int createmode) || createmode == NFS4_CREATE_EXCLUSIVE4_1; } -static inline bool nfsd_eof_on_read(long requested, long read, - loff_t offset, loff_t size) -{ - /* We assume a short read means eof: */ - if (requested > read) - return true; - /* - * A non-short read might 
also reach end of file. The spec - * still requires us to set eof in that case. - * - * Further operations may have modified the file size since - * the read, so the following check is not atomic with the read. - * We've only seen that cause a problem for a client in the case - * where the read returned a count of 0 without setting eof. - * That case was fixed by the addition of the above check. - */ - return (offset + read >= size); -} - #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 2cb29e961a76..99ff9f403ff1 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -151,7 +151,7 @@ struct nfsd3_readres { __be32 status; struct svc_fh fh; unsigned long count; - int eof; + __u32 eof; }; struct nfsd3_writeres { -- cgit v1.2.3