From 2fcc213a18644610c79edbb5e847d73c6c5d5ded Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 3 Aug 2015 13:04:26 -0400
Subject: xprtrdma: Fix large NFS SYMLINK calls

Repair how rpcrdma_marshal_req() chooses which RDMA message type
to use for large non-WRITE operations so that it picks RDMA_NOMSG
in the correct situations, and sets up the marshaling logic to
SEND only the RPC/RDMA header.

Large NFSv2 SYMLINK requests now use RDMA_NOMSG calls. The Linux NFS
server XDR decoder for NFSv2 SYMLINK does not handle having the
pathname argument arrive in a separate buffer. The decoder could be
fixed, but this is simpler and RDMA_NOMSG can be used in a variety
of other situations.

Ensure that the Linux client continues to use "RDMA_MSG + read
list" when sending large NFSv3 SYMLINK requests, which is more
efficient than using RDMA_NOMSG.

Large NFSv4 CREATE(NF4LNK) requests are changed to use "RDMA_MSG +
read list" just like NFSv3 (see Section 5 of RFC 5667). Before,
these did not work at all.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Devesh Sharma <devesh.sharma@avagotech.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/nfs3xdr.c | 1 +
 fs/nfs/nfs4xdr.c | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9b04c2e6fffc..267126d32ec0 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
 {
 	encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
 	encode_symlinkdata3(xdr, args);
+	xdr->buf->flags |= XDRBUF_WRITE;
 }
 
 /*
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 558cd65dbdb7..c42459e45f62 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1154,7 +1154,9 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 	case NF4LNK:
 		p = reserve_space(xdr, 4);
 		*p = cpu_to_be32(create->u.symlink.len);
-		xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
+		xdr_write_pages(xdr, create->u.symlink.pages, 0,
+				create->u.symlink.len);
+		xdr->buf->flags |= XDRBUF_WRITE;
 		break;
 
 	case NF4BLK: case NF4CHR:
-- 
cgit v1.2.3


From 86d80f973434de24d8a807a92cd59d5ced7bd519 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 31 Jul 2015 16:24:30 -0400
Subject: NFSv4.1/pnfs: Fix atomicity of commit list updates

pnfs_layout_mark_request_commit() needs to ensure that it adds the
request to the commit list atomically with all the other updates
in order to prevent corruption to buckets[ds_commit_idx].wlseg
due to races with pnfs_generic_clear_request_commit().

Fixes: 338d00cfef07d ("pnfs: Refactor the *_layout_mark_request_commit...")
Cc: stable@vger.kernel.org # v4.0+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/internal.h | 15 ++++++++++-----
 fs/nfs/pnfs_nfs.c |  5 +++--
 fs/nfs/write.c    | 29 ++++++++++++++++++++++++-----
 3 files changed, 37 insertions(+), 12 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9b372b845f6a..1dad18105ed0 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -490,6 +490,9 @@ void nfs_retry_commit(struct list_head *page_list,
 void nfs_commitdata_release(struct nfs_commit_data *data);
 void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 				 struct nfs_commit_info *cinfo);
+void nfs_request_add_commit_list_locked(struct nfs_page *req,
+		struct list_head *dst,
+		struct nfs_commit_info *cinfo);
 void nfs_request_remove_commit_list(struct nfs_page *req,
 				    struct nfs_commit_info *cinfo);
 void nfs_init_cinfo(struct nfs_commit_info *cinfo,
@@ -623,13 +626,15 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
  * Record the page as unstable and mark its inode as dirty.
  */
 static inline
-void nfs_mark_page_unstable(struct page *page)
+void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
 {
-	struct inode *inode = page_file_mapping(page)->host;
+	if (!cinfo->dreq) {
+		struct inode *inode = page_file_mapping(page)->host;
 
-	inc_zone_page_state(page, NR_UNSTABLE_NFS);
-	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
-	 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+		inc_zone_page_state(page, NR_UNSTABLE_NFS);
+		inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	}
 }
 
 /*
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index f37e25b6311c..7a282876662f 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -863,9 +863,10 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
 	}
 	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 	cinfo->ds->nwritten++;
-	spin_unlock(cinfo->lock);
 
-	nfs_request_add_commit_list(req, list, cinfo);
+	nfs_request_add_commit_list_locked(req, list, cinfo);
+	spin_unlock(cinfo->lock);
+	nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 75a35a1afa79..fdee9270ca15 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -767,6 +767,28 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
 	return NULL;
 }
 
+/**
+ * nfs_request_add_commit_list_locked - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @dst: commit list head
+ * @cinfo: holds list lock and accounting info
+ *
+ * This sets the PG_CLEAN bit, updates the cinfo count of
+ * number of outstanding requests requiring a commit as well as
+ * the MM page stats.
+ *
+ * The caller must hold the cinfo->lock, and the nfs_page lock.
+ */
+void
+nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
+			    struct nfs_commit_info *cinfo)
+{
+	set_bit(PG_CLEAN, &req->wb_flags);
+	nfs_list_add_request(req, dst);
+	cinfo->mds->ncommit++;
+}
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
+
 /**
  * nfs_request_add_commit_list - add request to a commit list
  * @req: pointer to a struct nfs_page
@@ -784,13 +806,10 @@ void
 nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 			    struct nfs_commit_info *cinfo)
 {
-	set_bit(PG_CLEAN, &(req)->wb_flags);
 	spin_lock(cinfo->lock);
-	nfs_list_add_request(req, dst);
-	cinfo->mds->ncommit++;
+	nfs_request_add_commit_list_locked(req, dst, cinfo);
 	spin_unlock(cinfo->lock);
-	if (!cinfo->dreq)
-		nfs_mark_page_unstable(req->wb_page);
+	nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
 
-- 
cgit v1.2.3


From d099d7b8316f3ebd63472d207c4801a464330016 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Mon, 10 Aug 2015 16:47:32 +0800
Subject: pnfs/flexfiles: LAYOUTSTATS ii_count should be ops instead of bytes

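It turned out I misinterpreted the spec: the LAYOUTSTATS read/write count
fields are meant to carry the number of operations, not the number of bytes.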

Cc: Tom Haynes <thomas.haynes@primarydata.com>
Reported-by: Jean Spector <jean@primarydata.com>
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index b3289d701eea..ba74b15bf0ba 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1863,10 +1863,9 @@ ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
 		memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
 		devinfo->offset = pls->pls_range.offset;
 		devinfo->length = pls->pls_range.length;
-		/* well, we don't really know if IO is continuous or not! */
-		devinfo->read_count = mirror->read_stat.io_stat.bytes_completed;
+		devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
 		devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
-		devinfo->write_count = mirror->write_stat.io_stat.bytes_completed;
+		devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
 		devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
 		devinfo->layout_type = LAYOUT_FLEX_FILES;
 		devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
-- 
cgit v1.2.3


From c8ad8894e92b853df5a766061ee9cde7e10e682f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 5 Aug 2015 17:31:58 -0400
Subject: NFSv4.2/pnfs: Use GFP_NOIO for layoutstat reporting in the writeback
 path

Prevent a potential deadlock.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 6 ++++--
 fs/nfs/pnfs.c                          | 4 ++--
 fs/nfs/pnfs.h                          | 4 ++--
 3 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index ba74b15bf0ba..2a93bec7e6dd 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -510,7 +510,8 @@ nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
 	spin_unlock(&mirror->lock);
 
 	if (report)
-		pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+		pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode,
+				GFP_KERNEL);
 }
 
 static void
@@ -538,7 +539,8 @@ nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
 	spin_unlock(&mirror->lock);
 
 	if (report)
-		pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+		pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode,
+				GFP_NOIO);
 }
 
 static void
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 70bf706b1090..a6ec420983d1 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2267,7 +2267,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
 
 #if IS_ENABLED(CONFIG_NFS_V4_2)
 int
-pnfs_report_layoutstat(struct inode *inode)
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
 {
 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 	struct nfs_server *server = NFS_SERVER(inode);
@@ -2294,7 +2294,7 @@ pnfs_report_layoutstat(struct inode *inode)
 	pnfs_get_layout_hdr(hdr);
 	spin_unlock(&inode->i_lock);
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = kzalloc(sizeof(*data), gfp_flags);
 	if (!data) {
 		status = -ENOMEM;
 		goto out_put;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 3e6ab7bfbabd..738672a0f8da 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -691,10 +691,10 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
 #endif /* CONFIG_NFS_V4_1 */
 
 #if IS_ENABLED(CONFIG_NFS_V4_2)
-int pnfs_report_layoutstat(struct inode *inode);
+int pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags);
 #else
 static inline int
-pnfs_report_layoutstat(struct inode *inode)
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 5ef8d792fabedeb932375b23735bc7a1a3e8684d Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Thu, 30 Jul 2015 21:41:08 +0800
Subject: NFS: Error out when register_shrinker fail in register_nfs_fs

Commit 1d3d4437ea "vmscan: per-node deferred work" made
register_shrinker() able to return an integer error code.

If register_shrinker() fails, the later unregister_shrinker() will
cause a NULL pointer access.

v2, same as v1.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/super.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index aa62004f1706..383a027de452 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -381,9 +381,12 @@ int __init register_nfs_fs(void)
 	ret = nfs_register_sysctl();
 	if (ret < 0)
 		goto error_2;
-	register_shrinker(&acl_shrinker);
+	ret = register_shrinker(&acl_shrinker);
+	if (ret < 0)
+		goto error_3;
 	return 0;
-
+error_3:
+	nfs_unregister_sysctl();
 error_2:
 	unregister_nfs4_fs();
 error_1:
-- 
cgit v1.2.3


From efcbc04e16dfa95fef76309f89710dd1d99a5453 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 30 Jul 2015 13:00:56 +1000
Subject: NFSv4: don't set SETATTR for O_RDONLY|O_EXCL

It is unusual to combine the open flags O_RDONLY and O_EXCL, but
it appears that libre-office does just that.

[pid  3250] stat("/home/USER/.config", {st_mode=S_IFDIR|0700, st_size=8192, ...}) = 0
[pid  3250] open("/home/USER/.config/libreoffice/4-suse/user/extensions/buildid", O_RDONLY|O_EXCL <unfinished ...>

NFSv4 takes O_EXCL as a sign that a setattr command should be sent,
probably to reset the timestamps.

When it was an O_RDONLY open, the SETATTR command does not
identify any actual attributes to change.
If no delegation was provided to the open, the SETATTR uses the
all-zeros stateid and the request is accepted (at least by the
Linux NFS server - no harm, no foul).

If a read-delegation was provided, this is used in the SETATTR
request, and a Netapp filer will justifiably claim
NFS4ERR_BAD_STATEID, which the Linux client takes as a sign
to retry - indefinitely.

So only treat O_EXCL specially if O_CREAT was also given.

Signed-off-by: NeilBrown <neilb@suse.com>
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3acb1eb72930..15ee8bd99b61 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2425,7 +2425,7 @@ static int _nfs4_do_open(struct inode *dir,
 		goto err_free_label;
 	state = ctx->state;
 
-	if ((opendata->o_arg.open_flags & O_EXCL) &&
+	if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
 	    (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
 		nfs4_exclusive_attrset(opendata, sattr);
 
-- 
cgit v1.2.3


From 0847ef88c3c9318d85e92fc42369df0e0190e1ab Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Thu, 30 Jul 2015 21:40:06 +0800
Subject: NFS: Remove duplicate svc_xprt_put from nfs41_callback_up

The xprt created by svc_create_xprt() has already been added to
serv->sv_permsocks, so putting the xprt directly is useless.
Worse, it leads to one more svc_xprt_put() after the xprt has been freed.

v2, same as v1.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 682529c00996..6d27d7215a21 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -162,10 +162,6 @@ nfs41_callback_up(struct svc_serv *serv)
 	spin_lock_init(&serv->sv_cb_lock);
 	init_waitqueue_head(&serv->sv_cb_waitq);
 	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-	if (IS_ERR(rqstp)) {
-		svc_xprt_put(serv->sv_bc_xprt);
-		serv->sv_bc_xprt = NULL;
-	}
 	dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
 	return rqstp;
 }
-- 
cgit v1.2.3


From 27571297a7e9a2a845c232813a7ba7e1227f5ec6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 3 Aug 2015 17:38:33 -0400
Subject: pNFS: Tighten up locking around DS commit buckets

I'm not aware of any bugreports around this issue, but the locking
around the pnfs_commit_bucket is inconsistent at best. This patch
tightens it up by ensuring that the 'bucket->committing' list is always
changed atomically w.r.t. the 'bucket->clseg' layout segment tracking.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs_nfs.c | 50 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 18 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 7a282876662f..cd3c0403101b 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -124,11 +124,12 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
 	if (ret) {
 		cinfo->ds->nwritten -= ret;
 		cinfo->ds->ncommitting += ret;
-		bucket->clseg = bucket->wlseg;
-		if (list_empty(src))
+		if (bucket->clseg == NULL)
+			bucket->clseg = pnfs_get_lseg(bucket->wlseg);
+		if (list_empty(src)) {
+			pnfs_put_lseg_locked(bucket->wlseg);
 			bucket->wlseg = NULL;
-		else
-			pnfs_get_lseg(bucket->clseg);
+		}
 	}
 	return ret;
 }
@@ -182,19 +183,23 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
 	struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
 	struct pnfs_commit_bucket *bucket;
 	struct pnfs_layout_segment *freeme;
+	LIST_HEAD(pages);
 	int i;
 
+	spin_lock(cinfo->lock);
 	for (i = idx; i < fl_cinfo->nbuckets; i++) {
 		bucket = &fl_cinfo->buckets[i];
 		if (list_empty(&bucket->committing))
 			continue;
-		nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
-		spin_lock(cinfo->lock);
 		freeme = bucket->clseg;
 		bucket->clseg = NULL;
+		list_splice_init(&bucket->committing, &pages);
 		spin_unlock(cinfo->lock);
+		nfs_retry_commit(&pages, freeme, cinfo, i);
 		pnfs_put_lseg(freeme);
+		spin_lock(cinfo->lock);
 	}
+	spin_unlock(cinfo->lock);
 }
 
 static unsigned int
@@ -216,10 +221,6 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
 		if (!data)
 			break;
 		data->ds_commit_index = i;
-		spin_lock(cinfo->lock);
-		data->lseg = bucket->clseg;
-		bucket->clseg = NULL;
-		spin_unlock(cinfo->lock);
 		list_add(&data->pages, list);
 		nreq++;
 	}
@@ -229,6 +230,22 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
 	return nreq;
 }
 
+static inline
+void pnfs_fetch_commit_bucket_list(struct list_head *pages,
+		struct nfs_commit_data *data,
+		struct nfs_commit_info *cinfo)
+{
+	struct pnfs_commit_bucket *bucket;
+
+	bucket = &cinfo->ds->buckets[data->ds_commit_index];
+	spin_lock(cinfo->lock);
+	list_splice_init(pages, &bucket->committing);
+	data->lseg = bucket->clseg;
+	bucket->clseg = NULL;
+	spin_unlock(cinfo->lock);
+
+}
+
 /* This follows nfs_commit_list pretty closely */
 int
 pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
@@ -243,7 +260,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 	if (!list_empty(mds_pages)) {
 		data = nfs_commitdata_alloc();
 		if (data != NULL) {
-			data->lseg = NULL;
+			data->ds_commit_index = -1;
 			list_add(&data->pages, &list);
 			nreq++;
 		} else {
@@ -265,19 +282,16 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 
 	list_for_each_entry_safe(data, tmp, &list, pages) {
 		list_del_init(&data->pages);
-		if (!data->lseg) {
+		if (data->ds_commit_index < 0) {
 			nfs_init_commit(data, mds_pages, NULL, cinfo);
 			nfs_initiate_commit(NFS_CLIENT(inode), data,
 					    NFS_PROTO(data->inode),
 					    data->mds_ops, how, 0);
 		} else {
-			struct pnfs_commit_bucket *buckets;
+			LIST_HEAD(pages);
 
-			buckets = cinfo->ds->buckets;
-			nfs_init_commit(data,
-					&buckets[data->ds_commit_index].committing,
-					data->lseg,
-					cinfo);
+			pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
+			nfs_init_commit(data, &pages, data->lseg, cinfo);
 			initiate_commit(data, how);
 		}
 	}
-- 
cgit v1.2.3


From a4497a58e4043a925b7b308bd2c32f0744eca440 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 4 Aug 2015 15:41:50 -0400
Subject: NFSv4.1/pnfs: Remove redundant checks in pnfs_layoutgets_blocked()

If there are no valid layout segments, then we should already have
checked in pnfs_update_layout() whether or not this is the first
layoutget.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a6ec420983d1..8855b322d127 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -829,12 +829,10 @@ pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
 /* lget is set to 1 if called from inside send_layoutget call chain */
 static bool
 pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
-			struct pnfs_layout_range *range, int lget)
+			struct pnfs_layout_range *range)
 {
 	return lo->plh_block_lgets ||
 		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
-		(list_empty(&lo->plh_segs) &&
-		 (atomic_read(&lo->plh_outstanding) > lget)) ||
 		pnfs_layout_returning(lo, range);
 }
 
@@ -847,7 +845,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 
 	dprintk("--> %s\n", __func__);
 	spin_lock(&lo->plh_inode->i_lock);
-	if (pnfs_layoutgets_blocked(lo, range, 1)) {
+	if (pnfs_layoutgets_blocked(lo, range)) {
 		status = -EAGAIN;
 	} else if (!nfs4_valid_open_stateid(open_state)) {
 		status = -EBADF;
@@ -1547,7 +1545,7 @@ lookup_again:
 		goto out_put_layout_hdr;
 	}
 
-	if (pnfs_layoutgets_blocked(lo, &arg, 0))
+	if (pnfs_layoutgets_blocked(lo, &arg))
 		goto out_unlock;
 	atomic_inc(&lo->plh_outstanding);
 	spin_unlock(&ino->i_lock);
@@ -1624,7 +1622,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 		goto out_forget_reply;
 	}
 
-	if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
+	if (pnfs_layoutgets_blocked(lo, &lgp->args.range)) {
 		dprintk("%s forget reply due to state\n", __func__);
 		goto out_forget_reply;
 	}
-- 
cgit v1.2.3


From 8f70f53a87007bdbb34c79d11178a153914f5db1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 4 Aug 2015 16:09:44 -0400
Subject: NFSv4.1/pnfs: Fix serialisation of layout return and layoutget

We should always test for outstanding layout returns, whether or not
pnfs_should_retry_layoutget() is true.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8855b322d127..e3a47ee15474 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1436,6 +1436,8 @@ static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
 
 static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
 {
+	if (!pnfs_should_retry_layoutget(lo))
+		return false;
 	/*
 	 * send layoutcommit as it can hold up layoutreturn due to lseg
 	 * reference
@@ -1531,8 +1533,7 @@ lookup_again:
 	 * Because we free lsegs before sending LAYOUTRETURN, we need to wait
 	 * for LAYOUTRETURN even if first is true.
 	 */
-	if (!lseg && pnfs_should_retry_layoutget(lo) &&
-	    test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
 		spin_unlock(&ino->i_lock);
 		dprintk("%s wait for layoutreturn\n", __func__);
 		if (pnfs_prepare_to_retry_layoutget(lo)) {
-- 
cgit v1.2.3


From 5c4a79fb2b1cd80cb58986f6acf402721901c545 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 4 Aug 2015 15:57:13 -0400
Subject: NFSv4.1/pnfs: Don't prevent layoutgets when doing return-on-close

If there is an outstanding return-on-close, then we just want new
layoutget requests to wait rather than fail.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e3a47ee15474..8c5f9f59efbb 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1105,7 +1105,9 @@ bool pnfs_roc(struct inode *ino)
 		}
 	if (!found)
 		goto out_noroc;
-	lo->plh_block_lgets++;
+	if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+		goto out_noroc;
+	lo->plh_return_iomode = IOMODE_ANY;
 	pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&tmp_list);
@@ -1133,7 +1135,7 @@ void pnfs_roc_release(struct inode *ino)
 
 	spin_lock(&ino->i_lock);
 	lo = NFS_I(ino)->layout;
-	lo->plh_block_lgets--;
+	pnfs_clear_layoutreturn_waitbit(lo);
 	if (atomic_dec_and_test(&lo->plh_refcount)) {
 		pnfs_detach_layout_hdr(lo);
 		spin_unlock(&ino->i_lock);
-- 
cgit v1.2.3


From 2d8ae84fbc32a14bba176cf9c20d5eb2a3d42791 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 4 Aug 2015 16:15:48 -0400
Subject: NFSv4.1/pnfs: Remove redundant lo->plh_block_lgets in layoutreturn

The NFS_LAYOUT_RETURN bit already suffices to ensure that layoutget
is blocked.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8c5f9f59efbb..ada07376a6c7 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -368,7 +368,6 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
 	if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
 		return false;
 	lo->plh_return_iomode = 0;
-	lo->plh_block_lgets++;
 	pnfs_get_layout_hdr(lo);
 	clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
 	return true;
@@ -954,7 +953,6 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
 	if (unlikely(lrp == NULL)) {
 		status = -ENOMEM;
 		spin_lock(&ino->i_lock);
-		lo->plh_block_lgets--;
 		pnfs_clear_layoutreturn_waitbit(lo);
 		rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
 		spin_unlock(&ino->i_lock);
-- 
cgit v1.2.3


From e1c06f80dcca54cd323d1b98eb385a2c05c6e06b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 4 Aug 2015 16:40:08 -0400
Subject: NFSv4.1/pnfs: Remove redundant check in pnfs_layoutgets_blocked()

layoutget now should already be serialised w.r.t. layout returns

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 26 +++++---------------------
 1 file changed, 5 insertions(+), 21 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ada07376a6c7..037126a8c216 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -816,23 +816,12 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
 	return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
 }
 
-static bool
-pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
-		      struct pnfs_layout_range *range)
-{
-	return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
-		(lo->plh_return_iomode == IOMODE_ANY ||
-		 lo->plh_return_iomode == range->iomode);
-}
-
 /* lget is set to 1 if called from inside send_layoutget call chain */
 static bool
-pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
-			struct pnfs_layout_range *range)
+pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
 {
 	return lo->plh_block_lgets ||
-		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
-		pnfs_layout_returning(lo, range);
+		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
 }
 
 int
@@ -844,7 +833,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 
 	dprintk("--> %s\n", __func__);
 	spin_lock(&lo->plh_inode->i_lock);
-	if (pnfs_layoutgets_blocked(lo, range)) {
+	if (pnfs_layoutgets_blocked(lo)) {
 		status = -EAGAIN;
 	} else if (!nfs4_valid_open_stateid(open_state)) {
 		status = -EBADF;
@@ -1546,7 +1535,7 @@ lookup_again:
 		goto out_put_layout_hdr;
 	}
 
-	if (pnfs_layoutgets_blocked(lo, &arg))
+	if (pnfs_layoutgets_blocked(lo))
 		goto out_unlock;
 	atomic_inc(&lo->plh_outstanding);
 	spin_unlock(&ino->i_lock);
@@ -1618,12 +1607,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	lseg->pls_range = res->range;
 
 	spin_lock(&ino->i_lock);
-	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
-		dprintk("%s forget reply due to recall\n", __func__);
-		goto out_forget_reply;
-	}
-
-	if (pnfs_layoutgets_blocked(lo, &lgp->args.range)) {
+	if (pnfs_layoutgets_blocked(lo)) {
 		dprintk("%s forget reply due to state\n", __func__);
 		goto out_forget_reply;
 	}
-- 
cgit v1.2.3


From 58830550f009c5f60f702c9d3021f8c3be0012b0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 4 Aug 2015 17:18:10 -0400
Subject: NFSv4.1/pnfs: Remove redundant wakeup in pnfs_send_layoutreturn()

pnfs_clear_layoutreturn_waitbit() should already be calling
rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq) for us.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 037126a8c216..6151f39c8291 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -943,7 +943,6 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
 		status = -ENOMEM;
 		spin_lock(&ino->i_lock);
 		pnfs_clear_layoutreturn_waitbit(lo);
-		rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
 		spin_unlock(&ino->i_lock);
 		pnfs_put_layout_hdr(lo);
 		goto out;
-- 
cgit v1.2.3


From e9ae58aeee8842a50f7e199d602a5ccb2e41a95f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Aug 2015 12:57:07 -0500
Subject: NFS: nfs_set_pgio_error sometimes misses errors

We should ensure that we always set the pgio_header's error field
if a READ or WRITE RPC call returns an error. The current code depends
on 'hdr->good_bytes' always being initialised to a large value, which
is not always done correctly by callers.
When this happens, applications may end up missing important errors.

Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pagelist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 4984bbe55ff1..7c5718ba625e 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init);
 void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
 {
 	spin_lock(&hdr->lock);
-	if (pos < hdr->io_start + hdr->good_bytes) {
-		set_bit(NFS_IOHDR_ERROR, &hdr->flags);
+	if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)
+	    || pos < hdr->io_start + hdr->good_bytes) {
 		clear_bit(NFS_IOHDR_EOF, &hdr->flags);
 		hdr->good_bytes = pos - hdr->io_start;
 		hdr->error = error;
-- 
cgit v1.2.3


From 6f536936b79bd4b5cea8fb0e5b8b0bce8cd1ea4a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 13 Aug 2015 10:59:07 -0400
Subject: NFSv4.1/pNFS: Fix borken function _same_data_server_addrs_locked()

- Switch back to using list_for_each_entry(). Fixes an incorrect test
  for list NULL termination.
- Do not assume that lists are sorted.
- Finally, consider an existing entry to match if it consists of a subset
  of the addresses in the new entry.

Cc: stable@vger.kernel.org # 4.0+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs_nfs.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 7a282876662f..e5c679f04099 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -359,26 +359,31 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
 	return false;
 }
 
+/*
+ * Checks if every address in 'dsaddrs1' also appears in 'dsaddrs2',
+ * i.e. 'dsaddrs1' is a subset of 'dsaddrs2'. If so, declare a match.
+ */
 static bool
 _same_data_server_addrs_locked(const struct list_head *dsaddrs1,
 			       const struct list_head *dsaddrs2)
 {
 	struct nfs4_pnfs_ds_addr *da1, *da2;
-
-	/* step through both lists, comparing as we go */
-	for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
-	     da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
-	     da1 != NULL && da2 != NULL;
-	     da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
-	     da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
-		if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
-				   (struct sockaddr *)&da2->da_addr))
-			return false;
+	struct sockaddr *sa1, *sa2;
+	bool match = false;
+
+	list_for_each_entry(da1, dsaddrs1, da_node) {
+		sa1 = (struct sockaddr *)&da1->da_addr;
+		match = false;
+		list_for_each_entry(da2, dsaddrs2, da_node) {
+			sa2 = (struct sockaddr *)&da2->da_addr;
+			match = same_sockaddr(sa1, sa2);
+			if (match)
+				break;
+		}
+		if (!match)
+			break;
 	}
-	if (da1 == NULL && da2 == NULL)
-		return true;
-
-	return false;
+	return match;
 }
 
 /*
-- 
cgit v1.2.3


From 18e3b739fdc826481c6a1335ce0c5b19b3d415da Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Sat, 15 Aug 2015 21:52:10 +0800
Subject: NFS: Fix a NULL pointer dereference of migration recovery ops for
 v4.2 client

---Steps to Reproduce--
<nfs-server>
# cat /etc/exports
/nfs/referal  *(rw,insecure,no_subtree_check,no_root_squash,crossmnt)
/nfs/old      *(ro,insecure,subtree_check,root_squash,crossmnt)

<nfs-client>
# mount -t nfs nfs-server:/nfs/ /mnt/
# ll /mnt/*/

<nfs-server>
# cat /etc/exports
/nfs/referal   *(rw,insecure,no_subtree_check,no_root_squash,crossmnt,refer=/nfs/old/@nfs-server)
/nfs/old       *(ro,insecure,subtree_check,root_squash,crossmnt)
# service nfs restart

<nfs-client>
# ll /mnt/*/    --->>>>> oops here

[ 5123.102925] BUG: unable to handle kernel NULL pointer dereference at           (null)
[ 5123.103363] IP: [<ffffffffa03ed38b>] nfs4_proc_get_locations+0x9b/0x120 [nfsv4]
[ 5123.103752] PGD 587b9067 PUD 3cbf5067 PMD 0
[ 5123.104131] Oops: 0000 [#1]
[ 5123.104529] Modules linked in: nfsv4(OE) nfs(OE) fscache(E) nfsd(OE) xfs libcrc32c iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi coretemp crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel ppdev vmw_balloon parport_pc parport i2c_piix4 shpchp auth_rpcgss nfs_acl vmw_vmci lockd grace sunrpc vmwgfx drm_kms_helper ttm drm mptspi serio_raw scsi_transport_spi e1000 mptscsih mptbase ata_generic pata_acpi [last unloaded: nfsd]
[ 5123.105887] CPU: 0 PID: 15853 Comm: ::1-manager Tainted: G           OE   4.2.0-rc6+ #214
[ 5123.106358] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/20/2014
[ 5123.106860] task: ffff88007620f300 ti: ffff88005877c000 task.ti: ffff88005877c000
[ 5123.107363] RIP: 0010:[<ffffffffa03ed38b>]  [<ffffffffa03ed38b>] nfs4_proc_get_locations+0x9b/0x120 [nfsv4]
[ 5123.107909] RSP: 0018:ffff88005877fdb8  EFLAGS: 00010246
[ 5123.108435] RAX: ffff880053f3bc00 RBX: ffff88006ce6c908 RCX: ffff880053a0d240
[ 5123.108968] RDX: ffffea0000e6d940 RSI: ffff8800399a0000 RDI: ffff88006ce6c908
[ 5123.109503] RBP: ffff88005877fe28 R08: ffffffff81c708a0 R09: 0000000000000000
[ 5123.110045] R10: 00000000000001a2 R11: ffff88003ba7f5c8 R12: ffff880054c55800
[ 5123.110618] R13: 0000000000000000 R14: ffff880053a0d240 R15: ffff880053a0d240
[ 5123.111169] FS:  0000000000000000(0000) GS:ffffffff81c27000(0000) knlGS:0000000000000000
[ 5123.111726] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 5123.112286] CR2: 0000000000000000 CR3: 0000000054cac000 CR4: 00000000001406f0
[ 5123.112888] Stack:
[ 5123.113458]  ffffea0000e6d940 ffff8800399a0000 00000000000167d0 0000000000000000
[ 5123.114049]  0000000000000000 0000000000000000 0000000000000000 00000000a7ec82c6
[ 5123.114662]  ffff88005877fe18 ffffea0000e6d940 ffff8800399a0000 ffff880054c55800
[ 5123.115264] Call Trace:
[ 5123.115868]  [<ffffffffa03fb44b>] nfs4_try_migration+0xbb/0x220 [nfsv4]
[ 5123.116487]  [<ffffffffa03fcb3b>] nfs4_run_state_manager+0x4ab/0x7b0 [nfsv4]
[ 5123.117104]  [<ffffffffa03fc690>] ? nfs4_do_reclaim+0x510/0x510 [nfsv4]
[ 5123.117813]  [<ffffffff810a4527>] kthread+0xd7/0xf0
[ 5123.118456]  [<ffffffff810a4450>] ? kthread_worker_fn+0x160/0x160
[ 5123.119108]  [<ffffffff816d9cdf>] ret_from_fork+0x3f/0x70
[ 5123.119723]  [<ffffffff810a4450>] ? kthread_worker_fn+0x160/0x160
[ 5123.120329] Code: 4c 8b 6a 58 74 17 eb 52 48 8d 55 a8 89 c6 4c 89 e7 e8 4a b5 ff ff 8b 45 b0 85 c0 74 1c 4c 89 f9 48 8b 55 90 48 8b 75 98 48 89 df <41> ff 55 00 3d e8 d8 ff ff 41 89 c6 74 cf 48 8b 4d c8 65 48 33
[ 5123.121643] RIP  [<ffffffffa03ed38b>] nfs4_proc_get_locations+0x9b/0x120 [nfsv4]
[ 5123.122308]  RSP <ffff88005877fdb8>
[ 5123.122942] CR2: 0000000000000000

Fixes: ec011fe847 ("NFS: Introduce a vector of migration recovery ops")
Cc: stable@vger.kernel.org # v3.13+
Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 15ee8bd99b61..43bace809be7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8661,6 +8661,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
 	.state_renewal_ops = &nfs41_state_renewal_ops,
+	.mig_recovery_ops = &nfs41_mig_recovery_ops,
 };
 #endif
 
-- 
cgit v1.2.3


From 29662fa646b41492a9c298a83399126f94847e93 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 17 Aug 2015 18:40:57 +0200
Subject: pnfs/blocklayout: calculate layoutupdate size correctly

We need to include the leading u32 that holds the number of entries.  Add a
helper for the calculation instead of open-coding it so that it's in one place.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/extent_tree.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 31d0b5e53dfd..7536036fb526 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -462,6 +462,12 @@ out:
 	return err;
 }
 
+static size_t ext_tree_layoutupdate_size(size_t count)
+{
+	return sizeof(__be32) /* number of entries */ +
+		BL_EXTENT_SIZE * count;
+}
+
 static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
 		size_t buffer_size)
 {
@@ -489,7 +495,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
 			continue;
 
 		(*count)++;
-		if (*count * BL_EXTENT_SIZE > buffer_size) {
+		if (ext_tree_layoutupdate_size(*count) > buffer_size) {
 			/* keep counting.. */
 			ret = -ENOSPC;
 			continue;
@@ -530,7 +536,7 @@ retry:
 	if (unlikely(ret)) {
 		ext_tree_free_commitdata(arg, buffer_size);
 
-		buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+		buffer_size = ext_tree_layoutupdate_size(count);
 		count = 0;
 
 		arg->layoutupdate_pages =
@@ -549,7 +555,7 @@ retry:
 	}
 
 	*start_p = cpu_to_be32(count);
-	arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+	arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
 
 	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
 		__be32 *p = start_p;
-- 
cgit v1.2.3


From 68596bd188e5e621c28a2f6fc0a3dd80a606d16b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 17 Aug 2015 18:40:58 +0200
Subject: pnfs/blocklayout: set up layoutupdate_pages properly

We need to replace the __be32 pointer with a void pointer so that the pointer
arithmetic on the virtual addresses is done in bytes and we get the right
page pointers.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/extent_tree.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 7536036fb526..a11b759d294a 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -558,14 +558,11 @@ retry:
 	arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
 
 	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
-		__be32 *p = start_p;
+		void *p = start_p, *end = p + arg->layoutupdate_len;
 		int i = 0;
 
-		for (p = start_p;
-		     p < start_p + arg->layoutupdate_len;
-		     p += PAGE_SIZE) {
+		for ( ; p < end; p += PAGE_SIZE)
 			arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
-		}
 	}
 
 	dprintk("%s found %zu ranges\n", __func__, count);
-- 
cgit v1.2.3


From 2bd3c63a333c364cfde4701b51f82e1fae106a88 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 17 Aug 2015 18:40:59 +0200
Subject: pnfs/blocklayout: reject too long signatures

Instead of overwriting kernel memory, reject signatures that are too long.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/dev.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs/nfs')

diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index e535599a0719..d76993a42432 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -65,6 +65,11 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
 				return -EIO;
 			p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
 			b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+			if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) {
+				pr_info("signature too long: %d\n",
+					b->simple.sigs[i].sig_len);
+				return -EIO;
+			}
 
 			p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
 			if (!p)
-- 
cgit v1.2.3


From 513d6d7a9591d7c5ebb5f05465942185db9299a4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 17 Aug 2015 18:41:00 +0200
Subject: pnfs/blocklayout: pass proper file mode to blkdev_get/put

We generally want to read and write to a block device that's used by
the pNFS block layout client (and even if it's read only the server
has no way of telling us).  Add FMODE_WRITE to the mode argument
so that we don't incorrectly tell the block driver that we want a
read-only open.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index d76993a42432..a861bbdfe577 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -22,7 +22,7 @@ bl_free_device(struct pnfs_block_dev *dev)
 		kfree(dev->children);
 	} else {
 		if (dev->bdev)
-			blkdev_put(dev->bdev, FMODE_READ);
+			blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
 	}
 }
 
@@ -200,7 +200,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
 	if (!dev)
 		return -EIO;
 
-	d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+	d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
 	if (IS_ERR(d->bdev)) {
 		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
 			MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
-- 
cgit v1.2.3


From 8bb28975823aee062f82b99ddacc499601c0cfd1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 17 Aug 2015 18:41:01 +0200
Subject: pnfs: move common blocklayout XDR definitions to nfs4.h

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.h | 19 +------------------
 fs/nfs/blocklayout/extent_tree.c |  2 +-
 fs/nfsd/blocklayoutxdr.c         |  2 +-
 fs/nfsd/blocklayoutxdr.h         | 15 ---------------
 include/linux/nfs4.h             | 18 ++++++++++++++++++
 5 files changed, 21 insertions(+), 35 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 92dca9e90d8d..c556640dcf3b 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -46,13 +46,6 @@
 
 struct pnfs_block_dev;
 
-enum pnfs_block_volume_type {
-	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
-	PNFS_BLOCK_VOLUME_SLICE		= 1,
-	PNFS_BLOCK_VOLUME_CONCAT	= 2,
-	PNFS_BLOCK_VOLUME_STRIPE	= 3,
-};
-
 #define PNFS_BLOCK_MAX_UUIDS	4
 #define PNFS_BLOCK_MAX_DEVICES	64
 
@@ -117,13 +110,6 @@ struct pnfs_block_dev {
 			struct pnfs_block_dev_map *map);
 };
 
-enum exstate4 {
-	PNFS_BLOCK_READWRITE_DATA	= 0,
-	PNFS_BLOCK_READ_DATA		= 1,
-	PNFS_BLOCK_INVALID_DATA		= 2, /* mapped, but data is invalid */
-	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */
-};
-
 /* sector_t fields are all in 512-byte sectors */
 struct pnfs_block_extent {
 	union {
@@ -134,15 +120,12 @@ struct pnfs_block_extent {
 	sector_t	be_f_offset;	/* the starting offset in the file */
 	sector_t	be_length;	/* the size of the extent */
 	sector_t	be_v_offset;	/* the starting offset in the volume */
-	enum exstate4	be_state;	/* the state of this extent */
+	enum pnfs_block_extent_state be_state;	/* the state of this extent */
 #define EXTENT_WRITTEN		1
 #define EXTENT_COMMITTING	2
 	unsigned int	be_tag;
 };
 
-/* on the wire size of the extent */
-#define BL_EXTENT_SIZE	(7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
-
 struct pnfs_block_layout {
 	struct pnfs_layout_hdr	bl_layout;
 	struct rb_root		bl_ext_rw;
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index a11b759d294a..c59a59c37f3d 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -465,7 +465,7 @@ out:
 static size_t ext_tree_layoutupdate_size(size_t count)
 {
 	return sizeof(__be32) /* number of entries */ +
-		BL_EXTENT_SIZE * count;
+		PNFS_BLOCK_EXTENT_SIZE * count;
 }
 
 static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 9aa2796da90d..6d834dc9bbc8 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -101,7 +101,7 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
 	}
 
 	nr_iomaps = be32_to_cpup(p++);
-	expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
+	expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
 	if (len != expected) {
 		dprintk("%s: extent array size mismatch: %u/%u\n",
 			__func__, len, expected);
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index fdc79037c0e7..6de925fe8499 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -7,13 +7,6 @@
 struct iomap;
 struct xdr_stream;
 
-enum pnfs_block_extent_state {
-	PNFS_BLOCK_READWRITE_DATA	= 0,
-	PNFS_BLOCK_READ_DATA		= 1,
-	PNFS_BLOCK_INVALID_DATA		= 2,
-	PNFS_BLOCK_NONE_DATA		= 3,
-};
-
 struct pnfs_block_extent {
 	struct nfsd4_deviceid		vol_id;
 	u64				foff;
@@ -21,14 +14,6 @@ struct pnfs_block_extent {
 	u64				soff;
 	enum pnfs_block_extent_state	es;
 };
-#define NFS4_BLOCK_EXTENT_SIZE		44
-
-enum pnfs_block_volume_type {
-	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
-	PNFS_BLOCK_VOLUME_SLICE		= 1,
-	PNFS_BLOCK_VOLUME_CONCAT	= 2,
-	PNFS_BLOCK_VOLUME_STRIPE	= 3,
-};
 
 /*
  * Random upper cap for the uuid length to avoid unbounded allocation.
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index b8e72aad919c..00121f298269 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -547,6 +547,24 @@ enum pnfs_notify_deviceid_type4 {
 	NOTIFY_DEVICEID4_DELETE = 1 << 2,
 };
 
+enum pnfs_block_volume_type {
+	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
+	PNFS_BLOCK_VOLUME_SLICE		= 1,
+	PNFS_BLOCK_VOLUME_CONCAT	= 2,
+	PNFS_BLOCK_VOLUME_STRIPE	= 3,
+};
+
+enum pnfs_block_extent_state {
+	PNFS_BLOCK_READWRITE_DATA	= 0,
+	PNFS_BLOCK_READ_DATA		= 1,
+	PNFS_BLOCK_INVALID_DATA		= 2,
+	PNFS_BLOCK_NONE_DATA		= 3,
+};
+
+/* on the wire size of a block layout extent */
+#define PNFS_BLOCK_EXTENT_SIZE \
+	(7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
+
 #define NFL4_UFLG_MASK			0x0000003F
 #define NFL4_UFLG_DENSE			0x00000001
 #define NFL4_UFLG_COMMIT_THRU_MDS	0x00000002
-- 
cgit v1.2.3


From ce603281468cf1be0a70a45fdaef761db134b900 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Fri, 10 Jul 2015 15:59:26 -0400
Subject: nfs: remove some dead code in ff_layout_pg_get_mirror_count_write

We already know that pg_lseg is NULL here.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 2a93bec7e6dd..13fe64b4e259 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -730,8 +730,6 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
 		return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
 
 	/* no lseg means that pnfs is not in use, so no mirroring here */
-	pnfs_put_lseg(pgio->pg_lseg);
-	pgio->pg_lseg = NULL;
 	nfs_pageio_reset_write_mds(pgio);
 	return 1;
 }
-- 
cgit v1.2.3


From 0b936e37df8111ea727263a5a246b58e05b7aa1b Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Mon, 13 Jul 2015 14:01:24 -0400
Subject: NFS: Remove unused variable "pages_ptr"

This variable is initialized to NULL and is never modified before being
passed to nfs_readdir_free_large_page().  But that's okay, because
nfs_readdir_free_large_page() only seems to exist as a way of calling
nfs_readdir_free_pagearray() without this parameter.  Let's simplify by
removing pages_ptr and nfs_readdir_free_pagearray().

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/dir.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 547308a5ec6f..26c5d63cfc3d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -590,16 +590,9 @@ void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
 		put_page(pages[i]);
 }
 
-static
-void nfs_readdir_free_large_page(void *ptr, struct page **pages,
-		unsigned int npages)
-{
-	nfs_readdir_free_pagearray(pages, npages);
-}
-
 /*
  * nfs_readdir_large_page will allocate pages that must be freed with a call
- * to nfs_readdir_free_large_page
+ * to nfs_readdir_free_pagearray
  */
 static
 int nfs_readdir_large_page(struct page **pages, unsigned int npages)
@@ -623,7 +616,6 @@ static
 int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
 {
 	struct page *pages[NFS_MAX_READDIR_PAGES];
-	void *pages_ptr = NULL;
 	struct nfs_entry entry;
 	struct file	*file = desc->file;
 	struct nfs_cache_array *array;
@@ -671,7 +663,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 		}
 	} while (array->eof_index < 0);
 
-	nfs_readdir_free_large_page(pages_ptr, pages, array_size);
+	nfs_readdir_free_pagearray(pages, array_size);
 out_release_array:
 	nfs_readdir_release_array(page);
 out_label_free:
-- 
cgit v1.2.3


From c7e9668e78eab69b7ade9897bea1fbd77dd18775 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Mon, 13 Jul 2015 14:01:25 -0400
Subject: NFS: Rename nfs_readdir_free_pagearray() and nfs_readdir_large_page()

nfs_readdir_xdr_to_array() uses both a cache array and an array of
pages, so I rename these functions to make it clearer how the code
works.  nfs_readdir_large_page() becomes nfs_readdir_alloc_pages()
because this function has absolutely nothing to do with setting up a
large page.  nfs_readdir_free_pagearray() becomes
nfs_readdir_free_pages() to stay consistent with the new alloc_pages()
function.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/dir.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 26c5d63cfc3d..3d8e4ffa0a33 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -583,7 +583,7 @@ out_nopages:
 }
 
 static
-void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+void nfs_readdir_free_pages(struct page **pages, unsigned int npages)
 {
 	unsigned int i;
 	for (i = 0; i < npages; i++)
@@ -595,7 +595,7 @@ void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
  * to nfs_readdir_free_pagearray
  */
 static
-int nfs_readdir_large_page(struct page **pages, unsigned int npages)
+int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
 {
 	unsigned int i;
 
@@ -608,7 +608,7 @@ int nfs_readdir_large_page(struct page **pages, unsigned int npages)
 	return 0;
 
 out_freepages:
-	nfs_readdir_free_pagearray(pages, i);
+	nfs_readdir_free_pages(pages, i);
 	return -ENOMEM;
 }
 
@@ -645,7 +645,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 	memset(array, 0, sizeof(struct nfs_cache_array));
 	array->eof_index = -1;
 
-	status = nfs_readdir_large_page(pages, array_size);
+	status = nfs_readdir_alloc_pages(pages, array_size);
 	if (status < 0)
 		goto out_release_array;
 	do {
@@ -663,7 +663,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 		}
 	} while (array->eof_index < 0);
 
-	nfs_readdir_free_pagearray(pages, array_size);
+	nfs_readdir_free_pages(pages, array_size);
 out_release_array:
 	nfs_readdir_release_array(page);
 out_label_free:
-- 
cgit v1.2.3


From d8efa4e62505f5113e363572b5438b7be0d08b12 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Mon, 13 Jul 2015 14:01:28 -0400
Subject: NFS: Use RPC functions for matching sockaddrs

They already exist and do the exact same thing.  Let's save ourselves
several lines of code!

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/client.c     | 113 +---------------------------------------------------
 fs/nfs/internal.h   |   4 --
 fs/nfs/nfs4client.c |   5 +--
 3 files changed, 3 insertions(+), 119 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 4a90c9bb3135..57c5a02f6213 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -20,6 +20,7 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/unistd.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/metrics.h>
@@ -285,116 +286,6 @@ void nfs_put_client(struct nfs_client *clp)
 }
 EXPORT_SYMBOL_GPL(nfs_put_client);
 
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-/*
- * Test if two ip6 socket addresses refer to the same socket by
- * comparing relevant fields. The padding bytes specifically, are not
- * compared. sin6_flowinfo is not compared because it only affects QoS
- * and sin6_scope_id is only compared if the address is "link local"
- * because "link local" addresses need only be unique to a specific
- * link. Conversely, ordinary unicast addresses might have different
- * sin6_scope_id.
- *
- * The caller should ensure both socket addresses are AF_INET6.
- */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
-				      const struct sockaddr *sa2)
-{
-	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
-	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
-	if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
-		return 0;
-	else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
-		return sin1->sin6_scope_id == sin2->sin6_scope_id;
-
-	return 1;
-}
-#else	/* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
-				      const struct sockaddr *sa2)
-{
-	return 0;
-}
-#endif
-
-/*
- * Test if two ip4 socket addresses refer to the same socket, by
- * comparing relevant fields. The padding bytes specifically, are
- * not compared.
- *
- * The caller should ensure both socket addresses are AF_INET.
- */
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
-				      const struct sockaddr *sa2)
-{
-	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
-	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
-	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
-}
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
-				const struct sockaddr *sa2)
-{
-	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
-	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
-	return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
-		(sin1->sin6_port == sin2->sin6_port);
-}
-
-static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
-				const struct sockaddr *sa2)
-{
-	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
-	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
-	return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
-		(sin1->sin_port == sin2->sin_port);
-}
-
-#if defined(CONFIG_NFS_V4_1)
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, excluding the port number.
- */
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-			      const struct sockaddr *sa2)
-{
-	if (sa1->sa_family != sa2->sa_family)
-		return 0;
-
-	switch (sa1->sa_family) {
-	case AF_INET:
-		return nfs_sockaddr_match_ipaddr4(sa1, sa2);
-	case AF_INET6:
-		return nfs_sockaddr_match_ipaddr6(sa1, sa2);
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr);
-#endif /* CONFIG_NFS_V4_1 */
-
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, including the port number.
- */
-static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
-			    const struct sockaddr *sa2)
-{
-	if (sa1->sa_family != sa2->sa_family)
-		return 0;
-
-	switch (sa1->sa_family) {
-	case AF_INET:
-		return nfs_sockaddr_cmp_ip4(sa1, sa2);
-	case AF_INET6:
-		return nfs_sockaddr_cmp_ip6(sa1, sa2);
-	}
-	return 0;
-}
-
 /*
  * Find an nfs_client on the list that matches the initialisation data
  * that is supplied.
@@ -421,7 +312,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
 		if (clp->cl_minorversion != data->minorversion)
 			continue;
 		/* Match the full socket address */
-		if (!nfs_sockaddr_cmp(sap, clap))
+		if (!rpc_cmp_addr_port(sap, clap))
 			continue;
 
 		atomic_inc(&clp->cl_count);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 1dad18105ed0..9ab3b1c21bb4 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -219,10 +219,6 @@ static inline void nfs_fs_proc_exit(void)
 }
 #endif
 
-#ifdef CONFIG_NFS_V4_1
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
-#endif
-
 /* callback_xdr.c */
 extern struct svc_version nfs4_callback_version1;
 extern struct svc_version nfs4_callback_version4;
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 3aa6a9ba5113..223bedda64ae 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -729,10 +729,7 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
 		return false;
 
 	/* Match only the IP address, not the port number */
-	if (!nfs_sockaddr_match_ipaddr(addr, clap))
-		return false;
-
-	return true;
+	return rpc_cmp_addr(addr, clap);
 }
 
 /*
-- 
cgit v1.2.3


From fb2a525cf086bb080f84b0b748ee823ef79e539c Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Mon, 13 Jul 2015 14:01:29 -0400
Subject: NFS: Combine nfs_idmap_{init|quit}() and
 nfs_idmap_{init|quit}_keyring()

The idmap_init() and idmap_quit() functions only exist to call the
_keyring() version.  Let's just call the keyring() functions directly.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4idmap.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 535dfc69c628..2e4902203c35 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -184,7 +184,7 @@ static struct key_type key_type_id_resolver = {
 	.read		= user_read,
 };
 
-static int nfs_idmap_init_keyring(void)
+int nfs_idmap_init(void)
 {
 	struct cred *cred;
 	struct key *keyring;
@@ -230,7 +230,7 @@ failed_put_cred:
 	return ret;
 }
 
-static void nfs_idmap_quit_keyring(void)
+void nfs_idmap_quit(void)
 {
 	key_revoke(id_resolver_cache->thread_keyring);
 	unregister_key_type(&key_type_id_resolver);
@@ -492,16 +492,6 @@ nfs_idmap_delete(struct nfs_client *clp)
 	kfree(idmap);
 }
 
-int nfs_idmap_init(void)
-{
-	return nfs_idmap_init_keyring();
-}
-
-void nfs_idmap_quit(void)
-{
-	nfs_idmap_quit_keyring();
-}
-
 static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
 				     struct idmap_msg *im,
 				     struct rpc_pipe_msg *msg)
-- 
cgit v1.2.3


From 3f10a6af4b577da2117907ac8420f27af81d57de Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Mon, 13 Jul 2015 14:01:31 -0400
Subject: NFS: Remove nfs41_server_notify_{target|highest}_slotid_update()

All these functions do is call nfs41_ping_server() without adding
anything.  Let's remove them and give nfs41_ping_server() a better name
instead.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_proc.c |  2 +-
 fs/nfs/nfs4_fs.h       |  4 +---
 fs/nfs/nfs4proc.c      |  2 +-
 fs/nfs/nfs4state.c     | 12 +-----------
 4 files changed, 4 insertions(+), 16 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 29e3c1b011b7..624bef79ba7c 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -554,7 +554,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
 	status = htonl(NFS4_OK);
 
 	nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
-	nfs41_server_notify_target_slotid_update(cps->clp);
+	nfs41_notify_server(cps->clp);
 out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
 	return status;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ea3bee919a76..50cfc4ca7a02 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -405,9 +405,7 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,
 int nfs41_discover_server_trunking(struct nfs_client *clp,
 			struct nfs_client **, struct rpc_cred *);
 extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
-extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp);
-extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp);
-
+extern void nfs41_notify_server(struct nfs_client *);
 #else
 static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
 {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 43bace809be7..f4e5816a77b0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -586,7 +586,7 @@ out_unlock:
 	spin_unlock(&tbl->slot_tbl_lock);
 	res->sr_slot = NULL;
 	if (send_new_highest_used_slotid)
-		nfs41_server_notify_highest_slotid_update(session->clp);
+		nfs41_notify_server(session->clp);
 }
 
 int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f2e2ad894461..da73bc443238 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2152,23 +2152,13 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
 
-static void nfs41_ping_server(struct nfs_client *clp)
+void nfs41_notify_server(struct nfs_client *clp)
 {
 	/* Use CHECK_LEASE to ping the server with a SEQUENCE */
 	set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
 	nfs4_schedule_state_manager(clp);
 }
 
-void nfs41_server_notify_target_slotid_update(struct nfs_client *clp)
-{
-	nfs41_ping_server(clp);
-}
-
-void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp)
-{
-	nfs41_ping_server(clp);
-}
-
 static void nfs4_reset_all_state(struct nfs_client *clp)
 {
 	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
-- 
cgit v1.2.3


From ae09c31f66cc673bb8c64a5dbfdc04ab67f66d7e Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Mon, 13 Jul 2015 14:01:32 -0400
Subject: NFS: Rename nfs_commit_unstable_pages() to nfs_write_inode()

All nfs_write_inode() does is pass its arguments to
nfs_commit_unstable_pages().  Let's cut out the middle man and have
nfs_write_inode() do the work directly.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index fdee9270ca15..388f48079c43 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1812,7 +1812,7 @@ out_mark_dirty:
 	return res;
 }
 
-static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int flags = FLUSH_SYNC;
@@ -1847,11 +1847,6 @@ out_mark_dirty:
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return ret;
 }
-
-int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
-	return nfs_commit_unstable_pages(inode, wbc);
-}
 EXPORT_SYMBOL_GPL(nfs_write_inode);
 
 /*
-- 
cgit v1.2.3


From aff8d8dc4c34804c6a1de04f1b3313aa9063bf46 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Mon, 13 Jul 2015 14:01:33 -0400
Subject: NFS: Remove nfs_release()

And call nfs_file_clear_open_context() directly.  This makes it obvious
that nfs_file_release() will always return 0.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/file.c          | 3 ++-
 fs/nfs/inode.c         | 8 +-------
 include/linux/nfs_fs.h | 2 +-
 3 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cc4fa1ed61fc..7538a8582384 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -82,7 +82,8 @@ nfs_file_release(struct inode *inode, struct file *filp)
 	dprintk("NFS: release(%pD2)\n", filp);
 
 	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
-	return nfs_release(inode, filp);
+	nfs_file_clear_open_context(filp);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs_file_release);
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0adc7d245b3d..382c8a46c078 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -888,7 +888,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
 	return ctx;
 }
 
-static void nfs_file_clear_open_context(struct file *filp)
+void nfs_file_clear_open_context(struct file *filp)
 {
 	struct nfs_open_context *ctx = nfs_file_open_context(filp);
 
@@ -919,12 +919,6 @@ int nfs_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-int nfs_release(struct inode *inode, struct file *filp)
-{
-	nfs_file_clear_open_context(filp);
-	return 0;
-}
-
 /*
  * This function is called whenever some part of NFS notices that
  * the cached attributes have to be refreshed.
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 874b77228fb9..c0e961474a52 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -353,7 +353,6 @@ extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
 extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
 extern int nfs_permission(struct inode *, int);
 extern int nfs_open(struct inode *, struct file *);
-extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
 extern int nfs_attribute_cache_expired(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
@@ -371,6 +370,7 @@ extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struc
 extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode);
 extern void nfs_inode_attach_open_context(struct nfs_open_context *ctx);
 extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx);
+extern void nfs_file_clear_open_context(struct file *flip);
 extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx);
 extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx);
 extern u64 nfs_compat_user_ino64(u64 fileid);
-- 
cgit v1.2.3


From 7c2dad99d60c86ec686b3bfdcb787c450a7ea89f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 6 Aug 2015 12:06:30 -0400
Subject: NFS: Don't let the ctime override attribute barriers.

Chuck reports seeing cases where a GETATTR that happens to race
with an asynchronous WRITE is overriding the file size, despite
the attribute barrier being set by the writeback code.

The culprit turns out to be the check in nfs_ctime_need_update(),
which sees that the ctime is newer than the cached ctime, and
assumes that it is safe to override the attribute barrier.
This patch removes that override, and ensures that attribute
barriers are always respected.

Reported-by: Chuck Lever <chuck.lever@oracle.com>
Fixes: a08a8cd375db9 ("NFS: Add attribute update barriers to NFS writebacks")
Cc: stable@vger.kernel.org # v4.0+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/inode.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 382c8a46c078..2744d48bbbfe 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1267,13 +1267,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	return 0;
 }
 
-static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
-{
-	if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
-		return 0;
-	return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
-}
-
 static atomic_long_t nfs_attr_generation_counter;
 
 static unsigned long nfs_read_attr_generation_counter(void)
@@ -1422,7 +1415,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
 	const struct nfs_inode *nfsi = NFS_I(inode);
 
 	return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
-		nfs_ctime_need_update(inode, fattr) ||
 		((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
 }
 
-- 
cgit v1.2.3


From 7e94d6c4ab69562423fdc18801050a84462c2787 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Aug 2015 16:55:18 -0500
Subject: NFS: Don't fsync twice for O_SYNC/IS_SYNC files

generic_file_write_iter() will already do an fsync on our behalf
if the file descriptor is O_SYNC or the file is marked as IS_SYNC.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/file.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7538a8582384..526a2681d975 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -645,12 +645,10 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
 	.page_mkwrite = nfs_vm_page_mkwrite,
 };
 
-static int nfs_need_sync_write(struct file *filp, struct inode *inode)
+static int nfs_need_check_write(struct file *filp, struct inode *inode)
 {
 	struct nfs_open_context *ctx;
 
-	if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
-		return 1;
 	ctx = nfs_file_open_context(filp);
 	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
 	    nfs_ctx_key_to_expire(ctx))
@@ -700,8 +698,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 	if (result > 0)
 		written = result;
 
-	/* Return error values for O_DSYNC and IS_SYNC() */
-	if (result >= 0 && nfs_need_sync_write(file, inode)) {
+	/* Return error values */
+	if (result >= 0 && nfs_need_check_write(file, inode)) {
 		int err = vfs_fsync(file, 0);
 		if (err < 0)
 			result = err;
-- 
cgit v1.2.3


From 4ff376feaf57af94e08c8df769e7c48b805ac897 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 18 Aug 2015 23:23:21 -0500
Subject: NFSv4.1/pnfs: Fix a close/delegreturn hang when return-on-close is
 set

The helper pnfs_roc() has already verified that we have no delegations,
and no further open files, hence no outstanding I/O and it has marked
all the return-on-close lsegs as being invalid.
Furthermore, it sets the NFS_LAYOUT_RETURN bit, thus serialising the
close/delegreturn with all future layoutget calls on this inode.

The checks in pnfs_roc_drain() for valid layout segments are therefore
redundant: those cannot exist until another layoutget completes.
The other check for whether or not NFS_LAYOUT_RETURN is set, actually
causes a hang, since we already know that we hold that flag.

To fix, we therefore strip out all the functionality in pnfs_roc_drain()
except the retrieval of the barrier state, and then rename the function
accordingly.

Reported-by: Christoph Hellwig <hch@infradead.org>
Fixes: 5c4a79fb2b1c ("Don't prevent layoutgets when doing return-on-close")
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 12 ++++--------
 fs/nfs/pnfs.c     | 24 +-----------------------
 fs/nfs/pnfs.h     |  7 +++----
 3 files changed, 8 insertions(+), 35 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f4e5816a77b0..bda7837dfe6b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2737,11 +2737,8 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 
 	if (calldata->arg.fmode == 0) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
-		if (calldata->roc &&
-		    pnfs_roc_drain(inode, &calldata->roc_barrier, task)) {
-			nfs_release_seqid(calldata->arg.seqid);
-			goto out_wait;
-		    }
+		if (calldata->roc)
+			pnfs_roc_get_barrier(inode, &calldata->roc_barrier);
 	}
 	calldata->arg.share_access =
 		nfs4_map_atomic_open_share(NFS_SERVER(inode),
@@ -5289,9 +5286,8 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
 
 	d_data = (struct nfs4_delegreturndata *)data;
 
-	if (d_data->roc &&
-	    pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
-		return;
+	if (d_data->roc)
+		pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier);
 
 	nfs4_setup_sequence(d_data->res.server,
 			&d_data->args.seq_args,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6151f39c8291..6aabbb654021 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1141,25 +1141,14 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 	spin_unlock(&ino->i_lock);
 }
 
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
 {
 	struct nfs_inode *nfsi = NFS_I(ino);
 	struct pnfs_layout_hdr *lo;
-	struct pnfs_layout_segment *lseg;
 	nfs4_stateid stateid;
 	u32 current_seqid;
-	bool layoutreturn = false;
 
 	spin_lock(&ino->i_lock);
-	list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) {
-		if (!test_bit(NFS_LSEG_ROC, &lseg->pls_flags))
-			continue;
-		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
-			continue;
-		rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
-		spin_unlock(&ino->i_lock);
-		return true;
-	}
 	lo = nfsi->layout;
 	current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
 
@@ -1168,18 +1157,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
 	 */
 	*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
 	stateid = lo->plh_stateid;
-	if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
-					   &lo->plh_flags))
-		layoutreturn = pnfs_prepare_layoutreturn(lo);
-	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
-		rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
-
 	spin_unlock(&ino->i_lock);
-	if (layoutreturn) {
-		pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
-		return true;
-	}
-	return false;
 }
 
 /*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 738672a0f8da..a3d57a8fac76 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -267,7 +267,7 @@ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
+void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier);
 void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
@@ -605,10 +605,9 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 {
 }
 
-static inline bool
-pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+static inline void
+pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
 {
-	return false;
 }
 
 static inline void set_pnfs_layoutdriver(struct nfs_server *s,
-- 
cgit v1.2.3


From 3c13cb5b647ebe36fb79128bc8b917d2a3317b65 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 18 Aug 2015 23:45:13 -0500
Subject: NFSv4.1/pnfs: Play safe w.r.t. close() races when return-on-close is
 set

If we have an OPEN_DOWNGRADE and CLOSE race with one another, we want
to ensure that the layout is forgotten by the client, so that we
start afresh with a new layoutget.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index bda7837dfe6b..9e9f7816cf24 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2661,7 +2661,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 	switch (task->tk_status) {
 		case 0:
 			res_stateid = &calldata->res.stateid;
-			if (calldata->arg.fmode == 0 && calldata->roc)
+			if (calldata->roc)
 				pnfs_roc_set_barrier(state->inode,
 						     calldata->roc_barrier);
 			renew_lease(server, calldata->timestamp);
@@ -2735,11 +2735,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 		goto out_no_action;
 	}
 
-	if (calldata->arg.fmode == 0) {
+	if (calldata->arg.fmode == 0)
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
-		if (calldata->roc)
-			pnfs_roc_get_barrier(inode, &calldata->roc_barrier);
-	}
+	if (calldata->roc)
+		pnfs_roc_get_barrier(inode, &calldata->roc_barrier);
+
 	calldata->arg.share_access =
 		nfs4_map_atomic_open_share(NFS_SERVER(inode),
 				calldata->arg.fmode, 0);
-- 
cgit v1.2.3


From 36319608e28701c07cad80ae3be8b0fdfb1ab40f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 19 Aug 2015 00:14:20 -0500
Subject: Revert "NFSv4: Remove incorrect check in can_open_delegated()"

This reverts commit 4e379d36c050b0117b5d10048be63a44f5036115.

This commit opens up a race between the recovery code and the open code.

Reported-by: Olga Kornievskaia <aglo@umich.edu>
Cc: stable@vger.kernel.org # v4.0+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9e9f7816cf24..95c5e8d39bef 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1156,6 +1156,8 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
 		return 0;
 	if ((delegation->type & fmode) != fmode)
 		return 0;
+	if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
+		return 0;
 	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
 		return 0;
 	nfs_mark_delegation_referenced(delegation);
-- 
cgit v1.2.3


From e755d638e91be254d441602e8d7d9f1d9c944556 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Wed, 19 Aug 2015 13:49:19 +0800
Subject: NFS41: make sure sending LAYOUTRETURN before close if marked so

If layout is marked by NFS_LAYOUT_RETURN_BEFORE_CLOSE, we should always
send LAYOUTRETURN before close, and we don't need to do ROC drain if we
do send LAYOUTRETURN.

Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 51 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6aabbb654021..e101a491e4e7 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1064,7 +1064,7 @@ bool pnfs_roc(struct inode *ino)
 	struct pnfs_layout_segment *lseg, *tmp;
 	nfs4_stateid stateid;
 	LIST_HEAD(tmp_list);
-	bool found = false, layoutreturn = false;
+	bool found = false, layoutreturn = false, roc = false;
 
 	spin_lock(&ino->i_lock);
 	lo = nfsi->layout;
@@ -1072,7 +1072,7 @@ bool pnfs_roc(struct inode *ino)
 	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
 		goto out_noroc;
 
-	/* Don't return layout if we hold a delegation */
+	/* no roc if we hold a delegation */
 	if (nfs4_check_delegation(ino, FMODE_READ))
 		goto out_noroc;
 
@@ -1083,36 +1083,41 @@ bool pnfs_roc(struct inode *ino)
 			goto out_noroc;
 	}
 
+	stateid = lo->plh_stateid;
+	/* always send layoutreturn if being marked so */
+	if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+				   &lo->plh_flags))
+		layoutreturn = pnfs_prepare_layoutreturn(lo);
+
 	pnfs_clear_retry_layoutget(lo);
 	list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
-		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+		/* If we are sending layoutreturn, invalidate all valid lsegs */
+		if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
 			mark_lseg_invalid(lseg, &tmp_list);
 			found = true;
 		}
-	if (!found)
-		goto out_noroc;
-	if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
-		goto out_noroc;
-	lo->plh_return_iomode = IOMODE_ANY;
-	pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
-	spin_unlock(&ino->i_lock);
-	pnfs_free_lseg_list(&tmp_list);
-	pnfs_layoutcommit_inode(ino, true);
-	return true;
+	/* pnfs_prepare_layoutreturn() grabs lo ref and it will be put
+	 * in pnfs_roc_release(). We don't really send a layoutreturn but
+	 * still want others to view us like we are sending one!
+	 *
+	 * If pnfs_prepare_layoutreturn() fails, it means someone else is doing
+	 * LAYOUTRETURN, so we proceed like there are no layouts to return.
+	 *
+	 * ROC in three conditions:
+	 * 1. there are ROC lsegs
+	 * 2. we don't send layoutreturn
+	 * 3. no others are sending layoutreturn
+	 */
+	if (found && !layoutreturn && pnfs_prepare_layoutreturn(lo))
+		roc = true;
 
 out_noroc:
-	if (lo) {
-		stateid = lo->plh_stateid;
-		if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
-					   &lo->plh_flags))
-			layoutreturn = pnfs_prepare_layoutreturn(lo);
-	}
 	spin_unlock(&ino->i_lock);
-	if (layoutreturn) {
-		pnfs_layoutcommit_inode(ino, true);
+	pnfs_free_lseg_list(&tmp_list);
+	pnfs_layoutcommit_inode(ino, true);
+	if (layoutreturn)
 		pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
-	}
-	return false;
+	return roc;
 }
 
 void pnfs_roc_release(struct inode *ino)
-- 
cgit v1.2.3


From 69f230d907e8c1ca3f9bd528993eeb98f712b0dd Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Thu, 20 Aug 2015 01:52:59 +0800
Subject: NFS41/flexfiles: update inode after write finishes

Otherwise we break fstest case tests/read_write/mctime.t

Does the files layout need the same fix as well?

Cc: stable@vger.kernel.org # v4.0+
Cc: Anna Schumaker <anna.schumaker@netapp.com>
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 13fe64b4e259..4bd3cff94190 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1199,6 +1199,9 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
 	    hdr->res.verf->committed == NFS_DATA_SYNC)
 		ff_layout_set_layoutcommit(hdr);
 
+	if (task->tk_status >= 0)
+		nfs_writeback_update_inode(hdr);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From c740624989eb87fa7cbd1b5338cef01dd49f1f29 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 19 Aug 2015 23:00:50 -0500
Subject: pNFS: Fix an unused variable warning in pnfs_roc_get_barrier

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e101a491e4e7..8a3f30b695e2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1150,7 +1150,6 @@ void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
 {
 	struct nfs_inode *nfsi = NFS_I(ino);
 	struct pnfs_layout_hdr *lo;
-	nfs4_stateid stateid;
 	u32 current_seqid;
 
 	spin_lock(&ino->i_lock);
@@ -1161,7 +1160,6 @@ void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
 	 * a barrier, we choose the worst-case barrier.
 	 */
 	*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
-	stateid = lo->plh_stateid;
 	spin_unlock(&ino->i_lock);
 }
 
-- 
cgit v1.2.3


From 2a606188c55990fa65cba3fd9b64f2b7542b7692 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 19 Aug 2015 22:30:00 -0500
Subject: NFSv4: Enable delegated opens even when reboot recovery is pending

Unlike the previous attempt, this takes into account the fact that
we may be calling it from the recovery thread itself. Detect this
by looking at what kind of open we're doing, and checking the state
of the NFS_DELEGATION_NEED_RECLAIM if it turns out we're doing a
reboot reclaim-type open.

Cc: Olga Kornievskaia <aglo@umich.edu>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c       | 27 +++++++++++++++++++--------
 include/linux/nfs_xdr.h |  2 +-
 2 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 95c5e8d39bef..6e988fd92f69 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1150,16 +1150,25 @@ out:
 	return ret;
 }
 
-static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
+		enum open_claim_type4 claim)
 {
 	if (delegation == NULL)
 		return 0;
 	if ((delegation->type & fmode) != fmode)
 		return 0;
-	if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
-		return 0;
 	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
 		return 0;
+	switch (claim) {
+	case NFS4_OPEN_CLAIM_NULL:
+	case NFS4_OPEN_CLAIM_FH:
+		break;
+	case NFS4_OPEN_CLAIM_PREVIOUS:
+		if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
+			break;
+	default:
+		return 0;
+	}
 	nfs_mark_delegation_referenced(delegation);
 	return 1;
 }
@@ -1378,6 +1387,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 	struct nfs_delegation *delegation;
 	int open_mode = opendata->o_arg.open_flags;
 	fmode_t fmode = opendata->o_arg.fmode;
+	enum open_claim_type4 claim = opendata->o_arg.claim;
 	nfs4_stateid stateid;
 	int ret = -EAGAIN;
 
@@ -1391,7 +1401,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 		spin_unlock(&state->owner->so_lock);
 		rcu_read_lock();
 		delegation = rcu_dereference(nfsi->delegation);
-		if (!can_open_delegated(delegation, fmode)) {
+		if (!can_open_delegated(delegation, fmode, claim)) {
 			rcu_read_unlock();
 			break;
 		}
@@ -1854,6 +1864,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	struct nfs4_opendata *data = calldata;
 	struct nfs4_state_owner *sp = data->owner;
 	struct nfs_client *clp = sp->so_server->nfs_client;
+	enum open_claim_type4 claim = data->o_arg.claim;
 
 	if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
 		goto out_wait;
@@ -1868,15 +1879,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 			goto out_no_action;
 		rcu_read_lock();
 		delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
-		if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR &&
-		    data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH &&
-		    can_open_delegated(delegation, data->o_arg.fmode))
+		if (can_open_delegated(delegation, data->o_arg.fmode, claim))
 			goto unlock_no_action;
 		rcu_read_unlock();
 	}
 	/* Update client id. */
 	data->o_arg.clientid = clp->cl_clientid;
-	switch (data->o_arg.claim) {
+	switch (claim) {
+	default:
+		break;
 	case NFS4_OPEN_CLAIM_PREVIOUS:
 	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
 	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7bbe50504211..b9b530409ff7 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -389,7 +389,7 @@ struct nfs_openargs {
 	const struct nfs_server *server;	 /* Needed for ID mapping */
 	const u32 *		bitmask;
 	const u32 *		open_bitmap;
-	__u32			claim;
+	enum open_claim_type4	claim;
 	enum createmode4	createmode;
 	const struct nfs4_label *label;
 };
-- 
cgit v1.2.3


From 046be74da8f257c4f1925ed4b5d4ee4c822ef9c6 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Fri, 21 Aug 2015 10:32:50 +0800
Subject: NFS41: fix list splice type

We want to move the committing pages to the pages list instead.
Otherwise it causes small pNFS writes to crash like this:

[34560.037692] BUG: unable to handle kernel NULL pointer dereference at 0000000000000068
[34560.038557] IP: [<ffffffffa05423d6>] nfs_init_commit+0x26/0x130 [nfs]
[34560.039400] PGD 69f5a067 PUD 69f59067 PMD 0
[34560.040207] Oops: 0000 [#1] SMP
[34560.041014] Modules linked in: nfsv3(OE) nfs_layout_flexfiles(OE) nfsv4(OE) nfs(OE) fscache(E) rpcsec_gss_krb5(E) xt_addrtype(E) xt_conntrack(E) ipt_MASQUERADE(E) nf_nat_masquerade_ipv4(E) iptable_nat(E) nf_conntrack_ipv4(E) nf_defrag_ipv4(E) nf_nat_ipv4(E) iptable_filter(E) ip_tables(E) x_tables(E) nf_nat(E) nf_conntrack(E) bridge(E) stp(E) llc(E) dm_thin_pool(E) dm_persistent_data(E) dm_bio_prison(E) dm_bufio(E) ppdev(E) vmw_balloon(E) coretemp(E) crc32_pclmul(E) ghash_clmulni_intel(E) aesni_intel(E) aes_x86_64(E) glue_helper(E) lrw(E) gf128mul(E) ablk_helper(E) cryptd(E) psmouse(E) serio_raw(E) vmw_vmci(E) i2c_piix4(E) shpchp(E) parport_pc(E) parport(E) mac_hid(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E) sunrpc(E) xfs(E) libcrc32c(E) hid_generic(E) usbhid(E) hid(E) e1000(E) mptspi(E)
[34560.045106]  mptscsih(E) mptbase(E) vmwgfx(E) drm_kms_helper(E) ttm(E) drm(E) autofs4(E) [last unloaded: fscache]
[34560.045897] CPU: 0 PID: 130543 Comm: bash Tainted: G           OE   4.2.0-rc5-dp-00057-gf993a93 #11
[34560.046699] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/20/2014
[34560.047525] task: ffff880031b0a980 ti: ffff880045fec000 task.ti: ffff880045fec000
[34560.048264] RIP: 0010:[<ffffffffa05423d6>]  [<ffffffffa05423d6>] nfs_init_commit+0x26/0x130 [nfs]
[34560.049000] RSP: 0018:ffff880045fefc18  EFLAGS: 00010246
[34560.049717] RAX: 0000000000000000 RBX: ffff8800208fbc80 RCX: ffff880045fefd50
[34560.050396] RDX: ffff880031c19ec0 RSI: ffff880045fefc88 RDI: ffff8800208fbc80
[34560.051041] RBP: ffff880045fefc28 R08: ffff8800208fbe68 R09: ffff880045fefc88
[34560.051666] R10: 0000000000000000 R11: 0000000000000000 R12: ffff880045fefc78
[34560.052247] R13: ffff880045fefc88 R14: ffff880045fefa90 R15: ffff880045fefd50
[34560.052825] FS:  00007fa02d58c740(0000) GS:ffff88006d600000(0000) knlGS:0000000000000000
[34560.053410] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[34560.053992] CR2: 0000000000000068 CR3: 000000003b37a000 CR4: 00000000001406f0
[34560.054615] Stack:
[34560.055200]  ffff8800208fbc80 ffff8800208fbc80 ffff880045fefcc8 ffffffffa05c1a5b
[34560.055800]  ffff880045fefcc8 ffff880045fefd50 0000000045fefcb8 ffff880045fefd40
[34560.056418]  ffff8800420608e0 ffffffffa04f3910 0000000100000001 ffff880045fefd50
[34560.057013] Call Trace:
[34560.057672]  [<ffffffffa05c1a5b>] pnfs_generic_commit_pagelist+0x1cb/0x300 [nfsv4]
[34560.058277]  [<ffffffffa04f3910>] ? ff_layout_commit_pagelist+0x20/0x20 [nfs_layout_flexfiles]
[34560.058907]  [<ffffffffa04f3905>] ff_layout_commit_pagelist+0x15/0x20 [nfs_layout_flexfiles]
[34560.059557]  [<ffffffffa0543fc1>] nfs_generic_commit_list+0xb1/0xf0 [nfs]
[34560.060214]  [<ffffffffa0543e47>] ? nfs_scan_commit+0x37/0xa0 [nfs]
[34560.060825]  [<ffffffffa0544081>] nfs_commit_inode+0x81/0x150 [nfs]
[34560.061432]  [<ffffffffa05443ae>] nfs_wb_all+0x1ae/0x400 [nfs]
[34560.062035]  [<ffffffffa05380ad>] nfs_getattr+0x33d/0x510 [nfs]
[34560.062630]  [<ffffffff8122499c>] vfs_getattr_nosec+0x2c/0x40
[34560.063223]  [<ffffffff81224a66>] vfs_getattr+0x26/0x30
[34560.063818]  [<ffffffff81224b35>] vfs_fstatat+0x65/0xa0
[34560.064413]  [<ffffffff81224f3f>] SYSC_newstat+0x1f/0x40
[34560.065016]  [<ffffffff8102b176>] ? do_audit_syscall_entry+0x66/0x70
[34560.065626]  [<ffffffff8102c773>] ? syscall_trace_enter_phase1+0x113/0x170
[34560.066245]  [<ffffffff81003017>] ? trace_hardirqs_on_thunk+0x17/0x19
[34560.066868]  [<ffffffff812251ae>] SyS_newstat+0xe/0x10
[34560.067533]  [<ffffffff817a5df2>] entry_SYSCALL_64_fastpath+0x16/0x7a
[34560.068173] Code: 0f 1f 44 00 00 0f 1f 44 00 00 55 4c 8d 87 e8 01 00 00 48 89 e5 53 48 89 fb 48 83 ec 08 4c 8b 0e 49 8b 41 18 4c 39 ce 48 8b 40 40 <4c> 8b 50 68 74 24 48 8b 87 e8 01 00 00 48 8b 7e 08 4d 89 41 08
[34560.069609] RIP  [<ffffffffa05423d6>] nfs_init_commit+0x26/0x130 [nfs]
[34560.070295]  RSP <ffff880045fefc18>
[34560.071008] CR2: 0000000000000068
[34560.073207] ---[ end trace f85f873260977406 ]---

[fixes 27571297a7e(pNFS: Tighten up locking around DS commit buckets)]
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs_nfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index bbd407b6bc1f..24655b807d44 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -239,7 +239,7 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages,
 
 	bucket = &cinfo->ds->buckets[data->ds_commit_index];
 	spin_lock(cinfo->lock);
-	list_splice_init(pages, &bucket->committing);
+	list_splice_init(&bucket->committing, pages);
 	data->lseg = bucket->clseg;
 	bucket->clseg = NULL;
 	spin_unlock(cinfo->lock);
-- 
cgit v1.2.3


From e76d28dd9ca84fa96dc0054bab34ccec34a7662a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 20 Aug 2015 13:12:51 -0500
Subject: NFSv4.1/pnfs: Ensure the flexfiles layoutstats timers are consistent

We want to ensure that the stopwatches for the busy timer and the
aggregate timer are consistent. This means that they need to use
the same start/stop times.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 51 ++++++++++++++++------------------
 1 file changed, 24 insertions(+), 27 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 4bd3cff94190..0fcb8670f25f 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -419,42 +419,35 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
 }
 
 static void
-nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer)
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
 {
 	/* first IO request? */
 	if (atomic_inc_return(&timer->n_ops) == 1) {
-		timer->start_time = ktime_get();
+		timer->start_time = now;
 	}
 }
 
 static ktime_t
-nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer)
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
 {
-	ktime_t start, now;
+	ktime_t start;
 
 	if (atomic_dec_return(&timer->n_ops) < 0)
 		WARN_ON_ONCE(1);
 
-	now = ktime_get();
 	start = timer->start_time;
 	timer->start_time = now;
 	return ktime_sub(now, start);
 }
 
-static ktime_t
-nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
-{
-	return ktime_sub(ktime_get(), task->tk_start);
-}
-
 static bool
 nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
-			    struct nfs4_ff_layoutstat *layoutstat)
+			    struct nfs4_ff_layoutstat *layoutstat,
+			    ktime_t now)
 {
 	static const ktime_t notime = {0};
-	ktime_t now = ktime_get();
 
-	nfs4_ff_start_busy_timer(&layoutstat->busy_timer);
+	nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
 	if (ktime_equal(mirror->start_time, notime))
 		mirror->start_time = now;
 	if (ktime_equal(mirror->last_report_time, notime))
@@ -482,30 +475,33 @@ static void
 nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
 		__u64 requested,
 		__u64 completed,
-		ktime_t time_completed)
+		ktime_t time_completed,
+		ktime_t time_started)
 {
 	struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+	ktime_t completion_time = ktime_sub(time_completed, time_started);
 	ktime_t timer;
 
 	iostat->ops_completed++;
 	iostat->bytes_completed += completed;
 	iostat->bytes_not_delivered += requested - completed;
 
-	timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer);
+	timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
 	iostat->total_busy_time =
 			ktime_add(iostat->total_busy_time, timer);
 	iostat->aggregate_completion_time =
-			ktime_add(iostat->aggregate_completion_time, time_completed);
+			ktime_add(iostat->aggregate_completion_time,
+					completion_time);
 }
 
 static void
 nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
-		__u64 requested)
+		__u64 requested, ktime_t now)
 {
 	bool report;
 
 	spin_lock(&mirror->lock);
-	report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat);
+	report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
 	nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
 	spin_unlock(&mirror->lock);
 
@@ -523,18 +519,18 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
 	spin_lock(&mirror->lock);
 	nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
 			requested, completed,
-			nfs4_ff_layout_calc_completion_time(task));
+			ktime_get(), task->tk_start);
 	spin_unlock(&mirror->lock);
 }
 
 static void
 nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
-		__u64 requested)
+		__u64 requested, ktime_t now)
 {
 	bool report;
 
 	spin_lock(&mirror->lock);
-	report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat);
+	report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
 	nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
 	spin_unlock(&mirror->lock);
 
@@ -555,8 +551,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
 
 	spin_lock(&mirror->lock);
 	nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
-			requested, completed,
-			nfs4_ff_layout_calc_completion_time(task));
+			requested, completed, ktime_get(), task->tk_start);
 	spin_unlock(&mirror->lock);
 }
 
@@ -1063,7 +1058,8 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
 {
 	nfs4_ff_layout_stat_io_start_read(
 			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
-			hdr->args.count);
+			hdr->args.count,
+			task->tk_start);
 
 	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
@@ -1249,7 +1245,8 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
 {
 	nfs4_ff_layout_stat_io_start_write(
 			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
-			hdr->args.count);
+			hdr->args.count,
+			task->tk_start);
 
 	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
@@ -1330,7 +1327,7 @@ static void ff_layout_commit_prepare_common(struct rpc_task *task,
 {
 	nfs4_ff_layout_stat_io_start_write(
 			FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
-			0);
+			0, task->tk_start);
 }
 
 static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
-- 
cgit v1.2.3


From dd52128afddecade1dcd0767aec124c13c8a40bb Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 20 Aug 2015 17:59:49 -0500
Subject: NFSv4.1/pnfs Ensure flexfiles reports all connection related errors

Make sure that we also handle RPC level connection and protocol
negotiation errors.

Reported-by: Tom Haynes <loghyr@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 48 +++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 13 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 0fcb8670f25f..7e4cbdfb3713 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -967,11 +967,36 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
 
 static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
 					int idx, u64 offset, u64 length,
-					u32 status, int opnum)
+					u32 status, int opnum, int error)
 {
 	struct nfs4_ff_layout_mirror *mirror;
 	int err;
 
+	if (status == 0) {
+		switch (error) {
+		case -ETIMEDOUT:
+		case -EPFNOSUPPORT:
+		case -EPROTONOSUPPORT:
+		case -EOPNOTSUPP:
+		case -ECONNREFUSED:
+		case -ECONNRESET:
+		case -EHOSTDOWN:
+		case -EHOSTUNREACH:
+		case -ENETUNREACH:
+		case -EADDRINUSE:
+		case -ENOBUFS:
+		case -EPIPE:
+		case -EPERM:
+			status = NFS4ERR_NXIO;
+			break;
+		case -EACCES:
+			status = NFS4ERR_ACCESS;
+			break;
+		default:
+			return;
+		}
+	}
+
 	mirror = FF_LAYOUT_COMP(lseg, idx);
 	err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
 				       mirror, offset, length, status, opnum,
@@ -988,12 +1013,11 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 	int err;
 
 	trace_nfs4_pnfs_read(hdr, task->tk_status);
-	if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
-		hdr->res.op_status = NFS4ERR_NXIO;
-	if (task->tk_status < 0 && hdr->res.op_status)
+	if (task->tk_status < 0)
 		ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
 					    hdr->args.offset, hdr->args.count,
-					    hdr->res.op_status, OP_READ);
+					    hdr->res.op_status, OP_READ,
+					    task->tk_status);
 	err = ff_layout_async_handle_error(task, hdr->args.context->state,
 					   hdr->ds_clp, hdr->lseg,
 					   hdr->pgio_mirror_idx);
@@ -1163,12 +1187,11 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
 	int err;
 
 	trace_nfs4_pnfs_write(hdr, task->tk_status);
-	if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
-		hdr->res.op_status = NFS4ERR_NXIO;
-	if (task->tk_status < 0 && hdr->res.op_status)
+	if (task->tk_status < 0)
 		ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
 					    hdr->args.offset, hdr->args.count,
-					    hdr->res.op_status, OP_WRITE);
+					    hdr->res.op_status, OP_WRITE,
+					    task->tk_status);
 	err = ff_layout_async_handle_error(task, hdr->args.context->state,
 					   hdr->ds_clp, hdr->lseg,
 					   hdr->pgio_mirror_idx);
@@ -1208,12 +1231,11 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 	int err;
 
 	trace_nfs4_pnfs_commit_ds(data, task->tk_status);
-	if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
-		data->res.op_status = NFS4ERR_NXIO;
-	if (task->tk_status < 0 && data->res.op_status)
+	if (task->tk_status < 0)
 		ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
 					    data->args.offset, data->args.count,
-					    data->res.op_status, OP_COMMIT);
+					    data->res.op_status, OP_COMMIT,
+					    task->tk_status);
 	err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
 					   data->lseg, data->ds_commit_index);
 
-- 
cgit v1.2.3


From aaae3f00d3f67f681a1f3cb7af999e976e8a24ce Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 20 Aug 2015 18:56:07 -0500
Subject: NFSv4: Force a post-op attribute update when holding a delegation

If the ctime or mtime or change attribute have changed because
of an operation we initiated, we should make sure that we force
an attribute update. However we do not want to mark the page cache
for revalidation.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: stable@vger.kernel.org # v4.0+
---
 fs/nfs/inode.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs/nfs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2744d48bbbfe..e2cc0031decb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1477,6 +1477,13 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr
 {
 	unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
+	/*
+	 * Don't revalidate the pagecache if we hold a delegation, but do
+	 * force an attribute update
+	 */
+	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+		invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_FORCED;
+
 	if (S_ISDIR(inode->i_mode))
 		invalid |= NFS_INO_INVALID_DATA;
 	nfs_set_cache_invalid(inode, invalid);
-- 
cgit v1.2.3


From 6a463beb9a433d91f3eaf85c2f0ca0aeddf3e0ab Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 20 Aug 2015 15:40:47 -0500
Subject: NFSv4.1/pnfs: Add a tracepoint for return-on-close events

Allow tracing of return-on-close.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4trace.h | 1 +
 fs/nfs/pnfs.c      | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 470af1a78bec..0b67f7825d00 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1136,6 +1136,7 @@ TRACE_EVENT(nfs4_layoutget,
 
 DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
 DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
 
 #endif /* CONFIG_NFS_V4_1 */
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8a3f30b695e2..cf90eeda9d71 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1144,6 +1144,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 	if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
 		lo->plh_barrier = barrier;
 	spin_unlock(&ino->i_lock);
+	trace_nfs4_layoutreturn_on_close(ino, 0);
 }
 
 void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
-- 
cgit v1.2.3


From 7cd148610af9312aa6454395fe174ebfe9496aa1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 20 Aug 2015 20:07:54 -0500
Subject: NFSv4: Add a tracepoint for CB_GETATTR

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_proc.c |  6 ++++-
 fs/nfs/nfs4trace.h     | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 624bef79ba7c..07258d269e5a 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -40,8 +40,11 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
 		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
 
 	inode = nfs_delegation_find_inode(cps->clp, &args->fh);
-	if (inode == NULL)
+	if (inode == NULL) {
+		trace_nfs4_cb_getattr(cps->clp, &args->fh, inode,
+				-ntohl(res->status));
 		goto out;
+	}
 	nfsi = NFS_I(inode);
 	rcu_read_lock();
 	delegation = rcu_dereference(nfsi->delegation);
@@ -60,6 +63,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
 	res->status = 0;
 out_iput:
 	rcu_read_unlock();
+	trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status));
 	iput(inode);
 out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 0b67f7825d00..1aa096b665dc 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -884,6 +884,65 @@ DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr);
 DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root);
 DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo);
 
+DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
+		TP_PROTO(
+			const struct nfs_client *clp,
+			const struct nfs_fh *fhandle,
+			const struct inode *inode,
+			int error
+		),
+
+		TP_ARGS(clp, fhandle, inode, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__string(dstaddr, clp ?
+				rpc_peeraddr2str(clp->cl_rpcclient,
+					RPC_DISPLAY_ADDR) : "unknown")
+		),
+
+		TP_fast_assign(
+			__entry->error = error;
+			__entry->fhandle = nfs_fhandle_hash(fhandle);
+			if (inode != NULL) {
+				__entry->fileid = NFS_FILEID(inode);
+				__entry->dev = inode->i_sb->s_dev;
+			} else {
+				__entry->fileid = 0;
+				__entry->dev = 0;
+			}
+			__assign_str(dstaddr, clp ?
+				rpc_peeraddr2str(clp->cl_rpcclient,
+					RPC_DISPLAY_ADDR) : "unknown")
+		),
+
+		TP_printk(
+			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"dstaddr=%s",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			__get_str(dstaddr)
+		)
+);
+
+#define DEFINE_NFS4_INODE_CALLBACK_EVENT(name) \
+	DEFINE_EVENT(nfs4_inode_callback_event, name, \
+			TP_PROTO( \
+				const struct nfs_client *clp, \
+				const struct nfs_fh *fhandle, \
+				const struct inode *inode, \
+				int error \
+			), \
+			TP_ARGS(clp, fhandle, inode, error))
+DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
+
+
 DECLARE_EVENT_CLASS(nfs4_idmap_event,
 		TP_PROTO(
 			const char *name,
-- 
cgit v1.2.3


From 249b2eef647f97164b8bb61f2d9282f227a17992 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 20 Aug 2015 20:43:14 -0500
Subject: NFSv4: Add a tracepoint for CB_LAYOUTRECALL

Only single-file layoutrecall is supported for now.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_proc.c | 3 ++-
 fs/nfs/nfs4trace.h     | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 07258d269e5a..b85cf7a30232 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -41,7 +41,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
 
 	inode = nfs_delegation_find_inode(cps->clp, &args->fh);
 	if (inode == NULL) {
-		trace_nfs4_cb_getattr(cps->clp, &args->fh, inode,
+		trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL,
 				-ntohl(res->status));
 		goto out;
 	}
@@ -198,6 +198,7 @@ unlock:
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&free_me_list);
 	pnfs_put_layout_hdr(lo);
+	trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv);
 	iput(ino);
 out:
 	return rv;
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 1aa096b665dc..28df12e525ba 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -941,6 +941,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
 			), \
 			TP_ARGS(clp, fhandle, inode, error))
 DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
+DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode);
 
 
 DECLARE_EVENT_CLASS(nfs4_idmap_event,
-- 
cgit v1.2.3


From 3976143b063e3c42fe5471d87860f6ae118e0eee Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Fri, 21 Aug 2015 12:49:44 +0800
Subject: NFS41: remove NFS_LAYOUT_ROC flag

If we return the delegation before closing, we fail to do the roc check
during close because NFS_LAYOUT_ROC has been cleared by delegreturn.
This causes layouts to still be hanging around after delegreturn
+ close, which is a violation of the protocol.

Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 7 ++-----
 fs/nfs/pnfs.h | 1 -
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index cf90eeda9d71..247c5a5d2d6b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1068,8 +1068,7 @@ bool pnfs_roc(struct inode *ino)
 
 	spin_lock(&ino->i_lock);
 	lo = nfsi->layout;
-	if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
-	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+	if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
 		goto out_noroc;
 
 	/* no roc if we hold a delegation */
@@ -1617,10 +1616,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	pnfs_get_lseg(lseg);
 	pnfs_layout_insert_lseg(lo, lseg);
 
-	if (res->return_on_close) {
+	if (res->return_on_close)
 		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
-		set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
-	}
 
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&free_me);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index a3d57a8fac76..02c27f93caf1 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,7 +94,6 @@ enum {
 	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed stop trying */
 	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
 	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
-	NFS_LAYOUT_ROC,			/* some lseg had roc bit set */
 	NFS_LAYOUT_RETURN,		/* Return this layout ASAP */
 	NFS_LAYOUT_RETURN_BEFORE_CLOSE,	/* Return this layout before close */
 	NFS_LAYOUT_INVALID_STID,	/* layout stateid id is invalid */
-- 
cgit v1.2.3


From 5420401079e152ff68a8024f6a375804b1c21505 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Sat, 22 Aug 2015 06:40:00 +0800
Subject: NFS41/flexfiles: zero out DS write wcc

We do not want to update inode attributes with DS values.

Cc: stable@vger.kernel.org # v4.0+
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 7e4cbdfb3713..cc2c5f7f2bc1 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1218,6 +1218,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
 	    hdr->res.verf->committed == NFS_DATA_SYNC)
 		ff_layout_set_layoutcommit(hdr);
 
+	/* zero out fattr since we don't care DS attr at all */
+	hdr->fattr.valid = 0;
 	if (task->tk_status >= 0)
 		nfs_writeback_update_inode(hdr);
 
-- 
cgit v1.2.3


From 28a0d72c6867af307c000e068a6da93ae98bcd0c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 24 Aug 2015 18:08:30 -0400
Subject: NFSv4.1/flexfiles: Add refcounting to struct nfs4_ff_layout_mirror

We do want to share mirrors between layout segments, so add a refcount
to enable that.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 36 +++++++++++++++++++++++++---------
 fs/nfs/flexfilelayout/flexfilelayout.h |  1 +
 2 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index cc2c5f7f2bc1..62de0b8038c8 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -135,6 +135,31 @@ decode_name(struct xdr_stream *xdr, u32 *id)
 	return 0;
 }
 
+static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+
+	mirror = kzalloc(sizeof(*mirror), gfp_flags);
+	if (mirror != NULL) {
+		spin_lock_init(&mirror->lock);
+		atomic_set(&mirror->ref, 1);
+	}
+	return mirror;
+}
+
+static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+	kfree(mirror->fh_versions);
+	nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
+	kfree(mirror);
+}
+
+static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+	if (mirror != NULL && atomic_dec_and_test(&mirror->ref))
+		ff_layout_free_mirror(mirror);
+}
+
 static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
 {
 	int i;
@@ -144,11 +169,7 @@ static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
 			/* normally mirror_ds is freed in
 			 * .free_deviceid_node but we still do it here
 			 * for .alloc_lseg error path */
-			if (fls->mirror_array[i]) {
-				kfree(fls->mirror_array[i]->fh_versions);
-				nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
-				kfree(fls->mirror_array[i]);
-			}
+			ff_layout_put_mirror(fls->mirror_array[i]);
 		}
 		kfree(fls->mirror_array);
 		fls->mirror_array = NULL;
@@ -262,15 +283,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 		if (ds_count != 1)
 			goto out_err_free;
 
-		fls->mirror_array[i] =
-			kzalloc(sizeof(struct nfs4_ff_layout_mirror),
-				gfp_flags);
+		fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
 		if (fls->mirror_array[i] == NULL) {
 			rc = -ENOMEM;
 			goto out_err_free;
 		}
 
-		spin_lock_init(&fls->mirror_array[i]->lock);
 		fls->mirror_array[i]->ds_count = ds_count;
 		fls->mirror_array[i]->lseg = &fls->generic_hdr;
 
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index f92f9a0a856b..26b8258e256f 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -77,6 +77,7 @@ struct nfs4_ff_layout_mirror {
 	u32				uid;
 	u32				gid;
 	struct rpc_cred			*cred;
+	atomic_t			ref;
 	spinlock_t			lock;
 	struct nfs4_ff_layoutstat	read_stat;
 	struct nfs4_ff_layoutstat	write_stat;
-- 
cgit v1.2.3


From 0b7baf9433d4e0a0c83eecbadc1f035d5370c6dc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 24 Aug 2015 18:22:28 -0400
Subject: NFSv4.1/flexfiles: Remove mirror backpointer to lseg.

When we start sharing mirrors between several lsegs, we won't be able to
keep it.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 25 ++++++++++++-------------
 fs/nfs/flexfilelayout/flexfilelayout.h |  1 -
 2 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 62de0b8038c8..f3efff640989 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -290,7 +290,6 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 		}
 
 		fls->mirror_array[i]->ds_count = ds_count;
-		fls->mirror_array[i]->lseg = &fls->generic_hdr;
 
 		/* deviceid */
 		rc = decode_deviceid(&stream, &devid);
@@ -513,7 +512,8 @@ nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
 }
 
 static void
-nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
+nfs4_ff_layout_stat_io_start_read(struct inode *inode,
+		struct nfs4_ff_layout_mirror *mirror,
 		__u64 requested, ktime_t now)
 {
 	bool report;
@@ -524,8 +524,7 @@ nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
 	spin_unlock(&mirror->lock);
 
 	if (report)
-		pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode,
-				GFP_KERNEL);
+		pnfs_report_layoutstat(inode, GFP_KERNEL);
 }
 
 static void
@@ -542,7 +541,8 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
 }
 
 static void
-nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
+nfs4_ff_layout_stat_io_start_write(struct inode *inode,
+		struct nfs4_ff_layout_mirror *mirror,
 		__u64 requested, ktime_t now)
 {
 	bool report;
@@ -553,8 +553,7 @@ nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
 	spin_unlock(&mirror->lock);
 
 	if (report)
-		pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode,
-				GFP_NOIO);
+		pnfs_report_layoutstat(inode, GFP_NOIO);
 }
 
 static void
@@ -1098,7 +1097,7 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
 static int ff_layout_read_prepare_common(struct rpc_task *task,
 					 struct nfs_pgio_header *hdr)
 {
-	nfs4_ff_layout_stat_io_start_read(
+	nfs4_ff_layout_stat_io_start_read(hdr->inode,
 			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
 			hdr->args.count,
 			task->tk_start);
@@ -1285,7 +1284,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 static int ff_layout_write_prepare_common(struct rpc_task *task,
 					  struct nfs_pgio_header *hdr)
 {
-	nfs4_ff_layout_stat_io_start_write(
+	nfs4_ff_layout_stat_io_start_write(hdr->inode,
 			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
 			hdr->args.count,
 			task->tk_start);
@@ -1367,7 +1366,7 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
 static void ff_layout_commit_prepare_common(struct rpc_task *task,
 		struct nfs_commit_data *cdata)
 {
-	nfs4_ff_layout_stat_io_start_write(
+	nfs4_ff_layout_stat_io_start_write(cdata->inode,
 			FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
 			0, task->tk_start);
 }
@@ -1912,8 +1911,8 @@ ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
 		devinfo->layout_type = LAYOUT_FLEX_FILES;
 		devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
 		devinfo->layout_private = mirror;
-		/* lseg refcount put in cleanup_layoutstats */
-		pnfs_get_lseg(pls);
+		/* mirror refcount put in cleanup_layoutstats */
+		atomic_inc(&mirror->ref);
 
 		++(*dev_count);
 	}
@@ -1965,7 +1964,7 @@ ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
 	for (i = 0; i < data->args.num_dev; i++) {
 		mirror = data->args.devinfo[i].layout_private;
 		data->args.devinfo[i].layout_private = NULL;
-		pnfs_put_lseg(mirror->lseg);
+		ff_layout_put_mirror(mirror);
 	}
 }
 
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 26b8258e256f..fe9d3ff7cf85 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -67,7 +67,6 @@ struct nfs4_ff_layoutstat {
 };
 
 struct nfs4_ff_layout_mirror {
-	struct pnfs_layout_segment	*lseg; /* back pointer */
 	u32				ds_count;
 	u32				efficiency;
 	struct nfs4_ff_layout_ds	*mirror_ds;
-- 
cgit v1.2.3


From 266d12d42ebd2587d3d0e8c4a4e54bfb5ed29543 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 24 Aug 2015 20:03:17 -0400
Subject: NFSv4.1/flexfile: Ensure uniqueness of mirrors across layout segments

Keep the full list of mirrors in the struct nfs4_ff_layout_mirror so that
they can be shared among the layout segments that use them.
Also ensure that we send out only one copy of the layoutstats per mirror.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 125 +++++++++++++++++++++++++--------
 fs/nfs/flexfilelayout/flexfilelayout.h |   3 +
 2 files changed, 99 insertions(+), 29 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index f3efff640989..0fbf37de2a41 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -34,6 +34,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
 	ffl = kzalloc(sizeof(*ffl), gfp_flags);
 	if (ffl) {
 		INIT_LIST_HEAD(&ffl->error_list);
+		INIT_LIST_HEAD(&ffl->mirrors);
 		return &ffl->generic_hdr;
 	} else
 		return NULL;
@@ -135,6 +136,66 @@ decode_name(struct xdr_stream *xdr, u32 *id)
 	return 0;
 }
 
+/* True if the fh counts are equal and every fh of m1 is also found in m2. */
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+		const struct nfs4_ff_layout_mirror *m2)
+{
+	int i, j;
+
+	if (m1->fh_versions_cnt != m2->fh_versions_cnt)
+		return false;
+	for (i = 0; i < m1->fh_versions_cnt; i++) {
+		/* scan m2 for fh i of m1; the inner loop must advance j */
+		for (j = 0; j < m2->fh_versions_cnt; j++) {
+			if (nfs_compare_fh(&m1->fh_versions[i],
+					&m2->fh_versions[j]) == 0)
+				break;
+		}
+		/* j ran off the end: nothing in m2 matches fh i of m1 */
+		if (j == m2->fh_versions_cnt)
+			return false;
+	}
+	return true;
+}
+
+static struct nfs4_ff_layout_mirror *
+ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
+		struct nfs4_ff_layout_mirror *mirror)
+{
+	struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
+	struct nfs4_ff_layout_mirror *pos;
+	struct inode *inode = lo->plh_inode;
+
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
+		if (mirror->mirror_ds != pos->mirror_ds)
+			continue;
+		if (!ff_mirror_match_fh(mirror, pos))
+			continue;
+		if (atomic_inc_not_zero(&pos->ref)) {
+			spin_unlock(&inode->i_lock);
+			return pos;
+		}
+	}
+	list_add(&mirror->mirrors, &ff_layout->mirrors);
+	mirror->layout = lo;
+	spin_unlock(&inode->i_lock);
+	return mirror;
+}
+
+void
+ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+	struct inode *inode;
+	if (mirror->layout == NULL)
+		return;
+	inode = mirror->layout->plh_inode;
+	spin_lock(&inode->i_lock);
+	list_del(&mirror->mirrors);
+	spin_unlock(&inode->i_lock);
+	mirror->layout = NULL;
+}
+
 static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
 {
 	struct nfs4_ff_layout_mirror *mirror;
@@ -143,12 +204,14 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
 	if (mirror != NULL) {
 		spin_lock_init(&mirror->lock);
 		atomic_set(&mirror->ref, 1);
+		INIT_LIST_HEAD(&mirror->mirrors);
 	}
 	return mirror;
 }
 
 static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
 {
+	ff_layout_remove_mirror(mirror);
 	kfree(mirror->fh_versions);
 	nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
 	kfree(mirror);
@@ -267,6 +330,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 		goto out_err_free;
 
 	for (i = 0; i < fls->mirror_array_cnt; i++) {
+		struct nfs4_ff_layout_mirror *mirror;
 		struct nfs4_deviceid devid;
 		struct nfs4_deviceid_node *idnode;
 		u32 ds_count;
@@ -355,6 +419,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 		if (rc)
 			goto out_err_free;
 
+		mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
+		if (mirror != fls->mirror_array[i]) {
+			ff_layout_free_mirror(fls->mirror_array[i]);
+			fls->mirror_array[i] = mirror;
+		}
+
 		dprintk("%s: uid %d gid %d\n", __func__,
 			fls->mirror_array[i]->uid,
 			fls->mirror_array[i]->gid);
@@ -1883,27 +1953,30 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr,
 	*start = cpu_to_be32((xdr->p - start - 1) * 4);
 }
 
-static bool
+static int
 ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
-			       struct pnfs_layout_segment *pls,
-			       int *dev_count, int dev_limit)
+			       struct pnfs_layout_hdr *lo,
+			       int dev_limit)
 {
+	struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
 	struct nfs4_ff_layout_mirror *mirror;
 	struct nfs4_deviceid_node *dev;
 	struct nfs42_layoutstat_devinfo *devinfo;
-	int i;
+	int i = 0;
 
-	for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) {
-		if (*dev_count >= dev_limit)
+	list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+		if (i >= dev_limit)
 			break;
-		mirror = FF_LAYOUT_COMP(pls, i);
-		if (!mirror || !mirror->mirror_ds)
+		if (!mirror->mirror_ds)
 			continue;
-		dev = FF_LAYOUT_DEVID_NODE(pls, i);
-		devinfo = &args->devinfo[*dev_count];
+		/* mirror refcount put in cleanup_layoutstats */
+		if (!atomic_inc_not_zero(&mirror->ref))
+			continue;
+		dev = &mirror->mirror_ds->id_node; 
+		devinfo = &args->devinfo[i];
 		memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
-		devinfo->offset = pls->pls_range.offset;
-		devinfo->length = pls->pls_range.length;
+		devinfo->offset = 0;
+		devinfo->length = NFS4_MAX_UINT64;
 		devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
 		devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
 		devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
@@ -1911,24 +1984,24 @@ ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
 		devinfo->layout_type = LAYOUT_FLEX_FILES;
 		devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
 		devinfo->layout_private = mirror;
-		/* mirror refcount put in cleanup_layoutstats */
-		atomic_inc(&mirror->ref);
 
-		++(*dev_count);
+		i++;
 	}
-
-	return *dev_count < dev_limit;
+	return i;
 }
 
 static int
 ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
 {
-	struct pnfs_layout_segment *pls;
+	struct nfs4_flexfile_layout *ff_layout;
+	struct nfs4_ff_layout_mirror *mirror;
 	int dev_count = 0;
 
 	spin_lock(&args->inode->i_lock);
-	list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
-		dev_count += FF_LAYOUT_MIRROR_COUNT(pls);
+	ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
+	list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+		if (atomic_read(&mirror->ref) != 0)
+			dev_count ++;
 	}
 	spin_unlock(&args->inode->i_lock);
 	/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
@@ -1937,20 +2010,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
 			__func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
 		dev_count = PNFS_LAYOUTSTATS_MAXDEV;
 	}
-	args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL);
+	args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
 	if (!args->devinfo)
 		return -ENOMEM;
 
-	dev_count = 0;
 	spin_lock(&args->inode->i_lock);
-	list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
-		if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count,
-						    PNFS_LAYOUTSTATS_MAXDEV)) {
-			break;
-		}
-	}
+	args->num_dev = ff_layout_mirror_prepare_stats(args,
+			&ff_layout->generic_hdr, dev_count);
 	spin_unlock(&args->inode->i_lock);
-	args->num_dev = dev_count;
 
 	return 0;
 }
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index fe9d3ff7cf85..68cc0d9828f9 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -67,6 +67,8 @@ struct nfs4_ff_layoutstat {
 };
 
 struct nfs4_ff_layout_mirror {
+	struct pnfs_layout_hdr		*layout;
+	struct list_head		mirrors;
 	u32				ds_count;
 	u32				efficiency;
 	struct nfs4_ff_layout_ds	*mirror_ds;
@@ -95,6 +97,7 @@ struct nfs4_ff_layout_segment {
 struct nfs4_flexfile_layout {
 	struct pnfs_layout_hdr generic_hdr;
 	struct pnfs_ds_commit_info commit_info;
+	struct list_head	mirrors;
 	struct list_head	error_list; /* nfs4_ff_layout_ds_err */
 };
 
-- 
cgit v1.2.3


From bbf58bf3488e41f346536aa89d62bdf2fe771128 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 24 Aug 2015 20:39:18 -0400
Subject: NFSv4.2/pnfs: Make the layoutstats timer configurable

Allow advanced users to set the layoutstats timer in order to lengthen
or shorten the period between layoutstat transmissions to the server.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 Documentation/kernel-parameters.txt    | 9 +++++++++
 fs/nfs/flexfilelayout/flexfilelayout.c | 5 ++++-
 fs/nfs/pnfs.c                          | 4 ++++
 fs/nfs/pnfs.h                          | 3 +++
 4 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'fs/nfs')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 1d6f0459cd7b..30d78b561574 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2279,6 +2279,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			The default parameter value of '0' causes the kernel
 			not to attempt recovery of lost locks.
 
+	nfs4.layoutstats_timer =
+			[NFSv4.2] Change the rate at which the kernel sends
+			layoutstats to the pNFS metadata server.
+
+			Setting this value to 0 causes the kernel to use
+			whatever value is the default set by the layout
+			driver. A non-zero value sets the minimum interval
+			in seconds between layoutstats transmissions.
+
 	nfsd.nfs4_disable_idmapping=
 			[NFSv4] When set to the default of '1', the NFSv4
 			server will return only numeric uids and gids to
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 0fbf37de2a41..9f6fb8876b3f 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -533,14 +533,17 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
 			    ktime_t now)
 {
 	static const ktime_t notime = {0};
+	s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
 
 	nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
 	if (ktime_equal(mirror->start_time, notime))
 		mirror->start_time = now;
 	if (ktime_equal(mirror->last_report_time, notime))
 		mirror->last_report_time = now;
+	if (layoutstats_timer != 0)
+		report_interval = (s64)layoutstats_timer * 1000LL;
 	if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
-			FF_LAYOUTSTATS_REPORT_INTERVAL) {
+			report_interval) {
 		mirror->last_report_time = now;
 		return true;
 	}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 247c5a5d2d6b..3530bb703214 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2285,3 +2285,7 @@ out_put:
 }
 EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
 #endif
+
+unsigned int layoutstats_timer;
+module_param(layoutstats_timer, uint, 0644);
+EXPORT_SYMBOL_GPL(layoutstats_timer);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 02c27f93caf1..d3979dd1037a 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -528,12 +528,15 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
 					nfss->pnfs_curr_ld->id == src->l_type);
 }
 
+extern unsigned int layoutstats_timer;
+
 #ifdef NFS_DEBUG
 void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
 #else
 static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
 {
 }
+
 #endif /* NFS_DEBUG */
 #else  /* CONFIG_NFS_V4_1 */
 
-- 
cgit v1.2.3


From e3b1df2dbd7bcda807a94db131fda6c2bbd1480a Mon Sep 17 00:00:00 2001
From: kbuild test robot <fengguang.wu@intel.com>
Date: Tue, 25 Aug 2015 11:19:25 +0800
Subject: NFSv4.1/flexfile: ff_layout_remove_mirror can be static

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 9f6fb8876b3f..7fefa8ad9578 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -183,7 +183,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
 	return mirror;
 }
 
-void
+static void
 ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
 {
 	struct inode *inode;
-- 
cgit v1.2.3


From 82714bd1424a88e4bb43813c8a78fbe8f6c5feab Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 25 Aug 2015 08:41:24 -0400
Subject: NFSv4.1/pnfs Improve the packing of struct pnfs_layout_hdr

Eliminate a couple of holes in the structure, and move the 2 atomics
into the same cacheline.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index d3979dd1037a..4df87ef3dccc 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -183,15 +183,15 @@ struct pnfs_layoutdriver_type {
 
 struct pnfs_layout_hdr {
 	atomic_t		plh_refcount;
+	atomic_t		plh_outstanding; /* number of RPCs out */
 	struct list_head	plh_layouts;   /* other client layouts */
 	struct list_head	plh_bulk_destroy;
 	struct list_head	plh_segs;      /* layout segments list */
-	nfs4_stateid		plh_stateid;
-	atomic_t		plh_outstanding; /* number of RPCs out */
 	unsigned long		plh_block_lgets; /* block LAYOUTGET if >0 */
-	u32			plh_barrier; /* ignore lower seqids */
 	unsigned long		plh_retry_timestamp;
 	unsigned long		plh_flags;
+	nfs4_stateid		plh_stateid;
+	u32			plh_barrier; /* ignore lower seqids */
 	enum pnfs_iomode	plh_return_iomode;
 	loff_t			plh_lwb; /* last write byte for layoutcommit */
 	struct rpc_cred		*plh_lc_cred; /* layoutcommit cred */
-- 
cgit v1.2.3


From 540d9864e1c5f8d3ca2ecc919e7d8a47e713ec3f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 25 Aug 2015 11:16:13 -0400
Subject: NFSv4.1/pnfs: Add sanity check for the layout range returned by the
 server

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 3530bb703214..68cc4b169769 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1561,6 +1561,26 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(pnfs_update_layout);
 
+static bool
+pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
+{
+	switch (range->iomode) {
+	case IOMODE_READ:
+	case IOMODE_RW:
+		break;
+	default:
+		return false;
+	}
+	if (range->offset == NFS4_MAX_UINT64)
+		return false;
+	if (range->length == 0)
+		return false;
+	if (range->length != NFS4_MAX_UINT64 &&
+	    range->length > NFS4_MAX_UINT64 - range->offset)
+		return false;
+	return true;
+}
+
 struct pnfs_layout_segment *
 pnfs_layout_process(struct nfs4_layoutget *lgp)
 {
@@ -1569,7 +1589,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	struct pnfs_layout_segment *lseg;
 	struct inode *ino = lo->plh_inode;
 	LIST_HEAD(free_me);
-	int status = 0;
+	int status = -EINVAL;
+
+	if (!pnfs_sanity_check_layout_range(&res->range))
+		goto out;
 
 	/* Inject layout blob into I/O device driver */
 	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
-- 
cgit v1.2.3


From 03772d2f00313bb22d91b2019cb8e6e91b415653 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 25 Aug 2015 08:54:17 -0400
Subject: NFSv4.1/pnfs: Allow pNFS device drivers to customise layout segment
 insertion

This is needed in order to allow merging of contiguous layout segments,
and also to correct the ordering of layouts for those device drivers that
don't necessarily want to place the read-write layouts first.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++---------
 fs/nfs/pnfs.h | 11 +++++++++++
 2 files changed, 61 insertions(+), 9 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 68cc4b169769..914c1daf08df 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1188,16 +1188,41 @@ pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
 	return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
 }
 
-static void
-pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
-		   struct pnfs_layout_segment *lseg)
+static bool
+pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
+		const struct pnfs_layout_range *l2)
+{
+	return pnfs_lseg_range_cmp(l1, l2) > 0;
+}
+
+static bool
+pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
+		struct pnfs_layout_segment *old)
+{
+	return false;
+}
+
+void
+pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+		   struct pnfs_layout_segment *lseg,
+		   bool (*is_after)(const struct pnfs_layout_range *,
+			   const struct pnfs_layout_range *),
+		   bool (*do_merge)(struct pnfs_layout_segment *,
+			   struct pnfs_layout_segment *),
+		   struct list_head *free_me)
 {
-	struct pnfs_layout_segment *lp;
+	struct pnfs_layout_segment *lp, *tmp;
 
 	dprintk("%s:Begin\n", __func__);
 
-	list_for_each_entry(lp, &lo->plh_segs, pls_list) {
-		if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
+	list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
+		if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
+			continue;
+		if (do_merge(lseg, lp)) {
+			mark_lseg_invalid(lp, free_me);
+			continue;
+		}
+		if (is_after(&lseg->pls_range, &lp->pls_range))
 			continue;
 		list_add_tail(&lseg->pls_list, &lp->pls_list);
 		dprintk("%s: inserted lseg %p "
@@ -1219,6 +1244,24 @@ out:
 
 	dprintk("%s:Return\n", __func__);
 }
+EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
+
+static void
+pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+		   struct pnfs_layout_segment *lseg,
+		   struct list_head *free_me)
+{
+	struct inode *inode = lo->plh_inode;
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (ld->add_lseg != NULL)
+		ld->add_lseg(lo, lseg, free_me);
+	else
+		pnfs_generic_layout_insert_lseg(lo, lseg,
+				pnfs_lseg_range_is_after,
+				pnfs_lseg_no_merge,
+				free_me);
+}
 
 static struct pnfs_layout_hdr *
 alloc_init_layout_hdr(struct inode *ino,
@@ -1311,8 +1354,6 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
 			ret = pnfs_get_lseg(lseg);
 			break;
 		}
-		if (lseg->pls_range.offset > range->offset)
-			break;
 	}
 
 	dprintk("%s:Return lseg %p ref %d\n",
@@ -1637,7 +1678,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
 
 	pnfs_get_lseg(lseg);
-	pnfs_layout_insert_lseg(lo, lseg);
+	pnfs_layout_insert_lseg(lo, lseg, &free_me);
 
 	if (res->return_on_close)
 		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 4df87ef3dccc..869069d8b996 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -128,6 +128,9 @@ struct pnfs_layoutdriver_type {
 
 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
+	void (*add_lseg) (struct pnfs_layout_hdr *layoutid,
+			struct pnfs_layout_segment *lseg,
+			struct list_head *free_me);
 
 	void (*return_range) (struct pnfs_layout_hdr *lo,
 			      struct pnfs_layout_range *range);
@@ -285,6 +288,14 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
 					       gfp_t gfp_flags);
 void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
 
+void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+		   struct pnfs_layout_segment *lseg,
+		   bool (*is_after)(const struct pnfs_layout_range *lseg_range,
+			   const struct pnfs_layout_range *old),
+		   bool (*do_merge)(struct pnfs_layout_segment *lseg,
+			   struct pnfs_layout_segment *old),
+		   struct list_head *free_me);
+
 void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
 int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
 int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
-- 
cgit v1.2.3


From 0762ed2ced40bfe648378ef0e1635cf4d3a6fb76 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 25 Aug 2015 17:38:25 -0400
Subject: NFSv4.1/flexfiles: Allow coalescing of new layout segments and
 existing ones

In order to ensure atomicity of updates, we merge the old layout segments
into the new ones, and then invalidate the old ones.

Also ensure that we order the list of layout segments so that
RO segments are preferred over RW.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 60 ++++++++++++++++++++++++++++++++++
 fs/nfs/pnfs.h                          | 16 +++++++++
 2 files changed, 76 insertions(+)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 7fefa8ad9578..4ec624cfcf8b 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -265,6 +265,65 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
 	}
 }
 
+static bool
+ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
+		const struct pnfs_layout_range *l2)
+{
+	u64 end1, end2;
+
+	if (l1->iomode != l2->iomode)
+		return l1->iomode != IOMODE_READ;
+	end1 = pnfs_calc_offset_end(l1->offset, l1->length);
+	end2 = pnfs_calc_offset_end(l2->offset, l2->length);
+	if (end1 < l2->offset)
+		return false;
+	if (end2 < l1->offset)
+		return true;
+	return l2->offset <= l1->offset;
+}
+
+static bool
+ff_lseg_merge(struct pnfs_layout_segment *new,
+		struct pnfs_layout_segment *old)
+{
+	u64 new_end, old_end;
+
+	if (new->pls_range.iomode != old->pls_range.iomode)
+		return false;
+	old_end = pnfs_calc_offset_end(old->pls_range.offset,
+			old->pls_range.length);
+	if (old_end < new->pls_range.offset)
+		return false;
+	new_end = pnfs_calc_offset_end(new->pls_range.offset,
+			new->pls_range.length);
+	if (new_end < old->pls_range.offset)
+		return false;
+
+	/* Mergeable: copy info from 'old' to 'new' */
+	if (new_end < old_end)
+		new_end = old_end;
+	if (new->pls_range.offset < old->pls_range.offset)
+		new->pls_range.offset = old->pls_range.offset;
+	new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
+			new_end);
+	if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
+		set_bit(NFS_LSEG_ROC, &new->pls_flags);
+	if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+		set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags);
+	return true;
+}
+
+static void
+ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
+		struct pnfs_layout_segment *lseg,
+		struct list_head *free_me)
+{
+	pnfs_generic_layout_insert_lseg(lo, lseg,
+			ff_lseg_range_is_after,
+			ff_lseg_merge,
+			free_me);
+}
+
 static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 {
 	int i, j;
@@ -2046,6 +2105,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
 	.free_layout_hdr	= ff_layout_free_layout_hdr,
 	.alloc_lseg		= ff_layout_alloc_lseg,
 	.free_lseg		= ff_layout_free_lseg,
+	.add_lseg		= ff_layout_add_lseg,
 	.pg_read_ops		= &ff_layout_pg_read_ops,
 	.pg_write_ops		= &ff_layout_pg_write_ops,
 	.get_ds_info		= ff_layout_get_ds_info,
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 869069d8b996..78c9351ff117 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -539,6 +539,22 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
 					nfss->pnfs_curr_ld->id == src->l_type);
 }
 
+static inline u64
+pnfs_calc_offset_end(u64 offset, u64 len)
+{
+	if (len == NFS4_MAX_UINT64 || len >= NFS4_MAX_UINT64 - offset)
+		return NFS4_MAX_UINT64;
+	return offset + len - 1;
+}
+
+static inline u64
+pnfs_calc_offset_length(u64 offset, u64 end)
+{
+	if (end == NFS4_MAX_UINT64 || end <= offset)
+		return NFS4_MAX_UINT64;
+	return 1 + end - offset;
+}
+
 extern unsigned int layoutstats_timer;
 
 #ifdef NFS_DEBUG
-- 
cgit v1.2.3


From 19cf6335134d82be792831e14aae9d037d0cb30b Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Wed, 26 Aug 2015 00:13:15 +0800
Subject: nfs42: decode_layoutstats does not need res parameter

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs42xdr.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index a6bd27da6286..0eb29e14070d 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -238,8 +238,7 @@ out_overflow:
 	return -EIO;
 }
 
-static int decode_layoutstats(struct xdr_stream *xdr,
-			      struct nfs42_layoutstat_res *res)
+static int decode_layoutstats(struct xdr_stream *xdr)
 {
 	return decode_op_hdr(xdr, OP_LAYOUTSTATS);
 }
@@ -343,7 +342,7 @@ static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
 		goto out;
 	WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
 	for (i = 0; i < res->num_dev; i++) {
-		status = decode_layoutstats(xdr, res);
+		status = decode_layoutstats(xdr);
 		if (status)
 			goto out;
 	}
-- 
cgit v1.2.3


From 1090c3bf81ef12eb22383d4b6627d372ce5115d9 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Wed, 26 Aug 2015 00:13:16 +0800
Subject: nfs42: remove unused declaration

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs42.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index ff66ae700b89..814c1255f1d2 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -17,7 +17,5 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
 int nfs42_proc_layoutstats_generic(struct nfs_server *,
 				   struct nfs42_layoutstat_data *);
-/* nfs4.2xdr.h */
-extern struct rpc_procinfo nfs4_2_procedures[];
 
 #endif /* __LINUX_FS_NFS_NFS4_2_H */
-- 
cgit v1.2.3


From 0bdb8fa6ecb4a356f26c6874db51b5488706e088 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 27 Aug 2015 19:17:33 -0400
Subject: NFSv4.1/pNFS: pnfs_mark_matching_lsegs_return must notify of layout
 return

It's not sufficient to just mark the layout segment for layout return. We
also need to set the NFS_LAYOUT_RETURN_BEFORE_CLOSE flag in the layout header.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 914c1daf08df..4eec540de9ea 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1717,6 +1717,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
 				lseg->pls_range.length);
 			set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
 			mark_lseg_invalid(lseg, tmp_list);
+			set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+					&lo->plh_flags);
 		}
 }
 
-- 
cgit v1.2.3


From ae57ca0f4fce219ef34c28f0edc210598c465a4d Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Wed, 26 Aug 2015 21:10:55 +0800
Subject: NFS: Check size by inode_newsize_ok in nfs_setattr

Setting an rlimit currently has no effect on files on NFS.
The rlimit belongs to the local process, so the NFS client must check it.

Similarly, CIFS calls inode_change_ok() to check the rlimit on the client
side in cifs_setattr_nounix() and cifs_setattr_unix().

v3: fix incorrect use of the error variable

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/inode.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e2cc0031decb..99a68bd9c178 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -504,7 +504,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct nfs_fattr *fattr;
-	int error = -ENOMEM;
+	int error = 0;
 
 	nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
 
@@ -513,15 +513,14 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		attr->ia_valid &= ~ATTR_MODE;
 
 	if (attr->ia_valid & ATTR_SIZE) {
-		loff_t i_size;
-
 		BUG_ON(!S_ISREG(inode->i_mode));
 
-		i_size = i_size_read(inode);
-		if (attr->ia_size == i_size)
+		error = inode_newsize_ok(inode, attr->ia_size);
+		if (error)
+			return error;
+
+		if (attr->ia_size == i_size_read(inode))
 			attr->ia_valid &= ~ATTR_SIZE;
-		else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
-			return -ETXTBSY;
 	}
 
 	/* Optimization: if the end result is no change, don't RPC */
@@ -536,8 +535,11 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		nfs_sync_inode(inode);
 
 	fattr = nfs_alloc_fattr();
-	if (fattr == NULL)
+	if (fattr == NULL) {
+		error = -ENOMEM;
 		goto out;
+	}
+
 	/*
 	 * Return any delegations if we're going to change ACLs
 	 */
-- 
cgit v1.2.3


From c5c3fb5f975a8bcc42cd039b83d9a4729ce489bb Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Wed, 26 Aug 2015 21:11:39 +0800
Subject: NFS: Make opened as optional argument in _nfs4_do_open

Check opened, and only update it when it is non-NULL.
Callers of _nfs4_do_open no longer need to define an otherwise unused
variable just to pass it as opened.

v3, same as v2.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4file.c | 3 +--
 fs/nfs/nfs4proc.c | 5 ++---
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index dcd39d4e2efe..43f1590b9240 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -27,7 +27,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 	struct inode *dir;
 	unsigned openflags = filp->f_flags;
 	struct iattr attr;
-	int opened = 0;
 	int err;
 
 	/*
@@ -66,7 +65,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 		nfs_sync_inode(inode);
 	}
 
-	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);
+	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		switch (err) {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6e988fd92f69..4687661bfbdc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2452,7 +2452,7 @@ static int _nfs4_do_open(struct inode *dir,
 			nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
 		}
 	}
-	if (opendata->file_created)
+	if (opened && opendata->file_created)
 		*opened |= FILE_CREATED;
 
 	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
@@ -3562,7 +3562,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	struct nfs4_label l, *ilabel = NULL;
 	struct nfs_open_context *ctx;
 	struct nfs4_state *state;
-	int opened = 0;
 	int status = 0;
 
 	ctx = alloc_nfs_open_context(dentry, FMODE_READ);
@@ -3572,7 +3571,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
 
 	sattr->ia_mode &= ~current_umask();
-	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened);
+	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
 		goto out;
-- 
cgit v1.2.3


From 8c61282ff61c28d5a12bb53f0eaa221d30fd3ae1 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Wed, 26 Aug 2015 21:12:58 +0800
Subject: NFS: Get suppattr_exclcreat when getting server capabilities

Creating a file with attributes in NFS4_CREATE_EXCLUSIVE4_1 mode
depends on the suppattr_exclcreat attribute.

v3, same as v2.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c         | 14 +++++++++++++-
 fs/nfs/nfs4xdr.c          | 26 +++++++++++++++++++++-----
 include/linux/nfs_fs_sb.h |  5 +++++
 include/linux/nfs_xdr.h   |  2 ++
 4 files changed, 41 insertions(+), 6 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4687661bfbdc..a6a28d45cca4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2893,8 +2893,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
+	u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion;
 	struct nfs4_server_caps_arg args = {
 		.fhandle = fhandle,
+		.bitmask = bitmask,
 	};
 	struct nfs4_server_caps_res res = {};
 	struct rpc_message msg = {
@@ -2904,10 +2906,18 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 	};
 	int status;
 
+	bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS |
+		     FATTR4_WORD0_FH_EXPIRE_TYPE |
+		     FATTR4_WORD0_LINK_SUPPORT |
+		     FATTR4_WORD0_SYMLINK_SUPPORT |
+		     FATTR4_WORD0_ACLSUPPORT;
+	if (minorversion)
+		bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 	if (status == 0) {
 		/* Sanity check the server answers */
-		switch (server->nfs_client->cl_minorversion) {
+		switch (minorversion) {
 		case 0:
 			res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
 			res.attr_bitmask[2] = 0;
@@ -2960,6 +2970,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
 		server->cache_consistency_bitmask[2] = 0;
+		memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask,
+			sizeof(server->exclcreat_bitmask));
 		server->acl_bitmask = res.acl_bitmask;
 		server->fh_expire_type = res.fh_expire_type;
 	}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c42459e45f62..ad8dde12f23b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2582,6 +2582,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
 				     struct xdr_stream *xdr,
 				     struct nfs4_server_caps_arg *args)
 {
+	const u32 *bitmask = args->bitmask;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
@@ -2589,11 +2590,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fhandle, &hdr);
-	encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
-			   FATTR4_WORD0_FH_EXPIRE_TYPE|
-			   FATTR4_WORD0_LINK_SUPPORT|
-			   FATTR4_WORD0_SYMLINK_SUPPORT|
-			   FATTR4_WORD0_ACLSUPPORT, &hdr);
+	encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr);
 	encode_nops(&hdr);
 }
 
@@ -3370,6 +3367,22 @@ out_overflow:
 	return -EIO;
 }
 
+static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
+				 uint32_t *bitmap, uint32_t *bitmask)
+{
+	if (likely(bitmap[2] & FATTR4_WORD2_SUPPATTR_EXCLCREAT)) {
+		int ret;
+		ret = decode_attr_bitmap(xdr, bitmask);
+		if (unlikely(ret < 0))
+			return ret;
+		bitmap[2] &= ~FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+	} else
+		bitmask[0] = bitmask[1] = bitmask[2] = 0;
+	dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+		bitmask[0], bitmask[1], bitmask[2]);
+	return 0;
+}
+
 static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
 {
 	__be32 *p;
@@ -4323,6 +4336,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
 		goto xdr_error;
 	if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0)
 		goto xdr_error;
+	if ((status = decode_attr_exclcreat_supported(xdr, bitmap,
+				res->exclcreat_bitmask)) != 0)
+		goto xdr_error;
 	status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
 	dprintk("%s: xdr returned %d!\n", __func__, -status);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 20bc8e51b161..570a7df2775b 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -173,6 +173,11 @@ struct nfs_server {
 						   set of attributes supported
 						   on this filesystem excluding
 						   the label support bit. */
+	u32			exclcreat_bitmask[3];
+						/* V4 bitmask representing the
+						   set of attributes supported
+						   on this filesystem for the
+						   exclusive create. */
 	u32			cache_consistency_bitmask[3];
 						/* V4 bitmask representing the subset
 						   of change attribute, size, ctime
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b9b530409ff7..0d7c832ec415 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1057,11 +1057,13 @@ struct nfs4_statfs_res {
 struct nfs4_server_caps_arg {
 	struct nfs4_sequence_args	seq_args;
 	struct nfs_fh		       *fhandle;
+	const u32 *			bitmask;
 };
 
 struct nfs4_server_caps_res {
 	struct nfs4_sequence_res	seq_res;
 	u32				attr_bitmask[3];
+	u32				exclcreat_bitmask[3];
 	u32				acl_bitmask;
 	u32				has_links;
 	u32				has_symlinks;
-- 
cgit v1.2.3


From 5334c5bdac926c5f8d89729beccb46fe88eda9e7 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Wed, 26 Aug 2015 21:13:37 +0800
Subject: NFS: Send attributes in OPEN request for NFS4_CREATE_EXCLUSIVE4_1

The client sends a SETATTR request after OPEN to update the attributes.
When a file is created with S_ISGID set, the S_ISGID bit in that SETATTR
is ignored by the NFS server, as it is a chmod without permission.

v3, same as v2.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c       | 18 ++++++++++++++----
 fs/nfs/nfs4xdr.c        | 26 ++++++++++++++++++--------
 include/linux/nfs_xdr.h |  2 +-
 3 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a6a28d45cca4..2923abf2fc0c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2307,15 +2307,25 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
  * fields corresponding to attributes that were used to store the verifier.
  * Make sure we clobber those fields in the later setattr call
  */
-static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr)
+static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
+				struct iattr *sattr, struct nfs4_label **label)
 {
-	if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
+	const u32 *attrset = opendata->o_res.attrset;
+
+	if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
 	    !(sattr->ia_valid & ATTR_ATIME_SET))
 		sattr->ia_valid |= ATTR_ATIME;
 
-	if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
+	if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
 	    !(sattr->ia_valid & ATTR_MTIME_SET))
 		sattr->ia_valid |= ATTR_MTIME;
+
+	/* Except MODE, it seems harmless of setting twice. */
+	if ((attrset[1] & FATTR4_WORD1_MODE))
+		sattr->ia_valid &= ~ATTR_MODE;
+
+	if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
+		*label = NULL;
 }
 
 static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
@@ -2440,7 +2450,7 @@ static int _nfs4_do_open(struct inode *dir,
 
 	if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
 	    (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
-		nfs4_exclusive_attrset(opendata, sattr);
+		nfs4_exclusive_attrset(opendata, sattr, &label);
 
 		nfs_fattr_init(opendata->o_res.f_attr);
 		status = nfs4_do_setattr(state->inode, cred,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index ad8dde12f23b..a7be571c1666 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1001,7 +1001,8 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
 
 static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
 				const struct nfs4_label *label,
-				const struct nfs_server *server)
+				const struct nfs_server *server,
+				bool excl_check)
 {
 	char owner_name[IDMAP_NAMESZ];
 	char owner_group[IDMAP_NAMESZ];
@@ -1067,6 +1068,17 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
 		bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
 		len += 4;
 	}
+
+	if (excl_check) {
+		const u32 *excl_bmval = server->exclcreat_bitmask;
+		bmval[0] &= excl_bmval[0];
+		bmval[1] &= excl_bmval[1];
+		bmval[2] &= excl_bmval[2];
+
+		if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL))
+			label = NULL;
+	}
+
 	if (label) {
 		len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
 		bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
@@ -1170,7 +1182,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 	}
 
 	encode_string(xdr, create->name->len, create->name->name);
-	encode_attrs(xdr, create->attrs, create->label, create->server);
+	encode_attrs(xdr, create->attrs, create->label, create->server, false);
 }
 
 static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1384,18 +1396,17 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
 {
-	struct iattr dummy;
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
 	switch(arg->createmode) {
 	case NFS4_CREATE_UNCHECKED:
 		*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
-		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
 		break;
 	case NFS4_CREATE_GUARDED:
 		*p = cpu_to_be32(NFS4_CREATE_GUARDED);
-		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
 		break;
 	case NFS4_CREATE_EXCLUSIVE:
 		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1404,8 +1415,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 	case NFS4_CREATE_EXCLUSIVE4_1:
 		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
 		encode_nfs4_verifier(xdr, &arg->u.verifier);
-		dummy.ia_valid = 0;
-		encode_attrs(xdr, &dummy, arg->label, arg->server);
+		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true);
 	}
 }
 
@@ -1661,7 +1671,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 {
 	encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
 	encode_nfs4_stateid(xdr, &arg->stateid);
-	encode_attrs(xdr, arg->iap, arg->label, server);
+	encode_attrs(xdr, arg->iap, arg->label, server, false);
 }
 
 static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0d7c832ec415..b4392d86d157 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -379,7 +379,7 @@ struct nfs_openargs {
 	struct stateowner_id	id;
 	union {
 		struct {
-			struct iattr *  attrs;    /* UNCHECKED, GUARDED */
+			struct iattr *  attrs;    /* UNCHECKED, GUARDED, EXCLUSIVE4_1 */
 			nfs4_verifier   verifier; /* EXCLUSIVE */
 		};
 		nfs4_stateid	delegation;		/* CLAIM_DELEGATE_CUR */
-- 
cgit v1.2.3


From d13549074cf066d6d5bb29903d044beffea342d3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 27 Aug 2015 20:37:39 -0400
Subject: NFSv4.1/flexfiles: Fix a protocol error in layoutreturn

According to the flexfiles protocol, the layoutreturn should specify an
array of errors in the following format:

struct ff_ioerr4 {
	offset4        ffie_offset;
	length4        ffie_length;
	stateid4       ffie_stateid;
	device_error4  ffie_errors<>;
};

This patch fixes up the code to ensure that our ffie_errors is indeed
encoded as an array (albeit with only a single entry).

Reported-by: Tom Haynes <thomas.haynes@primarydata.com>
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayoutdev.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index f13e1969eedd..b28fa4cbea52 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -500,16 +500,19 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
 					   range->offset, range->length))
 			continue;
 		/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
-		 * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
+		 * + array length + deviceid(NFS4_DEVICEID4_SIZE)
+		 * + status(4) + opnum(4)
 		 */
 		p = xdr_reserve_space(xdr,
-				24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
+				28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
 		if (unlikely(!p))
 			return -ENOBUFS;
 		p = xdr_encode_hyper(p, err->offset);
 		p = xdr_encode_hyper(p, err->length);
 		p = xdr_encode_opaque_fixed(p, &err->stateid,
 					    NFS4_STATEID_SIZE);
+		/* Encode 1 error */
+		*p++ = cpu_to_be32(1);
 		p = xdr_encode_opaque_fixed(p, &err->deviceid,
 					    NFS4_DEVICEID4_SIZE);
 		*p++ = cpu_to_be32(err->status);
-- 
cgit v1.2.3


From 6669cb8bed02ec1b60e80f2e1e317afc28544207 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 27 Aug 2015 20:43:20 -0400
Subject: NFSv4.1/pnfs: Ensure layoutreturn reserves space for the opaque
 payload

The "FIXME" is outdated. Flexfiles does add a payload.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4xdr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index a7be571c1666..ff4784c54e04 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -400,7 +400,8 @@ static int nfs4_stat_to_errno(int);
 #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
 #define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
 				encode_stateid_maxsz + \
-				1 /* FIXME: opaque lrf_body always empty at the moment */)
+				1 + \
+				XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
 #define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
 				1 + decode_stateid_maxsz)
 #define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
-- 
cgit v1.2.3


From 90816d1ddacfb5a8b783f67e2c1a1bc77dc50ff4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 30 Aug 2015 09:53:06 -0700
Subject: NFSv4.1/flexfiles: Don't mark the entire deviceid as bad for file
 errors

If the file was fenced and/or has been deleted on the DS, then we want
to retry pNFS after a layoutreturn with error report. If the server
cannot fix the problem, then we rely on it to tell us so in the
response to the LAYOUTGET.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 4ec624cfcf8b..61ccf1122494 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1075,18 +1075,26 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
 	if (task->tk_status >= 0)
 		return 0;
 
-	if (task->tk_status != -EJUKEBOX) {
+	switch (task->tk_status) {
+	/* File access problems. Don't mark the device as unavailable */
+	case -EACCES:
+	case -ESTALE:
+	case -EISDIR:
+	case -EBADHANDLE:
+	case -ELOOP:
+	case -ENOSPC:
+		break;
+	case -EJUKEBOX:
+		nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+		goto out_retry;
+	default:
 		dprintk("%s DS connection error %d\n", __func__,
 			task->tk_status);
 		nfs4_mark_deviceid_unavailable(devid);
-		if (ff_layout_has_available_ds(lseg))
-			return -NFS4ERR_RESET_TO_PNFS;
-		else
-			return -NFS4ERR_RESET_TO_MDS;
 	}
-
-	if (task->tk_status == -EJUKEBOX)
-		nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+	/* FIXME: Need to prevent infinite looping here. */
+	return -NFS4ERR_RESET_TO_PNFS;
+out_retry:
 	task->tk_status = 0;
 	rpc_restart_call(task);
 	rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
-- 
cgit v1.2.3


From 4a1e2feb9d246775dee0f78ed5b18826bae2b1c5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 30 Aug 2015 18:37:59 -0700
Subject: NFSv4.1: Fix a protocol issue with CLOSE stateids

According to RFC5661 Section 18.2.4, CLOSE is supposed to return
the zero stateid. This means that nfs_clear_open_stateid_locked()
cannot assume that the result stateid will always match the 'other'
field of the existing open stateid when trying to determine a race
with a parallel OPEN.

Instead, we look at the argument, and check for matches.

Cc: stable@vger.kernel.org # v4.0+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2923abf2fc0c..366b81c185f6 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1231,6 +1231,7 @@ static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
 }
 
 static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
+		nfs4_stateid *arg_stateid,
 		nfs4_stateid *stateid, fmode_t fmode)
 {
 	clear_bit(NFS_O_RDWR_STATE, &state->flags);
@@ -1249,8 +1250,9 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
 	if (stateid == NULL)
 		return;
 	/* Handle races with OPEN */
-	if (!nfs4_stateid_match_other(stateid, &state->open_stateid) ||
-	    !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+	if (!nfs4_stateid_match_other(arg_stateid, &state->open_stateid) ||
+	    (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
+	    !nfs4_stateid_is_newer(stateid, &state->open_stateid))) {
 		nfs_resync_open_stateid_locked(state);
 		return;
 	}
@@ -1259,10 +1261,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
 	nfs4_stateid_copy(&state->open_stateid, stateid);
 }
 
-static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+static void nfs_clear_open_stateid(struct nfs4_state *state,
+	nfs4_stateid *arg_stateid,
+	nfs4_stateid *stateid, fmode_t fmode)
 {
 	write_seqlock(&state->seqlock);
-	nfs_clear_open_stateid_locked(state, stateid, fmode);
+	nfs_clear_open_stateid_locked(state, arg_stateid, stateid, fmode);
 	write_sequnlock(&state->seqlock);
 	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
 		nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
@@ -2707,7 +2711,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 				goto out_release;
 			}
 	}
-	nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode);
+	nfs_clear_open_stateid(state, &calldata->arg.stateid,
+			res_stateid, calldata->arg.fmode);
 out_release:
 	nfs_release_seqid(calldata->arg.seqid);
 	nfs_refresh_inode(calldata->inode, calldata->res.fattr);
-- 
cgit v1.2.3


From 4ae93560b11cb2b57ea5732d442458694ab0c168 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 31 Aug 2015 01:25:11 -0700
Subject: NFSv4.1/pnfs: Don't ask for a read layout for an empty file.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4eec540de9ea..c4f918eca3d2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1494,6 +1494,9 @@ pnfs_update_layout(struct inode *ino,
 	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
 		goto out;
 
+	if (iomode == IOMODE_READ && i_size_read(ino) == 0)
+		goto out;
+
 	if (pnfs_within_mdsthreshold(ctx, ino, iomode))
 		goto out;
 
-- 
cgit v1.2.3


From 21b874c873b5019db8bb4b4f6aa929c4bac0a398 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 31 Aug 2015 01:19:22 -0700
Subject: NFSv4.1/pnfs: Handle LAYOUTGET return values correctly

According to RFC5661 section 18.43.3, if the server cannot satisfy
the loga_minlength argument to LAYOUTGET, there are 2 cases:
1) If loga_minlength == 0, it returns NFS4ERR_LAYOUTTRYLATER
2) If loga_minlength != 0, it returns NFS4ERR_BADLAYOUT

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 366b81c185f6..51c7164abd1a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7780,11 +7780,20 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 	switch (task->tk_status) {
 	case 0:
 		goto out;
+	/*
+	 * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
+	 * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
+	 */
+	case -NFS4ERR_BADLAYOUT:
+		goto out_overflow;
 	/*
 	 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
-	 * (or clients) writing to the same RAID stripe
+	 * (or clients) writing to the same RAID stripe except when
+	 * the minlength argument is 0 (see RFC5661 section 18.43.3).
 	 */
 	case -NFS4ERR_LAYOUTTRYLATER:
+		if (lgp->args.minlength == 0)
+			goto out_overflow;
 	/*
 	 * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
 	 * existing layout before getting a new one).
@@ -7840,6 +7849,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 		rpc_restart_call_prepare(task);
 out:
 	dprintk("<-- %s\n", __func__);
+	return;
+out_overflow:
+	task->tk_status = -EOVERFLOW;
+	goto out;
 }
 
 static size_t max_response_pages(struct nfs_server *server)
-- 
cgit v1.2.3


From 2d89a1d3c9ff8ceb115f001e66cff9788338ca47 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 31 Aug 2015 02:05:47 -0700
Subject: NFSv4.1/pNFS: Don't request a minimal read layout beyond the end of
 file

If we have a read layout, then sanity check the minimal layout length
so that it does not extend beyond the end of file.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c4f918eca3d2..ba1246433794 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -868,6 +868,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	struct nfs_server *server = NFS_SERVER(ino);
 	struct nfs4_layoutget *lgp;
 	struct pnfs_layout_segment *lseg;
+	loff_t i_size;
 
 	dprintk("--> %s\n", __func__);
 
@@ -875,9 +876,17 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	if (lgp == NULL)
 		return NULL;
 
+	i_size = i_size_read(ino);
+
 	lgp->args.minlength = PAGE_CACHE_SIZE;
 	if (lgp->args.minlength > range->length)
 		lgp->args.minlength = range->length;
+	if (range->iomode == IOMODE_READ) {
+		if (range->offset >= i_size)
+			lgp->args.minlength = 0;
+		else if (i_size - range->offset < lgp->args.minlength)
+			lgp->args.minlength = i_size - range->offset;
+	}
 	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
 	lgp->args.range = *range;
 	lgp->args.type = server->pnfs_curr_ld->id;
-- 
cgit v1.2.3


From 972398fa0a5f47c6ee0bde4d6d24b29f90ec888d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Sep 2015 12:03:56 -0700
Subject: NFSv4.1/flexfiles: Fix freeing of mirrors

Mirrors are now shared objects, so we should not be freeing them directly
inside ff_layout_free_lseg(). We should already be doing the right thing
in _ff_layout_free_lseg(), so just let it handle things.

Also ensure that ff_layout_free_mirror() frees the RPC credential if it
is set.

Fixes: 28a0d72c6867 ("Add refcounting to struct nfs4_ff_layout_mirror")
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 61ccf1122494..ee8e7013454f 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -213,6 +213,8 @@ static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
 {
 	ff_layout_remove_mirror(mirror);
 	kfree(mirror->fh_versions);
+	if (mirror->cred)
+		put_rpccred(mirror->cred);
 	nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
 	kfree(mirror);
 }
@@ -525,21 +527,9 @@ static void
 ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
 {
 	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
-	int i;
 
 	dprintk("--> %s\n", __func__);
 
-	for (i = 0; i < fls->mirror_array_cnt; i++) {
-		if (fls->mirror_array[i]) {
-			nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
-			fls->mirror_array[i]->mirror_ds = NULL;
-			if (fls->mirror_array[i]->cred) {
-				put_rpccred(fls->mirror_array[i]->cred);
-				fls->mirror_array[i]->cred = NULL;
-			}
-		}
-	}
-
 	if (lseg->pls_range.iomode == IOMODE_RW) {
 		struct nfs4_flexfile_layout *ffl;
 		struct inode *inode;
-- 
cgit v1.2.3


From 388ef16640cefd202daa723fba02e7c0266f8454 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Sep 2015 03:31:33 -0700
Subject: NFSv4.1/flexfiles: Fix incorrect usage of
 pnfs_generic_mark_devid_invalid()

Unlike the files layout, flexfiles does not test for the NFS_DEVICEID_INVALID
flag. Instead it relies on NFS_DEVICEID_UNAVAILABLE.
Fix is to replace with nfs4_mark_deviceid_unavailable().

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayoutdev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index b28fa4cbea52..883d35c86778 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -344,7 +344,7 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
 			__func__, mirror_idx);
 		if (mirror && mirror->mirror_ds) {
 			devid = &mirror->mirror_ds->id_node;
-			pnfs_generic_mark_devid_invalid(devid);
+			nfs4_mark_deviceid_unavailable(devid);
 		}
 		goto out;
 	}
@@ -374,7 +374,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 			__func__, ds_idx);
 		if (mirror && mirror->mirror_ds) {
 			devid = &mirror->mirror_ds->id_node;
-			pnfs_generic_mark_devid_invalid(devid);
+			nfs4_mark_deviceid_unavailable(devid);
 		}
 		goto out;
 	}
-- 
cgit v1.2.3


From 81d6dc8b3431d298abaef11d8bc64646fc691618 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Sep 2015 02:49:44 -0700
Subject: NFSv4.1/flexfiles: RW layouts are valid only if all mirrors are valid

Unlike read layouts, the writeable layout cannot fall back to using only
one of the mirrors. It needs to write to all of them.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayoutdev.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 883d35c86778..b6c21e9fa002 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -528,11 +528,11 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
 	return 0;
 }
 
-bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
 {
 	struct nfs4_ff_layout_mirror *mirror;
 	struct nfs4_deviceid_node *devid;
-	int idx;
+	u32 idx;
 
 	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
 		mirror = FF_LAYOUT_COMP(lseg, idx);
@@ -546,6 +546,32 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
 	return false;
 }
 
+static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+	struct nfs4_deviceid_node *devid;
+	u32 idx;
+
+	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
+		mirror = FF_LAYOUT_COMP(lseg, idx);
+		if (!mirror || !mirror->mirror_ds)
+			return false;
+		devid = &mirror->mirror_ds->id_node;
+		if (ff_layout_test_devid_unavailable(devid))
+			return false;
+	}
+
+	return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
+}
+
+bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+	if (lseg->pls_range.iomode == IOMODE_READ)
+		return  ff_read_layout_has_available_ds(lseg);
+	/* Note: RW layout needs all mirrors available */
+	return ff_rw_layout_has_available_ds(lseg);
+}
+
 module_param(dataserver_retrans, uint, 0644);
 MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
 			"retries a request before it attempts further "
-- 
cgit v1.2.3


From 889d94d49a3cbcbc8c8d07208549fa614d33da76 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Sep 2015 00:58:24 -0700
Subject: NFSv4.1/flexfiles: Mark layout for return if the mirrors are invalid

If a read-write layout has an invalid mirror, then we should
mark it as invalid, and return it.
If a read-only layout has an invalid mirror, then mark it as invalid
and check if there is still at least one valid mirror before we return
it.

Note: Also fix incorrect use of pnfs_generic_mark_devid_invalid().
We really want nfs4_mark_deviceid_unavailable().

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayoutdev.c | 45 ++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 15 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index b6c21e9fa002..e125e55de86d 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -172,6 +172,32 @@ out_err:
 	return NULL;
 }
 
+static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
+		struct nfs4_deviceid_node *devid)
+{
+	nfs4_mark_deviceid_unavailable(devid);
+	if (!ff_layout_has_available_ds(lseg))
+		pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+				lseg);
+}
+
+static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
+		struct nfs4_ff_layout_mirror *mirror)
+{
+	if (mirror == NULL || mirror->mirror_ds == NULL) {
+		pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+					lseg);
+		return false;
+	}
+	if (mirror->mirror_ds->ds == NULL) {
+		struct nfs4_deviceid_node *devid;
+		devid = &mirror->mirror_ds->id_node;
+		ff_layout_mark_devid_invalid(lseg, devid);
+		return false;
+	}
+	return true;
+}
+
 static u64
 end_offset(u64 start, u64 len)
 {
@@ -336,16 +362,10 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
 {
 	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
 	struct nfs_fh *fh = NULL;
-	struct nfs4_deviceid_node *devid;
 
-	if (mirror == NULL || mirror->mirror_ds == NULL ||
-	    mirror->mirror_ds->ds == NULL) {
-		printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
+	if (!ff_layout_mirror_valid(lseg, mirror)) {
+		pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
 			__func__, mirror_idx);
-		if (mirror && mirror->mirror_ds) {
-			devid = &mirror->mirror_ds->id_node;
-			nfs4_mark_deviceid_unavailable(devid);
-		}
 		goto out;
 	}
 
@@ -368,14 +388,9 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 	unsigned int max_payload;
 	rpc_authflavor_t flavor;
 
-	if (mirror == NULL || mirror->mirror_ds == NULL ||
-	    mirror->mirror_ds->ds == NULL) {
-		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
+	if (!ff_layout_mirror_valid(lseg, mirror)) {
+		pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
 			__func__, ds_idx);
-		if (mirror && mirror->mirror_ds) {
-			devid = &mirror->mirror_ds->id_node;
-			nfs4_mark_deviceid_unavailable(devid);
-		}
 		goto out;
 	}
 
-- 
cgit v1.2.3


From 4a70316caef7d158445e672e146eb9f1b8c1aeee Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Mon, 31 Aug 2015 10:53:33 +0800
Subject: nfs: Fix truncated client owner id without proto type

The length of "Linux NFSv4.0 " is 14, not 10.

Without this patch, I get a truncated client owner id as,
"Linux NFSv4.0 ::1/::1"

With this patch,
"Linux NFSv4.0 ::1/::1 tcp"

Fixes: a319268891 ("nfs: make nfs4_init_nonuniform_client_string use a dynamically allocated buffer")
Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 51c7164abd1a..3f73539579e5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5020,7 +5020,7 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
 		return 0;
 retry:
 	rcu_read_lock();
-	len = 10 + strlen(clp->cl_ipaddr) + 1 +
+	len = 14 + strlen(clp->cl_ipaddr) + 1 +
 		strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
 		1 +
 		strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
-- 
cgit v1.2.3


From 4a3e5779cf6c6d557682b499c2190ad04c80c6fd Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Mon, 31 Aug 2015 10:53:43 +0800
Subject: nfs: Remove unneeded checking of the return value from scnprintf

The return value from scnprintf is always less than the (non-zero) buffer
length, so result >= len is always false. This patch removes those checks.

int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
        int i;

	i = vsnprintf(buf, size, fmt, args);

	if (likely(i < size))
		return i;
	if (size != 0)
		return size - 1;
	return 0;
}

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3f73539579e5..693b903b48bd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5014,11 +5014,10 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
 	int result;
 	size_t len;
 	char *str;
-	bool retried = false;
 
 	if (clp->cl_owner_id != NULL)
 		return 0;
-retry:
+
 	rcu_read_lock();
 	len = 14 + strlen(clp->cl_ipaddr) + 1 +
 		strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
@@ -5046,14 +5045,6 @@ retry:
 			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
 	rcu_read_unlock();
 
-	/* Did something change? */
-	if (result >= len) {
-		kfree(str);
-		if (retried)
-			return -EINVAL;
-		retried = true;
-		goto retry;
-	}
 	clp->cl_owner_id = str;
 	return 0;
 }
@@ -5085,10 +5076,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
 			clp->rpc_ops->version, clp->cl_minorversion,
 			nfs4_client_id_uniquifier,
 			clp->cl_rpcclient->cl_nodename);
-	if (result >= len) {
-		kfree(str);
-		return -EINVAL;
-	}
 	clp->cl_owner_id = str;
 	return 0;
 }
@@ -5124,10 +5111,6 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
 	result = scnprintf(str, len, "Linux NFSv%u.%u %s",
 			clp->rpc_ops->version, clp->cl_minorversion,
 			clp->cl_rpcclient->cl_nodename);
-	if (result >= len) {
-		kfree(str);
-		return -EINVAL;
-	}
 	clp->cl_owner_id = str;
 	return 0;
 }
-- 
cgit v1.2.3


From f95c03b2d5fb6d56c0d7ec21970329b49b657fed Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 2 Sep 2015 15:15:11 -0700
Subject: NFSv4.1/flexfiles: Mark the layout for return in
 ff_layout_io_track_ds_error()

When I/O cannot complete due to a fatal error on the DS, ensure that we
invalidate the corresponding layout segment and return it.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index ee8e7013454f..3f073a7de870 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1148,6 +1148,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
 	err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
 				       mirror, offset, length, status, opnum,
 				       GFP_NOIO);
+	pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
 	dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
 }
 
@@ -1156,7 +1157,6 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
 static int ff_layout_read_done_cb(struct rpc_task *task,
 				struct nfs_pgio_header *hdr)
 {
-	struct inode *inode;
 	int err;
 
 	trace_nfs4_pnfs_read(hdr, task->tk_status);
@@ -1176,8 +1176,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 		pnfs_read_resend_pnfs(hdr);
 		return task->tk_status;
 	case -NFS4ERR_RESET_TO_MDS:
-		inode = hdr->lseg->pls_layout->plh_inode;
-		pnfs_error_mark_layout_for_return(inode, hdr->lseg);
 		ff_layout_reset_read(hdr);
 		return task->tk_status;
 	case -EAGAIN:
@@ -1330,7 +1328,6 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
 static int ff_layout_write_done_cb(struct rpc_task *task,
 				struct nfs_pgio_header *hdr)
 {
-	struct inode *inode;
 	int err;
 
 	trace_nfs4_pnfs_write(hdr, task->tk_status);
@@ -1346,8 +1343,6 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
 	switch (err) {
 	case -NFS4ERR_RESET_TO_PNFS:
 	case -NFS4ERR_RESET_TO_MDS:
-		inode = hdr->lseg->pls_layout->plh_inode;
-		pnfs_error_mark_layout_for_return(inode, hdr->lseg);
 		if (err == -NFS4ERR_RESET_TO_PNFS) {
 			pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
 			ff_layout_reset_write(hdr, true);
@@ -1376,7 +1371,6 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
 static int ff_layout_commit_done_cb(struct rpc_task *task,
 				     struct nfs_commit_data *data)
 {
-	struct inode *inode;
 	int err;
 
 	trace_nfs4_pnfs_commit_ds(data, task->tk_status);
@@ -1391,8 +1385,6 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 	switch (err) {
 	case -NFS4ERR_RESET_TO_PNFS:
 	case -NFS4ERR_RESET_TO_MDS:
-		inode = data->lseg->pls_layout->plh_inode;
-		pnfs_error_mark_layout_for_return(inode, data->lseg);
 		if (err == -NFS4ERR_RESET_TO_PNFS)
 			pnfs_set_retry_layoutget(data->lseg->pls_layout);
 		else
-- 
cgit v1.2.3


From 7cc8c5cde0a5872f5d013f82978b73c011d8f8f1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 2 Sep 2015 15:22:48 -0700
Subject: NFSv4.1/flexfiles: Clean up
 ff_layout_write_done_cb/ff_layout_commit_done_cb

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 3f073a7de870..fbc5a56de875 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1342,14 +1342,12 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
 
 	switch (err) {
 	case -NFS4ERR_RESET_TO_PNFS:
+		pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
+		ff_layout_reset_write(hdr, true);
+		return task->tk_status;
 	case -NFS4ERR_RESET_TO_MDS:
-		if (err == -NFS4ERR_RESET_TO_PNFS) {
-			pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
-			ff_layout_reset_write(hdr, true);
-		} else {
-			pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
-			ff_layout_reset_write(hdr, false);
-		}
+		pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
+		ff_layout_reset_write(hdr, false);
 		return task->tk_status;
 	case -EAGAIN:
 		rpc_restart_call_prepare(task);
@@ -1384,11 +1382,11 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 
 	switch (err) {
 	case -NFS4ERR_RESET_TO_PNFS:
+		pnfs_set_retry_layoutget(data->lseg->pls_layout);
+		pnfs_generic_prepare_to_resend_writes(data);
+		return -EAGAIN;
 	case -NFS4ERR_RESET_TO_MDS:
-		if (err == -NFS4ERR_RESET_TO_PNFS)
-			pnfs_set_retry_layoutget(data->lseg->pls_layout);
-		else
-			pnfs_clear_retry_layoutget(data->lseg->pls_layout);
+		pnfs_clear_retry_layoutget(data->lseg->pls_layout);
 		pnfs_generic_prepare_to_resend_writes(data);
 		return -EAGAIN;
 	case -EAGAIN:
-- 
cgit v1.2.3


From 5cf9d70659594e1a75b34d18619d0bb6e0cbbafa Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 4 Sep 2015 15:07:37 -0400
Subject: NFS: Optimise away the close-to-open getattr if there is no cached
 data

If there is no cached data, then there is no need to track the file
change attribute on close.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/inode.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 99a68bd9c178..6307d8de103d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -761,11 +761,13 @@ EXPORT_SYMBOL_GPL(nfs_put_lock_context);
  * @ctx: pointer to context
  * @is_sync: is this a synchronous close
  *
- * always ensure that the attributes are up to date if we're mounted
- * with close-to-open semantics
+ * Ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics and we have cached data that will
+ * need to be revalidated on open.
  */
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 {
+	struct nfs_inode *nfsi;
 	struct inode *inode;
 	struct nfs_server *server;
 
@@ -774,7 +776,12 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 	if (!is_sync)
 		return;
 	inode = d_inode(ctx->dentry);
-	if (!list_empty(&NFS_I(inode)->open_files))
+	nfsi = NFS_I(inode);
+	if (inode->i_mapping->nrpages == 0)
+		return;
+	if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+		return;
+	if (!list_empty(&nfsi->open_files))
 		return;
 	server = NFS_SERVER(inode);
 	if (server->flags & NFS_MOUNT_NOCTO)
-- 
cgit v1.2.3


From 4eae50143bcbfda819c650b7ed6739f3b6338ffc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 4 Sep 2015 15:17:53 -0400
Subject: Revert "NFS: Make close(2) asynchronous when closing NFS O_DIRECT
 files"

This reverts commit f895c53f8ace3c3e49ebf9def90e63fc6d46d2bf.

This commit causes an NFSv4 regression in that close()+unlink() can end
up failing. The reason is that we no longer have a guarantee that the
CLOSE has completed on the server, meaning that the subsequent call to
REMOVE may fail with NFS4ERR_FILE_OPEN if the server implements Windows
unlink() semantics.

Reported-by: Olga Kornievskaia <aglo@umich.edu>
Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/inode.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6307d8de103d..326d9e10d833 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -853,6 +853,11 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
 }
 EXPORT_SYMBOL_GPL(put_nfs_open_context);
 
+static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
+{
+	__put_nfs_open_context(ctx, 1);
+}
+
 /*
  * Ensure that mmap has a recent RPC credential for use when writing out
  * shared pages
@@ -908,7 +913,7 @@ void nfs_file_clear_open_context(struct file *filp)
 		spin_lock(&inode->i_lock);
 		list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
 		spin_unlock(&inode->i_lock);
-		__put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1);
+		put_nfs_open_context_sync(ctx);
 	}
 }
 
-- 
cgit v1.2.3


From 7d160a6c462c2c690e074c173b43aad7204049ad Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 5 Sep 2015 19:06:57 -0400
Subject: NFSv4: Express delegation limit in units of pages

Since we're tracking modifications to the page cache on a per-page
basis, it makes sense to express the limit to how much we may cache
in units of pages.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/delegation.c     |  4 ++--
 fs/nfs/delegation.h     |  2 +-
 fs/nfs/nfs4xdr.c        | 16 ++++++++++------
 include/linux/nfs_xdr.h |  2 +-
 4 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 029d688a969f..cd503cc2251c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -175,7 +175,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
 		if (delegation->inode != NULL) {
 			nfs4_stateid_copy(&delegation->stateid, &res->delegation);
 			delegation->type = res->delegation_type;
-			delegation->maxsize = res->maxsize;
+			delegation->pagemod_limit = res->pagemod_limit;
 			oldcred = delegation->cred;
 			delegation->cred = get_rpccred(cred);
 			clear_bit(NFS_DELEGATION_NEED_RECLAIM,
@@ -337,7 +337,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 		return -ENOMEM;
 	nfs4_stateid_copy(&delegation->stateid, &res->delegation);
 	delegation->type = res->delegation_type;
-	delegation->maxsize = res->maxsize;
+	delegation->pagemod_limit = res->pagemod_limit;
 	delegation->change_attr = inode->i_version;
 	delegation->cred = get_rpccred(cred);
 	delegation->inode = inode;
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index e3c20a3ccc93..554178a17376 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -18,7 +18,7 @@ struct nfs_delegation {
 	struct inode *inode;
 	nfs4_stateid stateid;
 	fmode_t type;
-	loff_t maxsize;
+	unsigned long pagemod_limit;
 	__u64 change_attr;
 	unsigned long flags;
 	spinlock_t lock;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index ff4784c54e04..788adf3897c7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4932,24 +4932,28 @@ static int decode_lookup(struct xdr_stream *xdr)
 }
 
 /* This is too sick! */
-static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
+static int decode_space_limit(struct xdr_stream *xdr,
+		unsigned long *pagemod_limit)
 {
 	__be32 *p;
 	uint32_t limit_type, nblocks, blocksize;
+	u64 maxsize = 0;
 
 	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(!p))
 		goto out_overflow;
 	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
-	case 1:
-		xdr_decode_hyper(p, maxsize);
+	case NFS4_LIMIT_SIZE:
+		xdr_decode_hyper(p, &maxsize);
 		break;
-	case 2:
+	case NFS4_LIMIT_BLOCKS:
 		nblocks = be32_to_cpup(p++);
 		blocksize = be32_to_cpup(p);
-		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
+		maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
+	maxsize >>= PAGE_CACHE_SHIFT;
+	*pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -4977,7 +4981,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
 		break;
 	case NFS4_OPEN_DELEGATE_WRITE:
 		res->delegation_type = FMODE_WRITE|FMODE_READ;
-		if (decode_space_limit(xdr, &res->maxsize) < 0)
+		if (decode_space_limit(xdr, &res->pagemod_limit) < 0)
 				return -EIO;
 	}
 	return decode_ace(xdr, NULL, res->server->nfs_client);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b4392d86d157..52faf7e96c65 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -406,8 +406,8 @@ struct nfs_openres {
 	const struct nfs_server *server;
 	fmode_t			delegation_type;
 	nfs4_stateid		delegation;
+	unsigned long		pagemod_limit;
 	__u32			do_recall;
-	__u64			maxsize;
 	__u32			attrset[NFS4_BITMAP_SIZE];
 	struct nfs4_string	*owner;
 	struct nfs4_string	*group_owner;
-- 
cgit v1.2.3


From 5445b1fbd123420bffed5e629a420aa2a16bf849 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 5 Sep 2015 19:06:58 -0400
Subject: NFSv4: Respect the server imposed limit on how many changes we may
 cache

The NFSv4 delegation spec allows the server to tell a client to limit how
much data it caches after the file is closed. In return, the server
guarantees enough free space to avoid ENOSPC situations, etc.
Prior to this patch, we assumed we could always cache aggressively after
close. Unfortunately, this causes problems with servers that set the
limit to 0 and therefore do not offer any ENOSPC guarantees.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/delegation.c | 25 +++++++++++++++++++++++++
 fs/nfs/delegation.h |  1 +
 fs/nfs/file.c       | 10 +---------
 fs/nfs/internal.h   |  1 -
 fs/nfs/nfs4file.c   | 29 ++++++++++++++++++++++++++++-
 5 files changed, 55 insertions(+), 11 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index cd503cc2251c..2714ef835bdd 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -900,3 +900,28 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
 	rcu_read_unlock();
 	return ret;
 }
+
+/**
+ * nfs4_delegation_flush_on_close - Check if we must flush file on close
+ * @inode: inode to check
+ *
+ * This function checks the number of outstanding writes to the file
+ * against the delegation 'space_limit' field to see if
+ * the spec requires us to flush the file on close.
+ */
+bool nfs4_delegation_flush_on_close(const struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_delegation *delegation;
+	bool ret = true;
+
+	rcu_read_lock();
+	delegation = rcu_dereference(nfsi->delegation);
+	if (delegation == NULL || !(delegation->type & FMODE_WRITE))
+		goto out;
+	if (nfsi->nrequests < delegation->pagemod_limit)
+		ret = false;
+out:
+	rcu_read_unlock();
+	return ret;
+}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 554178a17376..a44829173e57 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -61,6 +61,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
 int nfs4_check_delegation(struct inode *inode, fmode_t flags);
+bool nfs4_delegation_flush_on_close(const struct inode *inode);
 
 #endif
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 526a2681d975..c0f9b1ed12b9 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -142,7 +142,7 @@ EXPORT_SYMBOL_GPL(nfs_file_llseek);
 /*
  * Flush all dirty pages, and check for write errors.
  */
-int
+static int
 nfs_file_flush(struct file *file, fl_owner_t id)
 {
 	struct inode	*inode = file_inode(file);
@@ -153,17 +153,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 	if ((file->f_mode & FMODE_WRITE) == 0)
 		return 0;
 
-	/*
-	 * If we're holding a write delegation, then just start the i/o
-	 * but don't wait for completion (or send a commit).
-	 */
-	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
-		return filemap_fdatawrite(file->f_mapping);
-
 	/* Flush writes to the server and return any errors */
 	return vfs_fsync(file, 0);
 }
-EXPORT_SYMBOL_GPL(nfs_file_flush);
 
 ssize_t
 nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9ab3b1c21bb4..56cfde26fb9c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -360,7 +360,6 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *)
 /* file.c */
 int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
 loff_t nfs_file_llseek(struct file *, loff_t, int);
-int nfs_file_flush(struct file *, fl_owner_t);
 ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
 ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
 			     size_t, unsigned int);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 43f1590b9240..b0dbe0abed53 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -6,7 +6,9 @@
 #include <linux/fs.h>
 #include <linux/falloc.h>
 #include <linux/nfs_fs.h>
+#include "delegation.h"
 #include "internal.h"
+#include "iostat.h"
 #include "fscache.h"
 #include "pnfs.h"
 
@@ -99,6 +101,31 @@ out_drop:
 	goto out_put_ctx;
 }
 
+/*
+ * Flush all dirty pages, and check for write errors.
+ */
+static int
+nfs4_file_flush(struct file *file, fl_owner_t id)
+{
+	struct inode	*inode = file_inode(file);
+
+	dprintk("NFS: flush(%pD2)\n", file);
+
+	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
+	if ((file->f_mode & FMODE_WRITE) == 0)
+		return 0;
+
+	/*
+	 * If we're holding a write delegation, then check if we're required
+	 * to flush the i/o on close. If not, then just start the i/o now.
+	 */
+	if (!nfs4_delegation_flush_on_close(inode))
+		return filemap_fdatawrite(file->f_mapping);
+
+	/* Flush writes to the server and return any errors */
+	return vfs_fsync(file, 0);
+}
+
 static int
 nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
@@ -177,7 +204,7 @@ const struct file_operations nfs4_file_operations = {
 	.write_iter	= nfs_file_write,
 	.mmap		= nfs_file_mmap,
 	.open		= nfs4_file_open,
-	.flush		= nfs_file_flush,
+	.flush		= nfs4_file_flush,
 	.release	= nfs_file_release,
 	.fsync		= nfs4_file_fsync,
 	.lock		= nfs_lock,
-- 
cgit v1.2.3