From a41cbe86df3afbc82311a1640e20858c0cd7e065 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 14 Sep 2015 19:54:36 -0400 Subject: Failing to send a CLOSE if file is opened WRONLY and server reboots on a 4.x mount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A test case is as the description says: open(foobar, O_WRONLY); sleep() --> reboot the server close(foobar) The bug is because in nfs4state.c in nfs4_reclaim_open_state() a few line before going to restart, there is clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &state->flags). NFS4CLNT_RECLAIM_NOGRACE is a flag for the client states not open owner states. Value of NFS4CLNT_RECLAIM_NOGRACE is 4 which is the value of NFS_O_WRONLY_STATE in nfs4_state->flags. So clearing it wipes out state and when we go to close it, “call_close” doesn’t get set as state flag is not set and CLOSE doesn’t go on the wire. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index da73bc443238..5db324635e92 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1481,7 +1481,7 @@ restart: spin_unlock(&state->state_lock); } nfs4_put_open_state(state); - clear_bit(NFS4CLNT_RECLAIM_NOGRACE, + clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); spin_lock(&sp->so_lock); goto restart; -- cgit v1.2.3 From 048883e0b934d9a5103d40e209cb14b7f33d2933 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 11 Sep 2015 11:14:06 +0800 Subject: nfs: fix pg_test page count calculation We really want sizeof(struct page *) instead. Otherwise we limit maximum IO size to 64 pages rather than 512 pages on a 64bit system. Fixes 2e11f829(nfs: cap request size to fit a kmalloced page array). Cc: Christoph Hellwig Signed-off-by: Peng Tao Fixes: 2e11f8296d22 ("nfs: cap request size to fit a kmalloced page array") Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7c5718ba625e..fe3ddd20ff89 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -508,7 +508,7 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, * for it without upsetting the slab allocator. */ if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) * - sizeof(struct page) > PAGE_SIZE) + sizeof(struct page *) > PAGE_SIZE) return 0; return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes); -- cgit v1.2.3 From 306a5549355966e480e0dcacdc6b9321d153e0c0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 16 Sep 2015 17:21:27 -0400 Subject: nfs: fix v4.2 SEEK on files over 2 gigs We're incorrectly assigning a loff_t return to an int. If SEEK_HOLE or SEEK_DATA returns an offset over 2^31 then the application will see a weird lseek() result (usually -EIO). Cc: stable@vger.kernel.org Fixes: bdcc2cd14e4e "NFSv4.2: handle NFS-specific llseek errors" Signed-off-by: J. Bruce Fields Reviewed-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index d731bbf974aa..0f020e4d8421 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -175,10 +175,12 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) { struct nfs_server *server = NFS_SERVER(file_inode(filep)); struct nfs4_exception exception = { }; - int err; + loff_t err; do { err = _nfs42_proc_llseek(filep, offset, whence); + if (err >= 0) + break; if (err == -ENOTSUPP) return -EOPNOTSUPP; err = nfs4_handle_exception(server, err, &exception); -- cgit v1.2.3 From 3ec0c97959abff33a42db9081c22132bcff5b4f2 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 14 Sep 2015 20:12:21 +0800 Subject: nfs/filelayout: Fix NULL reference caused by double freeing of fh_array If filelayout_decode_layout fail, _filelayout_free_lseg will causes a double freeing of fh_array. [ 1179.279800] BUG: unable to handle kernel NULL pointer dereference at (null) [ 1179.280198] IP: [] filelayout_free_fh_array.isra.11+0x1d/0x70 [nfs_layout_nfsv41_files] [ 1179.281010] PGD 0 [ 1179.281443] Oops: 0000 [#1] [ 1179.281831] Modules linked in: nfs_layout_nfsv41_files(OE) nfsv4(OE) nfs(OE) fscache(E) xfs libcrc32c coretemp nfsd crct10dif_pclmul ppdev crc32_pclmul crc32c_intel auth_rpcgss ghash_clmulni_intel nfs_acl lockd vmw_balloon grace sunrpc parport_pc vmw_vmci parport shpchp i2c_piix4 vmwgfx drm_kms_helper ttm drm serio_raw mptspi scsi_transport_spi mptscsih e1000 mptbase ata_generic pata_acpi [last unloaded: fscache] [ 1179.283891] CPU: 0 PID: 13336 Comm: cat Tainted: G OE 4.3.0-rc1-pnfs+ #244 [ 1179.284323] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/20/2014 [ 1179.285206] task: ffff8800501d48c0 ti: ffff88003e3c4000 task.ti: ffff88003e3c4000 [ 1179.285668] RIP: 0010:[] [] filelayout_free_fh_array.isra.11+0x1d/0x70 [nfs_layout_nfsv41_files] [ 1179.286612] RSP: 0018:ffff88003e3c77f8 EFLAGS: 00010202 [ 1179.287092] RAX: 0000000000000000 RBX: ffff88001fe78900 RCX: 0000000000000000 [ 1179.287731] RDX: ffffea0000f40760 RSI: ffff88001fe789c8 RDI: ffff88001fe789c0 [ 1179.288383] RBP: ffff88003e3c7810 R08: ffffea0000f40760 R09: 0000000000000000 [ 1179.289170] R10: 0000000000000000 R11: 0000000000000001 R12: ffff88001fe789c8 [ 1179.289959] R13: ffff88001fe789c0 R14: ffff88004ec05a80 R15: ffff88004f935b88 [ 1179.290791] FS: 00007f4e66bb5700(0000) GS:ffffffff81c29000(0000) knlGS:0000000000000000 [ 1179.291580] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1179.292209] CR2: 0000000000000000 CR3: 00000000203f8000 CR4: 00000000001406f0 [ 1179.292731] Stack: [ 1179.293195] ffff88001fe78900 00000000000000d0 ffff88001fe78178 ffff88003e3c7868 [ 1179.293676] ffffffffa0272737 0000000000000001 0000000000000001 ffff88001fe78800 [ 1179.294151] 00000000614fffce ffffffff81727671 ffff88001fe78100 ffff88001fe78100 [ 1179.294623] Call Trace: [ 1179.295092] [] filelayout_alloc_lseg+0xa7/0x2d0 [nfs_layout_nfsv41_files] [ 1179.295625] [] ? out_of_line_wait_on_bit+0x81/0xb0 [ 1179.296133] [] pnfs_layout_process+0xae/0x320 [nfsv4] [ 1179.296632] [] nfs4_proc_layoutget+0x2b1/0x360 [nfsv4] [ 1179.297134] [] pnfs_update_layout+0x853/0xb30 [nfsv4] [ 1179.297632] [] ? nfs_get_lock_context+0x74/0x170 [nfs] [ 1179.298158] [] filelayout_pg_init_read+0x37/0x50 [nfs_layout_nfsv41_files] [ 1179.298834] [] __nfs_pageio_add_request+0x119/0x460 [nfs] [ 1179.299385] [] ? nfs_create_request.part.9+0x37/0x2e0 [nfs] [ 1179.299872] [] nfs_pageio_add_request+0xa3/0x1b0 [nfs] [ 1179.300362] [] readpage_async_filler+0x85/0x260 [nfs] [ 1179.300907] [] read_cache_pages+0x91/0xd0 [ 1179.301391] [] ? nfs_read_completion+0x220/0x220 [nfs] [ 1179.301867] [] nfs_readpages+0x128/0x200 [nfs] [ 1179.302330] [] __do_page_cache_readahead+0x203/0x280 [ 1179.302784] [] ? __do_page_cache_readahead+0xd8/0x280 [ 1179.303413] [] ondemand_readahead+0x1a6/0x2f0 [ 1179.303855] [] page_cache_sync_readahead+0x31/0x50 [ 1179.304286] [] generic_file_read_iter+0x4a6/0x5c0 [ 1179.304711] [] ? __nfs_revalidate_mapping+0x1f6/0x240 [nfs] [ 1179.305132] [] nfs_file_read+0x52/0xa0 [nfs] [ 1179.305540] [] __vfs_read+0xcc/0x100 [ 1179.305936] [] vfs_read+0x85/0x130 [ 1179.306326] [] SyS_read+0x58/0xd0 [ 1179.306708] [] entry_SYSCALL_64_fastpath+0x12/0x76 [ 1179.307094] Code: c4 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 53 8b 07 49 89 f4 85 c0 74 47 48 8b 06 49 89 fd <48> 8b 38 48 85 ff 74 22 31 db eb 0c 48 63 d3 48 8b 3c d0 48 85 [ 1179.308357] RIP [] filelayout_free_fh_array.isra.11+0x1d/0x70 [nfs_layout_nfsv41_files] [ 1179.309177] RSP [ 1179.309582] CR2: 0000000000000000 Signed-off-by: Kinglong Mee Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index b34f2e228601..02ec07973bc4 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -629,23 +629,18 @@ out_put: goto out; } -static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) +static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl) { int i; - for (i = 0; i < fl->num_fh; i++) { - if (!fl->fh_array[i]) - break; - kfree(fl->fh_array[i]); + if (fl->fh_array) { + for (i = 0; i < fl->num_fh; i++) { + if (!fl->fh_array[i]) + break; + kfree(fl->fh_array[i]); + } + kfree(fl->fh_array); } - kfree(fl->fh_array); - fl->fh_array = NULL; -} - -static void -_filelayout_free_lseg(struct nfs4_filelayout_segment *fl) -{ - filelayout_free_fh_array(fl); kfree(fl); } @@ -716,21 +711,21 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, /* Do we want to use a mempool here? */ fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags); if (!fl->fh_array[i]) - goto out_err_free; + goto out_err; p = xdr_inline_decode(&stream, 4); if (unlikely(!p)) - goto out_err_free; + goto out_err; fl->fh_array[i]->size = be32_to_cpup(p++); if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { printk(KERN_ERR "NFS: Too big fh %d received %d\n", i, fl->fh_array[i]->size); - goto out_err_free; + goto out_err; } p = xdr_inline_decode(&stream, fl->fh_array[i]->size); if (unlikely(!p)) - goto out_err_free; + goto out_err; memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); dprintk("DEBUG: %s: fh len %d\n", __func__, fl->fh_array[i]->size); @@ -739,8 +734,6 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, __free_page(scratch); return 0; -out_err_free: - filelayout_free_fh_array(fl); out_err: __free_page(scratch); return -EIO; -- cgit v1.2.3 From 6f29b9bba7b08c6b1d6f2cc4cf750b342fc1946c Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sun, 20 Sep 2015 23:03:28 +0800 Subject: NFS: Do cleanup before resetting pageio read/write to mds There is a reference leak of layout segment after resetting pageio read/write to mds. Signed-off-by: Kinglong Mee Cc: stable@vger.kernel.org # v4.0+ Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 3 +++ fs/nfs/write.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index ae0ff7a11b40..01b8cc8e8cfc 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -72,6 +72,9 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *mirror; + if (pgio->pg_ops && pgio->pg_ops->pg_cleanup) + pgio->pg_ops->pg_cleanup(pgio); + pgio->pg_ops = &nfs_pgio_rw_ops; /* read path should never have more than one mirror */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 388f48079c43..72624dc4a623 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1351,6 +1351,9 @@ void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *mirror; + if (pgio->pg_ops && pgio->pg_ops->pg_cleanup) + pgio->pg_ops->pg_cleanup(pgio); + pgio->pg_ops = &nfs_pgio_rw_ops; nfs_pageio_stop_mirroring(pgio); -- cgit v1.2.3 From 8714d46dc5b95a6a898ec8f1e67237c7995adfc6 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sun, 20 Sep 2015 23:04:22 +0800 Subject: NFS: Fix an infinite loop when layoutget fail with BAD_STATEID If layouget fail with BAD_STATEID, restart should not using the old stateid. But, nfs client choose the layout stateid at first, and then the open stateid. To avoid the infinite loop of using bad stateid for layoutget, this patch sets the layout flag'ss NFS_LAYOUT_INVALID_STID bit to skip choosing the bad layout stateid. Signed-off-by: Kinglong Mee Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 693b903b48bd..10ba6c1968e2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7820,6 +7820,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) * Mark the bad layout state as invalid, then retry * with the current stateid. */ + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); -- cgit v1.2.3 From 24311f884189d42d40354a6f38ca218eb9aeb811 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 20 Sep 2015 10:50:17 -0400 Subject: NFSv4: Recovery of recalled read delegations is broken When a read delegation is being recalled, and we're reclaiming the cached opens, we need to make sure that we only reclaim read-only modes. A previous attempt to do this, relied on retrieving the delegation type from the nfs4_opendata structure. Unfortunately, as Kinglong pointed out, this field can only be set when performing reboot recovery. Furthermore, if we call nfs4_open_recover(), then we end up clobbering the state->flags for all modes that we're not recovering... The fix is to have the delegation recall code pass this information to the recovery call, and then refactor the recovery code so that nfs4_open_delegation_recall() does not need to call nfs4_open_recover(). Reported-by: Kinglong Mee Fixes: 39f897fdbd46 ("NFSv4: When returning a delegation, don't...") Tested-by: Kinglong Mee Cc: NeilBrown Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 8 ++++-- fs/nfs/delegation.h | 2 +- fs/nfs/nfs4proc.c | 81 +++++++++++++++++++++++++++++++---------------------- 3 files changed, 53 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 2714ef835bdd..be806ead7f4d 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -113,7 +113,8 @@ out: return status; } -static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) +static int nfs_delegation_claim_opens(struct inode *inode, + const nfs4_stateid *stateid, fmode_t type) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; @@ -140,7 +141,7 @@ again: /* Block nfs4_proc_unlck */ mutex_lock(&sp->so_delegreturn_mutex); seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); - err = nfs4_open_delegation_recall(ctx, state, stateid); + err = nfs4_open_delegation_recall(ctx, state, stateid, type); if (!err) err = nfs_delegation_claim_locks(ctx, state, stateid); if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) @@ -411,7 +412,8 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation do { if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) break; - err = nfs_delegation_claim_opens(inode, &delegation->stateid); + err = nfs_delegation_claim_opens(inode, &delegation->stateid, + delegation->type); if (!issync || err != -EAGAIN) break; /* diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index a44829173e57..333063e032f0 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -54,7 +54,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp); /* NFSv4 delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); -int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 10ba6c1968e2..e129347b656f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1127,6 +1127,21 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) return ret; } +static bool nfs4_mode_match_open_stateid(struct nfs4_state *state, + fmode_t fmode) +{ + switch(fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ|FMODE_WRITE: + return state->n_rdwr != 0; + case FMODE_WRITE: + return state->n_wronly != 0; + case FMODE_READ: + return state->n_rdonly != 0; + } + WARN_ON_ONCE(1); + return false; +} + static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode) { int ret = 0; @@ -1571,17 +1586,13 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context return opendata; } -static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res) +static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, + fmode_t fmode) { struct nfs4_state *newstate; int ret; - if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR || - opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) && - (opendata->o_arg.u.delegation_type & fmode) != fmode) - /* This mode can't have been delegated, so we must have - * a valid open_stateid to cover it - not need to reclaim. - */ + if (!nfs4_mode_match_open_stateid(opendata->state, fmode)) return 0; opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; @@ -1597,14 +1608,14 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod newstate = nfs4_opendata_to_nfs4_state(opendata); if (IS_ERR(newstate)) return PTR_ERR(newstate); + if (newstate != opendata->state) + ret = -ESTALE; nfs4_close_state(newstate, fmode); - *res = newstate; - return 0; + return ret; } static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state) { - struct nfs4_state *newstate; int ret; /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */ @@ -1615,27 +1626,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * clear_bit(NFS_DELEGATED_STATE, &state->flags); clear_bit(NFS_OPEN_STATE, &state->flags); smp_rmb(); - if (state->n_rdwr != 0) { - ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); - if (ret != 0) - return ret; - if (newstate != state) - return -ESTALE; - } - if (state->n_wronly != 0) { - ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); - if (ret != 0) - return ret; - if (newstate != state) - return -ESTALE; - } - if (state->n_rdonly != 0) { - ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); - if (ret != 0) - return ret; - if (newstate != state) - return -ESTALE; - } + ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE); + if (ret != 0) + return ret; + ret = nfs4_open_recover_helper(opendata, FMODE_WRITE); + if (ret != 0) + return ret; + ret = nfs4_open_recover_helper(opendata, FMODE_READ); + if (ret != 0) + return ret; /* * We may have performed cached opens for all three recoveries. * Check if we need to update the current stateid. @@ -1759,18 +1758,32 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct return err; } -int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, + struct nfs4_state *state, const nfs4_stateid *stateid, + fmode_t type) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_opendata *opendata; - int err; + int err = 0; opendata = nfs4_open_recoverdata_alloc(ctx, state, NFS4_OPEN_CLAIM_DELEG_CUR_FH); if (IS_ERR(opendata)) return PTR_ERR(opendata); nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); - err = nfs4_open_recover(opendata, state); + clear_bit(NFS_DELEGATED_STATE, &state->flags); + switch (type & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ|FMODE_WRITE: + case FMODE_WRITE: + err = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE); + if (err) + break; + err = nfs4_open_recover_helper(opendata, FMODE_WRITE); + if (err) + break; + case FMODE_READ: + err = nfs4_open_recover_helper(opendata, FMODE_READ); + } nfs4_opendata_put(opendata); return nfs4_handle_delegation_recall_error(server, state, stateid, err); } -- cgit v1.2.3 From 2259f960b3a9b1020dccbf948c97311ce586db1b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 20 Sep 2015 13:30:30 -0400 Subject: NFSv4.x/pnfs: Don't try to recover stateids twice in layoutget If the current open or layout stateid doesn't match the stateid used in the layoutget RPC call, then don't try to recover it. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e129347b656f..ef6b46e08ac6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7813,20 +7813,23 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", __func__, delay); rpc_delay(task, delay); - task->tk_status = 0; - rpc_restart_call_prepare(task); - goto out; /* Do not call nfs4_async_handle_error() */ + /* Do not call nfs4_async_handle_error() */ + goto out_restart; } break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: spin_lock(&inode->i_lock); - lo = NFS_I(inode)->layout; - if (!lo || list_empty(&lo->plh_segs)) { + if (nfs4_stateid_match(&lgp->args.stateid, + &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); /* If the open stateid was bad, then recover it. */ state = lgp->args.ctx->state; - } else { + break; + } + lo = NFS_I(inode)->layout; + if (lo && nfs4_stateid_match(&lgp->args.stateid, + &lo->plh_stateid)) { LIST_HEAD(head); /* @@ -7837,16 +7840,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); - - task->tk_status = 0; - rpc_restart_call_prepare(task); - } + } else + spin_unlock(&inode->i_lock); + goto out_restart; } if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) - rpc_restart_call_prepare(task); + goto out_restart; out: dprintk("<-- %s\n", __func__); return; +out_restart: + task->tk_status = 0; + rpc_restart_call_prepare(task); + return; out_overflow: task->tk_status = -EOVERFLOW; goto out; -- cgit v1.2.3 From 834e465bba38f2768747bccb5f00e951e72d2bf5 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 22 Sep 2015 06:54:47 +0800 Subject: NFS: Skip checking ds_cinfo.buckets when lseg's commit_through_mds is set When lseg's commit_through_mds is set, pnfs client always WARN once in nfs_direct_select_verf after checking ds_cinfo.nbuckets. nfs should use the DS verf except commit_through_mds is set for layout segment where nbuckets is zero. [17844.666094] ------------[ cut here ]------------ [17844.667071] WARNING: CPU: 0 PID: 21758 at /root/source/linux-pnfs/fs/nfs/direct.c:174 nfs_direct_select_verf+0x5a/0x70 [nfs]() [17844.668650] Modules linked in: nfs_layout_nfsv41_files(OE) nfsv4(OE) nfs(OE) fscache(E) nfsd(OE) xfs libcrc32c btrfs ppdev coretemp crct10dif_pclmul auth_rpcgss crc32_pclmul crc32c_intel nfs_acl ghash_clmulni_intel lockd vmw_balloon xor vmw_vmci grace raid6_pq shpchp sunrpc parport_pc i2c_piix4 parport vmwgfx drm_kms_helper ttm drm serio_raw mptspi e1000 scsi_transport_spi mptscsih mptbase ata_generic pata_acpi [last unloaded: fscache] [17844.686676] CPU: 0 PID: 21758 Comm: kworker/0:1 Tainted: G W OE 4.3.0-rc1-pnfs+ #245 [17844.687352] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/20/2014 [17844.698502] Workqueue: nfsiod rpc_async_release [sunrpc] [17844.699212] 0000000000000009 0000000043e58010 ffff8800454fbc10 ffffffff813680c4 [17844.699990] ffff8800454fbc48 ffffffff8108b49d ffff88004eb20000 ffff88004eb20000 [17844.700844] ffff880062e26000 0000000000000000 0000000000000001 ffff8800454fbc58 [17844.701637] Call Trace: [17844.725252] [] dump_stack+0x19/0x25 [17844.732693] [] warn_slowpath_common+0x7d/0xb0 [17844.733855] [] warn_slowpath_null+0x1a/0x20 [17844.735015] [] nfs_direct_select_verf+0x5a/0x70 [nfs] [17844.735999] [] nfs_direct_set_hdr_verf+0x23/0x90 [nfs] [17844.736846] [] nfs_direct_write_completion+0x227/0x260 [nfs] [17844.737782] [] nfs_pgio_release+0x1c/0x20 [nfs] [17844.738597] [] pnfs_generic_rw_release+0x23/0x30 [nfsv4] [17844.739486] [] rpc_free_task+0x2a/0x70 [sunrpc] [17844.740326] [] rpc_async_release+0x15/0x20 [sunrpc] [17844.741173] [] process_one_work+0x21c/0x4c0 [17844.741984] [] ? process_one_work+0x16d/0x4c0 [17844.742837] [] worker_thread+0x4a/0x440 [17844.743639] [] ? process_one_work+0x4c0/0x4c0 [17844.744399] [] ? process_one_work+0x4c0/0x4c0 [17844.745176] [] kthread+0xf5/0x110 [17844.745927] [] ? kthread_create_on_node+0x240/0x240 [17844.747105] [] ret_from_fork+0x3f/0x70 [17844.747856] [] ? kthread_create_on_node+0x240/0x240 [17844.748642] ---[ end trace 336a2845d42b83f0 ]--- Signed-off-by: Kinglong Mee Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 38678d9a5cc4..4b1d08f56aba 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -166,8 +166,11 @@ nfs_direct_select_verf(struct nfs_direct_req *dreq, struct nfs_writeverf *verfp = &dreq->verf; #ifdef CONFIG_NFS_V4_1 - if (ds_clp) { - /* pNFS is in use, use the DS verf */ + /* + * pNFS is in use, use the DS verf except commit_through_mds is set + * for layout segment where nbuckets is zero. + */ + if (ds_clp && dreq->ds_cinfo.nbuckets > 0) { if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets) verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf; else -- cgit v1.2.3 From 500d701f336b2771d34e46da7875a4782515a652 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Tue, 22 Sep 2015 11:35:22 +0800 Subject: NFS41: make close wait for layoutreturn If we send a layoutreturn asynchronously before close, the close might reach server first and layoutreturn would fail with BADSTATEID because there is nothing keeping the layout stateid alive. Also do not pretend sending layoutreturn if we are not. Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 17 +++++++++++++++++ fs/nfs/pnfs.c | 35 +++++++++++++++++++++++++---------- fs/nfs/pnfs.h | 7 +++++++ 3 files changed, 49 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ef6b46e08ac6..f93b9cdb4934 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2658,6 +2658,15 @@ out: return err; } +static bool +nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task) +{ + if (inode == NULL || !nfs_have_layout(inode)) + return false; + + return pnfs_wait_on_layoutreturn(inode, task); +} + struct nfs4_closedata { struct inode *inode; struct nfs4_state *state; @@ -2776,6 +2785,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_no_action; } + if (nfs4_wait_on_layoutreturn(inode, task)) { + nfs_release_seqid(calldata->arg.seqid); + goto out_wait; + } + if (calldata->arg.fmode == 0) task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; if (calldata->roc) @@ -5321,6 +5335,9 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; + if (nfs4_wait_on_layoutreturn(d_data->inode, task)) + return; + if (d_data->roc) pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ba1246433794..8abe27165ad0 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1104,20 +1104,15 @@ bool pnfs_roc(struct inode *ino) mark_lseg_invalid(lseg, &tmp_list); found = true; } - /* pnfs_prepare_layoutreturn() grabs lo ref and it will be put - * in pnfs_roc_release(). We don't really send a layoutreturn but - * still want others to view us like we are sending one! - * - * If pnfs_prepare_layoutreturn() fails, it means someone else is doing - * LAYOUTRETURN, so we proceed like there are no layouts to return. - * - * ROC in three conditions: + /* ROC in two conditions: * 1. there are ROC lsegs * 2. we don't send layoutreturn - * 3. no others are sending layoutreturn */ - if (found && !layoutreturn && pnfs_prepare_layoutreturn(lo)) + if (found && !layoutreturn) { + /* lo ref dropped in pnfs_roc_release() */ + pnfs_get_layout_hdr(lo); roc = true; + } out_noroc: spin_unlock(&ino->i_lock); @@ -1172,6 +1167,26 @@ void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) spin_unlock(&ino->i_lock); } +bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *lo; + bool sleep = false; + + /* we might not have grabbed lo reference. so need to check under + * i_lock */ + spin_lock(&ino->i_lock); + lo = nfsi->layout; + if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + sleep = true; + spin_unlock(&ino->i_lock); + + if (sleep) + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); + + return sleep; +} + /* * Compare two layout segments for sorting into layout cache. * We want to preferentially return RW over RO layouts, so ensure those diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 78c9351ff117..d1990e90e7a0 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -270,6 +270,7 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier); +bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task); void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); @@ -639,6 +640,12 @@ pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) { } +static inline bool +pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) +{ + return false; +} + static inline void set_pnfs_layoutdriver(struct nfs_server *s, const struct nfs_fh *mntfh, u32 id) { -- cgit v1.2.3