summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/nfs/direct.c15
-rw-r--r--fs/nfs/file.c2
-rw-r--r--fs/nfs/filelayout/filelayout.c18
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c190
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h1
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c16
-rw-r--r--fs/nfs/inode.c6
-rw-r--r--fs/nfs/internal.h14
-rw-r--r--fs/nfs/nfs4proc.c29
-rw-r--r--fs/nfs/nfs4sysctl.c2
-rw-r--r--fs/nfs/pagelist.c69
-rw-r--r--fs/nfs/pnfs.c62
-rw-r--r--fs/nfs/pnfs.h21
-rw-r--r--fs/nfs/read.c43
-rw-r--r--fs/nfs/write.c47
-rw-r--r--include/linux/nfs_fs.h14
-rw-r--r--include/linux/nfs_fs_sb.h1
-rw-r--r--include/linux/nfs_xdr.h2
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c26
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c64
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c174
-rw-r--r--net/sunrpc/xprtrdma/physical_ops.c13
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c16
-rw-r--r--net/sunrpc/xprtrdma/transport.c3
-rw-r--r--net/sunrpc/xprtrdma/verbs.c16
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h14
-rw-r--r--net/sunrpc/xprtsock.c14
27 files changed, 652 insertions, 240 deletions
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index a9a93927fe3e..7ab7ec9f4eed 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -664,6 +664,10 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
req = nfs_list_entry(reqs.next);
nfs_direct_setup_mirroring(dreq, &desc, req);
+ if (desc.pg_error < 0) {
+ list_splice_init(&reqs, &failed);
+ goto out_failed;
+ }
list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
if (!nfs_pageio_add_request(&desc, req)) {
@@ -671,13 +675,17 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
nfs_list_add_request(req, &failed);
spin_lock(cinfo.lock);
dreq->flags = 0;
- dreq->error = -EIO;
+ if (desc.pg_error < 0)
+ dreq->error = desc.pg_error;
+ else
+ dreq->error = -EIO;
spin_unlock(cinfo.lock);
}
nfs_release_request(req);
}
nfs_pageio_complete(&desc);
+out_failed:
while (!list_empty(&failed)) {
req = nfs_list_entry(failed.next);
nfs_list_remove_request(req);
@@ -915,6 +923,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
}
nfs_direct_setup_mirroring(dreq, &desc, req);
+ if (desc.pg_error < 0) {
+ nfs_free_request(req);
+ result = desc.pg_error;
+ break;
+ }
nfs_lock_request(req);
req->wb_index = pos >> PAGE_SHIFT;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e6ef80ec699c..178ec8da028f 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -545,7 +545,7 @@ static int nfs_launder_page(struct page *page)
inode->i_ino, (long long)page_offset(page));
nfs_fscache_wait_on_page_write(nfsi, page);
- return nfs_wb_page(inode, page);
+ return nfs_wb_launder_page(inode, page);
}
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 02ec07973bc4..bb1f4e7a3270 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -202,6 +202,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
task->tk_status);
nfs4_mark_deviceid_unavailable(devid);
pnfs_error_mark_layout_for_return(inode, lseg);
+ pnfs_set_lo_fail(lseg);
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
@@ -883,13 +884,19 @@ static void
filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
nfs_pageio_reset_read_mds(pgio);
@@ -902,13 +909,20 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_commit_info cinfo;
int status;
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
+
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index df475d42df77..18c329b84ffb 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -505,9 +505,17 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
}
p = xdr_inline_decode(&stream, 4);
- if (p)
- fls->flags = be32_to_cpup(p);
+ if (!p)
+ goto out_sort_mirrors;
+ fls->flags = be32_to_cpup(p);
+
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_sort_mirrors;
+ for (i=0; i < fls->mirror_array_cnt; i++)
+ fls->mirror_array[i]->report_interval = be32_to_cpup(p);
+out_sort_mirrors:
ff_layout_sort_mirrors(fls);
rc = ff_layout_check_layout(lgr);
if (rc)
@@ -603,7 +611,9 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
mirror->start_time = now;
if (ktime_equal(mirror->last_report_time, notime))
mirror->last_report_time = now;
- if (layoutstats_timer != 0)
+ if (mirror->report_interval != 0)
+ report_interval = (s64)mirror->report_interval * 1000LL;
+ else if (layoutstats_timer != 0)
report_interval = (s64)layoutstats_timer * 1000LL;
if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
report_interval) {
@@ -785,13 +795,19 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
int ds_idx;
/* Use full layout for now */
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
@@ -825,13 +841,19 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
int i;
int status;
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
@@ -867,18 +889,25 @@ static unsigned int
ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ goto out;
+ }
+ }
if (pgio->pg_lseg)
return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
/* no lseg means that pnfs is not in use, so no mirroring here */
nfs_pageio_reset_write_mds(pgio);
+out:
return 1;
}
@@ -1090,7 +1119,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
return -NFS4ERR_RESET_TO_PNFS;
out_retry:
task->tk_status = 0;
- rpc_restart_call(task);
+ rpc_restart_call_prepare(task);
rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
return -EAGAIN;
}
@@ -1148,6 +1177,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
}
}
+ switch (status) {
+ case NFS4ERR_DELAY:
+ case NFS4ERR_GRACE:
+ return;
+ default:
+ break;
+ }
+
mirror = FF_LAYOUT_COMP(lseg, idx);
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, offset, length, status, opnum,
@@ -1231,14 +1268,31 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
return ff_layout_test_devid_unavailable(node);
}
-static int ff_layout_read_prepare_common(struct rpc_task *task,
- struct nfs_pgio_header *hdr)
+static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
nfs4_ff_layout_stat_io_start_read(hdr->inode,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
hdr->args.count,
task->tk_start);
+}
+static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_end_read(task,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count,
+ hdr->res.count);
+}
+
+static int ff_layout_read_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@@ -1254,6 +1308,7 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
}
hdr->pgio_done_cb = ff_layout_read_done_cb;
+ ff_layout_read_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1312,10 +1367,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
- nfs4_ff_layout_stat_io_end_read(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count, hdr->res.count);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1330,10 +1381,20 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
+ ff_layout_read_record_layoutstats_done(task, hdr);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
}
+static void ff_layout_read_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
+ pnfs_generic_rw_release(data);
+}
+
+
static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
@@ -1351,15 +1412,12 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
- pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
ff_layout_reset_write(hdr, true);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
- pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
ff_layout_reset_write(hdr, false);
return task->tk_status;
case -EAGAIN:
- rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -1391,11 +1449,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
- pnfs_set_retry_layoutget(data->lseg->pls_layout);
pnfs_generic_prepare_to_resend_writes(data);
return -EAGAIN;
case -NFS4ERR_RESET_TO_MDS:
- pnfs_clear_retry_layoutget(data->lseg->pls_layout);
pnfs_generic_prepare_to_resend_writes(data);
return -EAGAIN;
case -EAGAIN:
@@ -1410,14 +1466,31 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
return 0;
}
-static int ff_layout_write_prepare_common(struct rpc_task *task,
- struct nfs_pgio_header *hdr)
+static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
nfs4_ff_layout_stat_io_start_write(hdr->inode,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
hdr->args.count,
task->tk_start);
+}
+
+static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_end_write(task,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count, hdr->res.count,
+ hdr->res.verf->committed);
+}
+static int ff_layout_write_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@@ -1434,6 +1507,7 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
return -EAGAIN;
}
+ ff_layout_write_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1469,11 +1543,6 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
- nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count, hdr->res.count,
- hdr->res.verf->committed);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1488,18 +1557,53 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
+ ff_layout_write_record_layoutstats_done(task, hdr);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
}
-static void ff_layout_commit_prepare_common(struct rpc_task *task,
+static void ff_layout_write_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
+ pnfs_generic_rw_release(data);
+}
+
+static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
struct nfs_commit_data *cdata)
{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
+ return;
nfs4_ff_layout_stat_io_start_write(cdata->inode,
FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
0, task->tk_start);
}
+static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ struct nfs_page *req;
+ __u64 count = 0;
+
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
+ return;
+
+ if (task->tk_status == 0) {
+ list_for_each_entry(req, &cdata->pages, wb_list)
+ count += req->wb_bytes;
+ }
+ nfs4_ff_layout_stat_io_end_write(task,
+ FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ count, count, NFS_FILE_SYNC);
+}
+
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ ff_layout_commit_record_layoutstats_start(task, cdata);
+}
+
static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
{
ff_layout_commit_prepare_common(task, data);
@@ -1520,19 +1624,6 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
static void ff_layout_commit_done(struct rpc_task *task, void *data)
{
- struct nfs_commit_data *cdata = data;
- struct nfs_page *req;
- __u64 count = 0;
-
- if (task->tk_status == 0) {
- list_for_each_entry(req, &cdata->pages, wb_list)
- count += req->wb_bytes;
- }
-
- nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
- count, count, NFS_FILE_SYNC);
-
pnfs_generic_write_commit_done(task, data);
}
@@ -1540,50 +1631,59 @@ static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
{
struct nfs_commit_data *cdata = data;
+ ff_layout_commit_record_layoutstats_done(task, cdata);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
}
+static void ff_layout_commit_release(void *data)
+{
+ struct nfs_commit_data *cdata = data;
+
+ ff_layout_commit_record_layoutstats_done(&cdata->task, cdata);
+ pnfs_generic_commit_release(data);
+}
+
static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
.rpc_call_prepare = ff_layout_read_prepare_v3,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_read_release,
};
static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
.rpc_call_prepare = ff_layout_read_prepare_v4,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_read_release,
};
static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
.rpc_call_prepare = ff_layout_write_prepare_v3,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_write_release,
};
static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
.rpc_call_prepare = ff_layout_write_prepare_v4,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_write_release,
};
static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
.rpc_call_prepare = ff_layout_commit_prepare_v3,
.rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
- .rpc_release = pnfs_generic_commit_release,
+ .rpc_release = ff_layout_commit_release,
};
static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
.rpc_call_prepare = ff_layout_commit_prepare_v4,
.rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
- .rpc_release = pnfs_generic_commit_release,
+ .rpc_release = ff_layout_commit_release,
};
static enum pnfs_try_status
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 2bb08bc6aaf0..dd353bb7dc0a 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -85,6 +85,7 @@ struct nfs4_ff_layout_mirror {
struct nfs4_ff_layoutstat write_stat;
ktime_t start_time;
ktime_t last_report_time;
+ u32 report_interval;
};
struct nfs4_ff_layout_segment {
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index e125e55de86d..bd0327541366 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -429,22 +429,14 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
mirror, lseg->pls_range.offset,
lseg->pls_range.length, NFS4ERR_NXIO,
OP_ILLEGAL, GFP_NOIO);
- if (fail_return) {
- pnfs_error_mark_layout_for_return(ino, lseg);
- if (ff_layout_has_available_ds(lseg))
- pnfs_set_retry_layoutget(lseg->pls_layout);
- else
- pnfs_clear_retry_layoutget(lseg->pls_layout);
-
- } else {
+ if (!fail_return) {
if (ff_layout_has_available_ds(lseg))
set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
&lseg->pls_layout->plh_flags);
- else {
+ else
pnfs_error_mark_layout_for_return(ino, lseg);
- pnfs_clear_retry_layoutget(lseg->pls_layout);
- }
- }
+ } else
+ pnfs_error_mark_layout_for_return(ino, lseg);
}
out_update_creds:
if (ff_layout_update_mirror_cred(mirror, ds))
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c7e8b87da5b2..74fb1223c2f5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -912,6 +912,12 @@ void nfs_file_clear_open_context(struct file *filp)
if (ctx) {
struct inode *inode = d_inode(ctx->dentry);
+ /*
+ * We fatal error on write before. Try to writeback
+ * every page again.
+ */
+ if (ctx->error < 0)
+ invalidate_inode_pages2(inode->i_mapping);
filp->private_data = NULL;
spin_lock(&inode->i_lock);
list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 870e2ba7ba49..ee81792d2886 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -716,3 +716,17 @@ static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
return 0;
}
#endif
+
+static inline bool nfs_error_is_fatal(int err)
+{
+ switch (err) {
+ case -ERESTARTSYS:
+ case -EIO:
+ case -ENOSPC:
+ case -EROFS:
+ case -E2BIG:
+ return true;
+ default:
+ return false;
+ }
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 883da29b9ace..5e5062c9b92b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5383,6 +5383,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
if (data == NULL)
return -ENOMEM;
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+
+ nfs4_state_protect(server->nfs_client,
+ NFS_SP4_MACH_CRED_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
data->args.fhandle = &data->fh;
data->args.stateid = &data->stateid;
data->args.bitmask = server->cache_consistency_bitmask;
@@ -6859,10 +6864,13 @@ static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = {
},
.allow.u.words = {
[0] = 1 << (OP_CLOSE) |
+ 1 << (OP_OPEN_DOWNGRADE) |
1 << (OP_LOCKU) |
+ 1 << (OP_DELEGRETURN) |
1 << (OP_COMMIT),
[1] = 1 << (OP_SECINFO - 32) |
1 << (OP_SECINFO_NO_NAME - 32) |
+ 1 << (OP_LAYOUTRETURN - 32) |
1 << (OP_TEST_STATEID - 32) |
1 << (OP_FREE_STATEID - 32) |
1 << (OP_WRITE - 32)
@@ -6927,11 +6935,19 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
}
if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
+ test_bit(OP_OPEN_DOWNGRADE, sp->allow.u.longs) &&
+ test_bit(OP_DELEGRETURN, sp->allow.u.longs) &&
test_bit(OP_LOCKU, sp->allow.u.longs)) {
dfprintk(MOUNT, " cleanup mode enabled\n");
set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags);
}
+ if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " pnfs cleanup mode enabled\n");
+ set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+ &clp->cl_sp4_flags);
+ }
+
if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
dfprintk(MOUNT, " secinfo mode enabled\n");
@@ -7796,6 +7812,15 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
goto out;
+
+ /*
+ * NFS4ERR_LAYOUTUNAVAILABLE means we are not supposed to use pnfs
+ * on the file. set tk_status to -ENODATA to tell upper layer to
+ * retry go inband.
+ */
+ case -NFS4ERR_LAYOUTUNAVAILABLE:
+ task->tk_status = -ENODATA;
+ goto out;
/*
* NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
* length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
@@ -8086,6 +8111,10 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
};
int status = 0;
+ nfs4_state_protect(NFS_SERVER(lrp->args.inode)->nfs_client,
+ NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
dprintk("--> %s\n", __func__);
if (!sync) {
lrp->inode = nfs_igrab_and_active(lrp->args.inode);
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index 0fbd3ab1be22..8693d77c45ea 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -12,7 +12,7 @@
#include "nfs4idmap.h"
#include "callback.h"
-static const int nfs_set_port_min = 0;
+static const int nfs_set_port_min;
static const int nfs_set_port_max = 65535;
static struct ctl_table_header *nfs4_callback_sysctl_table;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index c3a78450a239..eeddbf0bf4c4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -664,22 +664,11 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
* @desc: IO descriptor
* @hdr: pageio header
*/
-static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
+static void nfs_pgio_error(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_mirror *mirror;
- u32 midx;
-
set_bit(NFS_IOHDR_REDO, &hdr->flags);
nfs_pgio_data_destroy(hdr);
hdr->completion_ops->completion(hdr);
- /* TODO: Make sure it's right to clean up all mirrors here
- * and not just hdr->pgio_mirror_idx */
- for (midx = 0; midx < desc->pg_mirror_count; midx++) {
- mirror = &desc->pg_mirrors[midx];
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- }
- return -ENOMEM;
}
/**
@@ -800,8 +789,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
unsigned int pagecount, pageused;
pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
- if (!nfs_pgarray_set(&hdr->page_array, pagecount))
- return nfs_pgio_error(desc, hdr);
+ if (!nfs_pgarray_set(&hdr->page_array, pagecount)) {
+ nfs_pgio_error(hdr);
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
+ }
nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
pages = hdr->page_array.pagevec;
@@ -819,8 +811,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
*pages++ = last_page = req->wb_page;
}
}
- if (WARN_ON_ONCE(pageused != pagecount))
- return nfs_pgio_error(desc, hdr);
+ if (WARN_ON_ONCE(pageused != pagecount)) {
+ nfs_pgio_error(hdr);
+ desc->pg_error = -EINVAL;
+ return desc->pg_error;
+ }
if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
(desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
@@ -843,10 +838,8 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- /* TODO: make sure this is right with mirroring - or
- * should it back out all mirrors? */
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
ret = nfs_generic_pgio(desc, hdr);
@@ -874,6 +867,9 @@ static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
+ if (pgio->pg_error < 0)
+ return pgio->pg_error;
+
if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
return -EINVAL;
@@ -976,6 +972,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
} else {
if (desc->pg_ops->pg_init)
desc->pg_ops->pg_init(desc, req);
+ if (desc->pg_error < 0)
+ return 0;
mirror->pg_base = req->wb_pgbase;
}
if (!nfs_can_coalesce_requests(prev, req, desc))
@@ -1141,6 +1139,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
bytes = req->wb_bytes;
nfs_pageio_setup_mirroring(desc, req);
+ if (desc->pg_error < 0)
+ goto out_failed;
for (midx = 0; midx < desc->pg_mirror_count; midx++) {
if (midx) {
@@ -1157,7 +1157,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (IS_ERR(dupreq)) {
nfs_page_group_unlock(req);
- return 0;
+ desc->pg_error = PTR_ERR(dupreq);
+ goto out_failed;
}
nfs_lock_request(dupreq);
@@ -1170,10 +1171,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (nfs_pgio_has_mirroring(desc))
desc->pg_mirror_idx = midx;
if (!nfs_pageio_add_request_mirror(desc, dupreq))
- return 0;
+ goto out_failed;
}
return 1;
+
+out_failed:
+ /*
+ * We might have failed before sending any reqs over wire.
+ * Clean up rest of the reqs in mirror pg_list.
+ */
+ if (desc->pg_error) {
+ struct nfs_pgio_mirror *mirror;
+ void (*func)(struct list_head *);
+
+ /* remember fatal errors */
+ if (nfs_error_is_fatal(desc->pg_error))
+ mapping_set_error(desc->pg_inode->i_mapping,
+ desc->pg_error);
+
+ func = desc->pg_completion_ops->error_cleanup;
+ for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+ mirror = &desc->pg_mirrors[midx];
+ func(&mirror->pg_list);
+ }
+ }
+ return 0;
}
/*
@@ -1226,7 +1249,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
nfs_pageio_complete(desc);
if (!list_empty(&failed)) {
list_move(&failed, &hdr->pages);
- return -EIO;
+ return desc->pg_error < 0 ? desc->pg_error : -EIO;
}
return 0;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 04db6d951b99..a3592cc34a20 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -618,7 +618,6 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
pnfs_get_layout_hdr(lo);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
- pnfs_clear_retry_layoutget(lo);
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
pnfs_put_layout_hdr(lo);
@@ -906,17 +905,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
lseg = nfs4_proc_layoutget(lgp, gfp_flags);
} while (lseg == ERR_PTR(-EAGAIN));
- if (IS_ERR(lseg)) {
- switch (PTR_ERR(lseg)) {
- case -ENOMEM:
- case -ERESTARTSYS:
- break;
- default:
- /* remember that LAYOUTGET failed and suspend trying */
- pnfs_layout_io_set_failed(lo, range->iomode);
- }
- return NULL;
- } else
+ if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg)))
+ lseg = NULL;
+ else
pnfs_layout_clear_fail_bit(lo,
pnfs_iomode_to_fail_bit(range->iomode));
@@ -1104,7 +1095,6 @@ bool pnfs_roc(struct inode *ino)
&lo->plh_flags))
layoutreturn = pnfs_prepare_layoutreturn(lo);
- pnfs_clear_retry_layoutget(lo);
list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
/* If we are sending layoutreturn, invalidate all valid lsegs */
if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
@@ -1468,25 +1458,15 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
return ret;
}
-/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
-static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key, int mode)
-{
- if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
- return 1;
- return nfs_wait_bit_killable(key, mode);
-}
-
static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
{
- if (!pnfs_should_retry_layoutget(lo))
- return false;
/*
* send layoutcommit as it can hold up layoutreturn due to lseg
* reference
*/
pnfs_layoutcommit_inode(lo->plh_inode, false);
return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
- pnfs_layoutget_retry_bit_wait,
+ nfs_wait_bit_killable,
TASK_UNINTERRUPTIBLE);
}
@@ -1561,8 +1541,7 @@ lookup_again:
}
/* if LAYOUTGET already failed once we don't try again */
- if (pnfs_layout_io_test_failed(lo, iomode) &&
- !pnfs_should_retry_layoutget(lo)) {
+ if (pnfs_layout_io_test_failed(lo, iomode)) {
trace_pnfs_update_layout(ino, pos, count, iomode, lo,
PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
goto out_unlock;
@@ -1639,7 +1618,6 @@ lookup_again:
arg.length = PAGE_CACHE_ALIGN(arg.length);
lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
- pnfs_clear_retry_layoutget(lo);
atomic_dec(&lo->plh_outstanding);
trace_pnfs_update_layout(ino, pos, count, iomode, lo,
PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
@@ -1652,7 +1630,7 @@ out:
"(%s, offset: %llu, length: %llu)\n",
__func__, ino->i_sb->s_id,
(unsigned long long)NFS_FILEID(ino),
- lseg == NULL ? "not found" : "found",
+ IS_ERR_OR_NULL(lseg) ? "not found" : "found",
iomode==IOMODE_RW ? "read/write" : "read-only",
(unsigned long long)pos,
(unsigned long long)count);
@@ -1804,7 +1782,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
struct pnfs_layout_segment *lseg)
{
struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
- int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
struct pnfs_layout_range range = {
.iomode = lseg->pls_range.iomode,
.offset = 0,
@@ -1814,8 +1791,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
bool return_now = false;
spin_lock(&inode->i_lock);
- /* set failure bit so that pnfs path will be retried later */
- pnfs_layout_set_fail_bit(lo, iomode);
pnfs_set_plh_return_iomode(lo, range.iomode);
/*
* mark all matching lsegs so that we are sure to have no live
@@ -1856,6 +1831,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
rd_size,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
}
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
@@ -1868,13 +1848,19 @@ void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size)
{
- if (pgio->pg_lseg == NULL)
+ if (pgio->pg_lseg == NULL) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
req_offset(req),
wb_size,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
nfs_pageio_reset_write_mds(pgio);
@@ -2042,15 +2028,13 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_pgio_header *hdr;
int ret;
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
@@ -2173,15 +2157,13 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_pgio_header *hdr;
int ret;
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 78df618a1596..9f4e2a47f4aa 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -98,7 +98,6 @@ enum {
NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
- NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */
};
enum layoutdriver_policy_flags {
@@ -382,26 +381,6 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d)
return d;
}
-static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
- atomic_inc(&lo->plh_refcount);
-}
-
-static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
- atomic_dec(&lo->plh_refcount);
- /* wake up waiters for LAYOUTRETURN as that is not needed */
- wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
- }
-}
-
-static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
-}
-
static inline struct pnfs_layout_segment *
pnfs_get_lseg(struct pnfs_layout_segment *lseg)
{
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 0a5e33f33b5c..eb31e23e7def 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -85,6 +85,23 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
}
EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
+static void nfs_readpage_release(struct nfs_page *req)
+{
+ struct inode *inode = d_inode(req->wb_context->dentry);
+
+ dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
+ (long long)req_offset(req));
+
+ if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
+ if (PageUptodate(req->wb_page))
+ nfs_readpage_to_fscache(inode, req->wb_page, 0);
+
+ unlock_page(req->wb_page);
+ }
+ nfs_release_request(req);
+}
+
int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
struct page *page)
{
@@ -106,7 +123,10 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
nfs_pageio_init_read(&pgio, inode, false,
&nfs_async_read_completion_ops);
- nfs_pageio_add_request(&pgio, new);
+ if (!nfs_pageio_add_request(&pgio, new)) {
+ nfs_list_remove_request(new);
+ nfs_readpage_release(new);
+ }
nfs_pageio_complete(&pgio);
/* It doesn't make sense to do mirrored reads! */
@@ -115,24 +135,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
pgm = &pgio.pg_mirrors[0];
NFS_I(inode)->read_io += pgm->pg_bytes_written;
- return 0;
-}
-
-static void nfs_readpage_release(struct nfs_page *req)
-{
- struct inode *inode = d_inode(req->wb_context->dentry);
-
- dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
- (long long)req_offset(req));
-
- if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
- if (PageUptodate(req->wb_page))
- nfs_readpage_to_fscache(inode, req->wb_page, 0);
-
- unlock_page(req->wb_page);
- }
- nfs_release_request(req);
+ return pgio.pg_error < 0 ? pgio.pg_error : 0;
}
static void nfs_page_group_set_uptodate(struct nfs_page *req)
@@ -361,6 +364,8 @@ readpage_async_filler(void *data, struct page *page)
if (len < PAGE_CACHE_SIZE)
zero_user_segment(page, len, PAGE_CACHE_SIZE);
if (!nfs_pageio_add_request(desc->pgio, new)) {
+ nfs_list_remove_request(new);
+ nfs_readpage_release(new);
error = desc->pgio->pg_error;
goto out_unlock;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2c26e04d9396..94828b3f8c95 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -547,12 +547,22 @@ try_again:
return head;
}
+static void nfs_write_error_remove_page(struct nfs_page *req)
+{
+ nfs_unlock_request(req);
+ nfs_end_page_writeback(req);
+ nfs_release_request(req);
+ generic_error_remove_page(page_file_mapping(req->wb_page),
+ req->wb_page);
+}
+
/*
* Find an associated nfs write request, and prepare to flush it out
* May return an error if the user signalled nfs_wait_on_request().
*/
static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
- struct page *page, bool nonblock)
+ struct page *page, bool nonblock,
+ bool launder)
{
struct nfs_page *req;
int ret = 0;
@@ -569,8 +579,21 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
ret = 0;
if (!nfs_pageio_add_request(pgio, req)) {
- nfs_redirty_request(req);
ret = pgio->pg_error;
+ /*
+ * Remove the problematic req upon fatal errors
+ * in launder case, while other dirty pages can
+ * still be around until they get flushed.
+ */
+ if (nfs_error_is_fatal(ret)) {
+ nfs_context_set_write_error(req->wb_context, ret);
+ if (launder) {
+ nfs_write_error_remove_page(req);
+ goto out;
+ }
+ }
+ nfs_redirty_request(req);
+ ret = -EAGAIN;
} else
nfs_add_stats(page_file_mapping(page)->host,
NFSIOS_WRITEPAGES, 1);
@@ -578,12 +601,14 @@ out:
return ret;
}
-static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
+ struct nfs_pageio_descriptor *pgio, bool launder)
{
int ret;
nfs_pageio_cond_complete(pgio, page_file_index(page));
- ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
+ ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE,
+ launder);
if (ret == -EAGAIN) {
redirty_page_for_writepage(wbc, page);
ret = 0;
@@ -594,7 +619,9 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
/*
* Write an mmapped page to the server.
*/
-static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+static int nfs_writepage_locked(struct page *page,
+ struct writeback_control *wbc,
+ bool launder)
{
struct nfs_pageio_descriptor pgio;
struct inode *inode = page_file_mapping(page)->host;
@@ -603,7 +630,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
false, &nfs_async_write_completion_ops);
- err = nfs_do_writepage(page, wbc, &pgio);
+ err = nfs_do_writepage(page, wbc, &pgio, launder);
nfs_pageio_complete(&pgio);
if (err < 0)
return err;
@@ -616,7 +643,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
{
int ret;
- ret = nfs_writepage_locked(page, wbc);
+ ret = nfs_writepage_locked(page, wbc, false);
unlock_page(page);
return ret;
}
@@ -625,7 +652,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
{
int ret;
- ret = nfs_do_writepage(page, wbc, data);
+ ret = nfs_do_writepage(page, wbc, data, false);
unlock_page(page);
return ret;
}
@@ -1923,7 +1950,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
/*
* Write back all requests on one page - we do this before reading it.
*/
-int nfs_wb_page(struct inode *inode, struct page *page)
+int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder)
{
loff_t range_start = page_file_offset(page);
loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
@@ -1940,7 +1967,7 @@ int nfs_wb_page(struct inode *inode, struct page *page)
for (;;) {
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
- ret = nfs_writepage_locked(page, &wbc);
+ ret = nfs_writepage_locked(page, &wbc, launder);
if (ret < 0)
goto out_error;
continue;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index ebf0bd72a42b..9eee972863a7 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -516,13 +516,25 @@ extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned
*/
extern int nfs_sync_inode(struct inode *inode);
extern int nfs_wb_all(struct inode *inode);
-extern int nfs_wb_page(struct inode *inode, struct page* page);
+extern int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder);
extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
extern int nfs_commit_inode(struct inode *, int);
extern struct nfs_commit_data *nfs_commitdata_alloc(void);
extern void nfs_commit_free(struct nfs_commit_data *data);
static inline int
+nfs_wb_launder_page(struct inode *inode, struct page *page)
+{
+ return nfs_wb_single_page(inode, page, true);
+}
+
+static inline int
+nfs_wb_page(struct inode *inode, struct page *page)
+{
+ return nfs_wb_single_page(inode, page, false);
+}
+
+static inline int
nfs_have_writebacks(struct inode *inode)
{
return NFS_I(inode)->nrequests != 0;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 2469ab0bb3a1..7fcc13c8cf1f 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -102,6 +102,7 @@ struct nfs_client {
#define NFS_SP4_MACH_CRED_STATEID 4 /* TEST_STATEID and FREE_STATEID */
#define NFS_SP4_MACH_CRED_WRITE 5 /* WRITE */
#define NFS_SP4_MACH_CRED_COMMIT 6 /* COMMIT */
+#define NFS_SP4_MACH_CRED_PNFS_CLEANUP 7 /* LAYOUTRETURN */
#endif /* CONFIG_NFS_V4 */
/* Our own IP address, as a null-terminated string.
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index bee3e60a7006..791098a08a87 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1375,6 +1375,7 @@ enum {
NFS_IOHDR_ERROR = 0,
NFS_IOHDR_EOF,
NFS_IOHDR_REDO,
+ NFS_IOHDR_STAT,
};
struct nfs_pgio_header {
@@ -1455,6 +1456,7 @@ struct nfs_commit_data {
const struct rpc_call_ops *mds_ops;
const struct nfs_commit_completion_ops *completion_ops;
int (*commit_done_cb) (struct rpc_task *task, struct nfs_commit_data *data);
+ unsigned long flags;
};
struct nfs_pgio_completion_ops {
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 2dcb44f69e53..cc1251d07297 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -15,7 +15,7 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
-#define RPCRDMA_BACKCHANNEL_DEBUG
+#undef RPCRDMA_BACKCHANNEL_DEBUG
static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
@@ -42,8 +42,8 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
size_t size;
req = rpcrdma_create_req(r_xprt);
- if (!req)
- return -ENOMEM;
+ if (IS_ERR(req))
+ return PTR_ERR(req);
req->rl_backchannel = true;
size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
@@ -84,9 +84,7 @@ out_fail:
static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt,
unsigned int count)
{
- struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
struct rpcrdma_rep *rep;
- unsigned long flags;
int rc = 0;
while (count--) {
@@ -98,9 +96,7 @@ static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt,
break;
}
- spin_lock_irqsave(&buffers->rb_lock, flags);
- list_add(&rep->rr_list, &buffers->rb_recv_bufs);
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ rpcrdma_recv_buffer_put(rep);
}
return rc;
@@ -140,6 +136,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
__func__);
goto out_free;
}
+ dprintk("RPC: %s: new rqst %p\n", __func__, rqst);
rqst->rq_xprt = &r_xprt->rx_xprt;
INIT_LIST_HEAD(&rqst->rq_list);
@@ -220,12 +217,14 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
rpclen = rqst->rq_svec[0].iov_len;
+#ifdef RPCRDMA_BACKCHANNEL_DEBUG
pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n",
__func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf));
pr_info("RPC: %s: RPC/RDMA: %*ph\n",
__func__, (int)RPCRDMA_HDRLEN_MIN, headerp);
pr_info("RPC: %s: RPC: %*ph\n",
__func__, (int)rpclen, rqst->rq_svec[0].iov_base);
+#endif
req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN;
@@ -269,6 +268,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
{
struct rpc_xprt *xprt = rqst->rq_xprt;
+ dprintk("RPC: %s: freeing rqst %p (req %p)\n",
+ __func__, rqst, rpcr_to_rdmar(rqst));
+
smp_mb__before_atomic();
WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state));
clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
@@ -333,9 +335,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst, rq_bc_pa_list);
list_del(&rqst->rq_bc_pa_list);
spin_unlock(&xprt->bc_pa_lock);
-#ifdef RPCRDMA_BACKCHANNEL_DEBUG
- pr_info("RPC: %s: using rqst %p\n", __func__, rqst);
-#endif
+ dprintk("RPC: %s: using rqst %p\n", __func__, rqst);
/* Prepare rqst */
rqst->rq_reply_bytes_recvd = 0;
@@ -355,10 +355,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
* direction reply.
*/
req = rpcr_to_rdmar(rqst);
-#ifdef RPCRDMA_BACKCHANNEL_DEBUG
- pr_info("RPC: %s: attaching rep %p to req %p\n",
+ dprintk("RPC: %s: attaching rep %p to req %p\n",
__func__, rep, req);
-#endif
req->rl_reply = rep;
/* Defeat the retransmit detection logic in send_request */
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index f1e8dafbd507..c14f3a4bff68 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -179,6 +179,69 @@ out_maperr:
return rc;
}
+static void
+__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+ struct ib_device *device = r_xprt->rx_ia.ri_device;
+ struct rpcrdma_mw *mw = seg->rl_mw;
+ int nsegs = seg->mr_nsegs;
+
+ seg->rl_mw = NULL;
+
+ while (nsegs--)
+ rpcrdma_unmap_one(device, seg++);
+
+ rpcrdma_put_mw(r_xprt, mw);
+}
+
+/* Invalidate all memory regions that were registered for "req".
+ *
+ * Sleeps until it is safe for the host CPU to access the
+ * previously mapped memory regions.
+ */
+static void
+fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+ struct rpcrdma_mr_seg *seg;
+ unsigned int i, nchunks;
+ struct rpcrdma_mw *mw;
+ LIST_HEAD(unmap_list);
+ int rc;
+
+ dprintk("RPC: %s: req %p\n", __func__, req);
+
+ /* ORDER: Invalidate all of the req's MRs first
+ *
+ * ib_unmap_fmr() is slow, so use a single call instead
+ * of one call per mapped MR.
+ */
+ for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
+
+ list_add(&mw->r.fmr.fmr->list, &unmap_list);
+
+ i += seg->mr_nsegs;
+ }
+ rc = ib_unmap_fmr(&unmap_list);
+ if (rc)
+ pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc);
+
+ /* ORDER: Now DMA unmap all of the req's MRs, and return
+ * them to the free MW list.
+ */
+ for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+ seg = &req->rl_segments[i];
+
+ __fmr_dma_unmap(r_xprt, seg);
+
+ i += seg->mr_nsegs;
+ seg->mr_nsegs = 0;
+ }
+
+ req->rl_nchunks = 0;
+}
+
/* Use the ib_unmap_fmr() verb to prevent further remote
* access via RDMA READ or RDMA WRITE.
*/
@@ -231,6 +294,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
+ .ro_unmap_sync = fmr_op_unmap_sync,
.ro_unmap = fmr_op_unmap,
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 88cf9e7269c2..c6836844bd0e 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -245,12 +245,14 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
}
-/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */
+/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs
+ * to be reset.
+ *
+ * WARNING: Only wr_id and status are reliable at this point
+ */
static void
-frwr_sendcompletion(struct ib_wc *wc)
+__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_mw *r)
{
- struct rpcrdma_mw *r;
-
if (likely(wc->status == IB_WC_SUCCESS))
return;
@@ -261,9 +263,23 @@ frwr_sendcompletion(struct ib_wc *wc)
else
pr_warn("RPC: %s: frmr %p error, status %s (%d)\n",
__func__, r, ib_wc_status_msg(wc->status), wc->status);
+
r->r.frmr.fr_state = FRMR_IS_STALE;
}
+static void
+frwr_sendcompletion(struct ib_wc *wc)
+{
+ struct rpcrdma_mw *r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+ struct rpcrdma_frmr *f = &r->r.frmr;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS))
+ __frwr_sendcompletion_flush(wc, r);
+
+ if (f->fr_waiter)
+ complete(&f->fr_linv_done);
+}
+
static int
frwr_op_init(struct rpcrdma_xprt *r_xprt)
{
@@ -319,7 +335,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
struct rpcrdma_mw *mw;
struct rpcrdma_frmr *frmr;
struct ib_mr *mr;
- struct ib_reg_wr reg_wr;
+ struct ib_reg_wr *reg_wr;
struct ib_send_wr *bad_wr;
int rc, i, n, dma_nents;
u8 key;
@@ -335,7 +351,9 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
} while (mw->r.frmr.fr_state != FRMR_IS_INVALID);
frmr = &mw->r.frmr;
frmr->fr_state = FRMR_IS_VALID;
+ frmr->fr_waiter = false;
mr = frmr->fr_mr;
+ reg_wr = &frmr->fr_regwr;
if (nsegs > ia->ri_max_frmr_depth)
nsegs = ia->ri_max_frmr_depth;
@@ -381,19 +399,19 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
- reg_wr.wr.next = NULL;
- reg_wr.wr.opcode = IB_WR_REG_MR;
- reg_wr.wr.wr_id = (uintptr_t)mw;
- reg_wr.wr.num_sge = 0;
- reg_wr.wr.send_flags = 0;
- reg_wr.mr = mr;
- reg_wr.key = mr->rkey;
- reg_wr.access = writing ?
- IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
- IB_ACCESS_REMOTE_READ;
+ reg_wr->wr.next = NULL;
+ reg_wr->wr.opcode = IB_WR_REG_MR;
+ reg_wr->wr.wr_id = (uintptr_t)mw;
+ reg_wr->wr.num_sge = 0;
+ reg_wr->wr.send_flags = 0;
+ reg_wr->mr = mr;
+ reg_wr->key = mr->rkey;
+ reg_wr->access = writing ?
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+ IB_ACCESS_REMOTE_READ;
DECR_CQCOUNT(&r_xprt->rx_ep);
- rc = ib_post_send(ia->ri_id->qp, &reg_wr.wr, &bad_wr);
+ rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr);
if (rc)
goto out_senderr;
@@ -413,6 +431,116 @@ out_senderr:
return rc;
}
+static struct ib_send_wr *
+__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
+{
+ struct rpcrdma_mw *mw = seg->rl_mw;
+ struct rpcrdma_frmr *f = &mw->r.frmr;
+ struct ib_send_wr *invalidate_wr;
+
+ f->fr_waiter = false;
+ f->fr_state = FRMR_IS_INVALID;
+ invalidate_wr = &f->fr_invwr;
+
+ memset(invalidate_wr, 0, sizeof(*invalidate_wr));
+ invalidate_wr->wr_id = (unsigned long)(void *)mw;
+ invalidate_wr->opcode = IB_WR_LOCAL_INV;
+ invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey;
+
+ return invalidate_wr;
+}
+
+static void
+__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+ int rc)
+{
+ struct ib_device *device = r_xprt->rx_ia.ri_device;
+ struct rpcrdma_mw *mw = seg->rl_mw;
+ struct rpcrdma_frmr *f = &mw->r.frmr;
+
+ seg->rl_mw = NULL;
+
+ ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir);
+
+ if (!rc)
+ rpcrdma_put_mw(r_xprt, mw);
+ else
+ __frwr_queue_recovery(mw);
+}
+
+/* Invalidate all memory regions that were registered for "req".
+ *
+ * Sleeps until it is safe for the host CPU to access the
+ * previously mapped memory regions.
+ */
+static void
+frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+ struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_mr_seg *seg;
+ unsigned int i, nchunks;
+ struct rpcrdma_frmr *f;
+ int rc;
+
+ dprintk("RPC: %s: req %p\n", __func__, req);
+
+ /* ORDER: Invalidate all of the req's MRs first
+ *
+ * Chain the LOCAL_INV Work Requests and post them with
+ * a single ib_post_send() call.
+ */
+ invalidate_wrs = pos = prev = NULL;
+ seg = NULL;
+ for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+ seg = &req->rl_segments[i];
+
+ pos = __frwr_prepare_linv_wr(seg);
+
+ if (!invalidate_wrs)
+ invalidate_wrs = pos;
+ else
+ prev->next = pos;
+ prev = pos;
+
+ i += seg->mr_nsegs;
+ }
+ f = &seg->rl_mw->r.frmr;
+
+ /* Strong send queue ordering guarantees that when the
+ * last WR in the chain completes, all WRs in the chain
+ * are complete.
+ */
+ f->fr_invwr.send_flags = IB_SEND_SIGNALED;
+ f->fr_waiter = true;
+ init_completion(&f->fr_linv_done);
+ INIT_CQCOUNT(&r_xprt->rx_ep);
+
+ /* Transport disconnect drains the receive CQ before it
+ * replaces the QP. The RPC reply handler won't call us
+ * unless ri_id->qp is a valid pointer.
+ */
+ rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
+ if (rc)
+ pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
+
+ wait_for_completion(&f->fr_linv_done);
+
+ /* ORDER: Now DMA unmap all of the req's MRs, and return
+ * them to the free MW list.
+ */
+ for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+ seg = &req->rl_segments[i];
+
+ __frwr_dma_unmap(r_xprt, seg, rc);
+
+ i += seg->mr_nsegs;
+ seg->mr_nsegs = 0;
+ }
+
+ req->rl_nchunks = 0;
+}
+
/* Post a LOCAL_INV Work Request to prevent further remote access
* via RDMA READ or RDMA WRITE.
*/
@@ -423,23 +551,24 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_mw *mw = seg1->rl_mw;
struct rpcrdma_frmr *frmr = &mw->r.frmr;
- struct ib_send_wr invalidate_wr, *bad_wr;
+ struct ib_send_wr *invalidate_wr, *bad_wr;
int rc, nsegs = seg->mr_nsegs;
dprintk("RPC: %s: FRMR %p\n", __func__, mw);
seg1->rl_mw = NULL;
frmr->fr_state = FRMR_IS_INVALID;
+ invalidate_wr = &mw->r.frmr.fr_invwr;
- memset(&invalidate_wr, 0, sizeof(invalidate_wr));
- invalidate_wr.wr_id = (unsigned long)(void *)mw;
- invalidate_wr.opcode = IB_WR_LOCAL_INV;
- invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey;
+ memset(invalidate_wr, 0, sizeof(*invalidate_wr));
+ invalidate_wr->wr_id = (uintptr_t)mw;
+ invalidate_wr->opcode = IB_WR_LOCAL_INV;
+ invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
DECR_CQCOUNT(&r_xprt->rx_ep);
ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
read_lock(&ia->ri_qplock);
- rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+ rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr);
read_unlock(&ia->ri_qplock);
if (rc)
goto out_err;
@@ -471,6 +600,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_map = frwr_op_map,
+ .ro_unmap_sync = frwr_op_unmap_sync,
.ro_unmap = frwr_op_unmap,
.ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages,
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 617b76f22154..dbb302ecf590 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -83,6 +83,18 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
return 1;
}
+/* DMA unmap all memory regions that were mapped for "req".
+ */
+static void
+physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+ struct ib_device *device = r_xprt->rx_ia.ri_device;
+ unsigned int i;
+
+ for (i = 0; req->rl_nchunks; --req->rl_nchunks)
+ rpcrdma_unmap_one(device, &req->rl_segments[i++]);
+}
+
static void
physical_op_destroy(struct rpcrdma_buffer *buf)
{
@@ -90,6 +102,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
.ro_map = physical_op_map,
+ .ro_unmap_sync = physical_op_unmap_sync,
.ro_unmap = physical_op_unmap,
.ro_open = physical_op_open,
.ro_maxpages = physical_op_maxpages,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c10d9699441c..0f28f2d743ed 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -804,6 +804,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
if (req->rl_reply)
goto out_duplicate;
+ /* Sanity checking has passed. We are now committed
+ * to complete this transaction.
+ */
+ list_del_init(&rqst->rq_list);
+ spin_unlock_bh(&xprt->transport_lock);
dprintk("RPC: %s: reply 0x%p completes request 0x%p\n"
" RPC request 0x%p xid 0x%08x\n",
__func__, rep, req, rqst,
@@ -888,12 +893,23 @@ badheader:
break;
}
+ /* Invalidate and flush the data payloads before waking the
+ * waiting application. This guarantees the memory region is
+ * properly fenced from the server before the application
+ * accesses the data. It also ensures proper send flow
+ * control: waking the next RPC waits until this RPC has
+ * relinquished all its Send Queue entries.
+ */
+ if (req->rl_nchunks)
+ r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);
+
credits = be32_to_cpu(headerp->rm_credit);
if (credits == 0)
credits = 1; /* don't deadlock */
else if (credits > r_xprt->rx_buf.rb_max_requests)
credits = r_xprt->rx_buf.rb_max_requests;
+ spin_lock_bh(&xprt->transport_lock);
cwnd = xprt->cwnd;
xprt->cwnd = credits << RPC_CWNDSHIFT;
if (xprt->cwnd > cwnd)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 8c545f7d7525..740bddcf3488 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -576,6 +576,9 @@ xprt_rdma_free(void *buffer)
rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
req = rb->rg_owner;
+ if (req->rl_backchannel)
+ return;
+
r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index eadd1655145a..732c71ce5dca 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -616,10 +616,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
/* set trigger for requesting send completion */
ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
- if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
- ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
- else if (ep->rep_cqinit <= 2)
- ep->rep_cqinit = 0;
+ if (ep->rep_cqinit <= 2)
+ ep->rep_cqinit = 0; /* always signal? */
INIT_CQCOUNT(ep);
init_waitqueue_head(&ep->rep_connect_wait);
INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
@@ -852,10 +850,11 @@ retry:
if (extras) {
rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
- if (rc)
+ if (rc) {
pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
__func__, rc);
rc = 0;
+ }
}
}
@@ -1337,15 +1336,14 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_ep *ep = &r_xprt->rx_ep;
struct rpcrdma_rep *rep;
- unsigned long flags;
int rc;
while (count--) {
- spin_lock_irqsave(&buffers->rb_lock, flags);
+ spin_lock(&buffers->rb_lock);
if (list_empty(&buffers->rb_recv_bufs))
goto out_reqbuf;
rep = rpcrdma_buffer_get_rep_locked(buffers);
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ spin_unlock(&buffers->rb_lock);
rc = rpcrdma_ep_post_recv(ia, ep, rep);
if (rc)
@@ -1355,7 +1353,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
return 0;
out_reqbuf:
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ spin_unlock(&buffers->rb_lock);
pr_warn("%s: no extra receive buffers\n", __func__);
return -ENOMEM;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index ac7f8d4f632a..728101ddc44b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -88,12 +88,6 @@ struct rpcrdma_ep {
struct delayed_work rep_connect_worker;
};
-/*
- * Force a signaled SEND Work Request every so often,
- * in case the provider needs to do some housekeeping.
- */
-#define RPCRDMA_MAX_UNSIGNALED_SENDS (32)
-
#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
@@ -207,6 +201,12 @@ struct rpcrdma_frmr {
enum rpcrdma_frmr_state fr_state;
struct work_struct fr_work;
struct rpcrdma_xprt *fr_xprt;
+ bool fr_waiter;
+ struct completion fr_linv_done;;
+ union {
+ struct ib_reg_wr fr_regwr;
+ struct ib_send_wr fr_invwr;
+ };
};
struct rpcrdma_fmr {
@@ -364,6 +364,8 @@ struct rpcrdma_xprt;
struct rpcrdma_memreg_ops {
int (*ro_map)(struct rpcrdma_xprt *,
struct rpcrdma_mr_seg *, int, bool);
+ void (*ro_unmap_sync)(struct rpcrdma_xprt *,
+ struct rpcrdma_req *);
int (*ro_unmap)(struct rpcrdma_xprt *,
struct rpcrdma_mr_seg *);
int (*ro_open)(struct rpcrdma_ia *,
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 2ffaf6a79499..70c13d675dc1 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1907,18 +1907,6 @@ static inline void xs_reclassify_socket(int family, struct socket *sock)
}
}
#else
-static inline void xs_reclassify_socketu(struct socket *sock)
-{
-}
-
-static inline void xs_reclassify_socket4(struct socket *sock)
-{
-}
-
-static inline void xs_reclassify_socket6(struct socket *sock)
-{
-}
-
static inline void xs_reclassify_socket(int family, struct socket *sock)
{
}
@@ -2008,7 +1996,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
"transport socket (%d).\n", -status);
goto out;
}
- xs_reclassify_socketu(sock);
+ xs_reclassify_socket(AF_LOCAL, sock);
dprintk("RPC: worker connecting xprt %p via AF_LOCAL to %s\n",
xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);