From 42d42a5b0cd263757f8e519debbc744fdaefdaf7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 23 May 2016 09:24:55 -0400 Subject: SUNRPC: Small optimisation of client receive Do not queue the client receive work if we're still processing. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtsock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 0ece4ba06f06..bef3fb0abb8f 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -80,6 +80,7 @@ struct sock_xprt { #define TCP_RPC_REPLY (1UL << 6) #define XPRT_SOCK_CONNECTING 1U +#define XPRT_SOCK_DATA_READY (2) #endif /* __KERNEL__ */ -- cgit v1.2.3 From 40a5f1b19bacb2de7a051be952dee85e38c9e5f5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 27 May 2016 10:39:50 -0400 Subject: SUNRPC: RPC transport queue must be low latency rpciod can easily get congested due to the long list of queued rpc_tasks. Having the receive queue wait in turn for those tasks to complete can therefore be a bottleneck. Address the problem by separating the workqueues into: - rpciod: manages rpc_tasks - xprtiod: manages transport related work. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 1 + net/sunrpc/sched.c | 24 ++++++++++++++++++++---- net/sunrpc/xprt.c | 8 ++++---- net/sunrpc/xprtsock.c | 6 +++--- 4 files changed, 28 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 05a1809c44d9..ef780b3b5e31 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -247,6 +247,7 @@ void rpc_show_tasks(struct net *); int rpc_init_mempool(void); void rpc_destroy_mempool(void); extern struct workqueue_struct *rpciod_workqueue; +extern struct workqueue_struct *xprtiod_workqueue; void rpc_prepare_task(struct rpc_task *task); static inline int rpc_wait_for_completion_task(struct rpc_task *task) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index fcfd48d263f6..a9f786247ffb 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -54,7 +54,8 @@ static struct rpc_wait_queue delay_queue; /* * rpciod-related stuff */ -struct workqueue_struct *rpciod_workqueue; +struct workqueue_struct *rpciod_workqueue __read_mostly; +struct workqueue_struct *xprtiod_workqueue __read_mostly; /* * Disable the timer for a given RPC task. Should be called with @@ -1071,10 +1072,22 @@ static int rpciod_start(void) * Create the rpciod thread and wait for it to start. */ dprintk("RPC: creating workqueue rpciod\n"); - /* Note: highpri because network receive is latency sensitive */ - wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0); + if (!wq) + goto out_failed; rpciod_workqueue = wq; - return rpciod_workqueue != NULL; + /* Note: highpri because network receive is latency sensitive */ + wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + if (!wq) + goto free_rpciod; + xprtiod_workqueue = wq; + return 1; +free_rpciod: + wq = rpciod_workqueue; + rpciod_workqueue = NULL; + destroy_workqueue(wq); +out_failed: + return 0; } static void rpciod_stop(void) @@ -1088,6 +1101,9 @@ static void rpciod_stop(void) wq = rpciod_workqueue; rpciod_workqueue = NULL; destroy_workqueue(wq); + wq = xprtiod_workqueue; + xprtiod_workqueue = NULL; + destroy_workqueue(wq); } void diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 216a1385718a..71df082b84a9 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -220,7 +220,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_atomic(); } else - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); } /* @@ -645,7 +645,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) set_bit(XPRT_CLOSE_WAIT, &xprt->state); /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } @@ -672,7 +672,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) set_bit(XPRT_CLOSE_WAIT, &xprt->state); /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); xprt_wake_pending_tasks(xprt, -EAGAIN); out: spin_unlock_bh(&xprt->transport_lock); @@ -689,7 +689,7 @@ xprt_init_autodisconnect(unsigned long data) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); return; out_abort: spin_unlock(&xprt->transport_lock); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 62b4f5a2a331..646170d0cb86 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1095,7 +1095,7 @@ static void xs_data_ready(struct sock *sk) if (xprt->reestablish_timeout) xprt->reestablish_timeout = 0; if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) - queue_work(rpciod_workqueue, &transport->recv_worker); + queue_work(xprtiod_workqueue, &transport->recv_worker); } read_unlock_bh(&sk->sk_callback_lock); } @@ -2378,7 +2378,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) /* Start by resetting any existing state */ xs_reset_transport(transport); - queue_delayed_work(rpciod_workqueue, + queue_delayed_work(xprtiod_workqueue, &transport->connect_worker, xprt->reestablish_timeout); xprt->reestablish_timeout <<= 1; @@ -2388,7 +2388,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; } else { dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); - queue_delayed_work(rpciod_workqueue, + queue_delayed_work(xprtiod_workqueue, &transport->connect_worker, 0); } } -- cgit v1.2.3 From f1dc237c60a5fdecc83062a28a702193f881cb19 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 27 May 2016 12:59:33 -0400 Subject: SUNRPC: Reduce latency when send queue is congested Use the low latency transport workqueue to process the task that is next in line on the xprt->sending queue. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 4 ++++ net/sunrpc/sched.c | 43 +++++++++++++++++++++++++++++++++---------- net/sunrpc/xprt.c | 6 ++++-- 3 files changed, 41 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index ef780b3b5e31..817af0b4385e 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -230,6 +230,10 @@ void rpc_wake_up_queued_task(struct rpc_wait_queue *, struct rpc_task *); void rpc_wake_up(struct rpc_wait_queue *); struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); +struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, + struct rpc_wait_queue *, + bool (*)(struct rpc_task *, void *), + void *); struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *, bool (*)(struct rpc_task *, void *), void *); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index a9f786247ffb..9ae588511aaf 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -330,7 +330,8 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); * lockless RPC_IS_QUEUED() test) before we've had a chance to test * the RPC_TASK_RUNNING flag. */ -static void rpc_make_runnable(struct rpc_task *task) +static void rpc_make_runnable(struct workqueue_struct *wq, + struct rpc_task *task) { bool need_wakeup = !rpc_test_and_set_running(task); @@ -339,7 +340,7 @@ static void rpc_make_runnable(struct rpc_task *task) return; if (RPC_IS_ASYNC(task)) { INIT_WORK(&task->u.tk_work, rpc_async_schedule); - queue_work(rpciod_workqueue, &task->u.tk_work); + queue_work(wq, &task->u.tk_work); } else wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); } @@ -408,13 +409,16 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); /** - * __rpc_do_wake_up_task - wake up a single rpc_task + * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task + * @wq: workqueue on which to run task * @queue: wait queue * @task: task to be woken up * * Caller must hold queue->lock, and have cleared the task queued flag. */ -static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task) +static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, + struct rpc_task *task) { dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", task->tk_pid, jiffies); @@ -429,7 +433,7 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task __rpc_remove_wait_queue(queue, task); - rpc_make_runnable(task); + rpc_make_runnable(wq, task); dprintk("RPC: __rpc_wake_up_task done\n"); } @@ -437,15 +441,24 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task /* * Wake up a queued task while the queue lock is being held */ -static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) +static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, struct rpc_task *task) { if (RPC_IS_QUEUED(task)) { smp_rmb(); if (task->tk_waitqueue == queue) - __rpc_do_wake_up_task(queue, task); + __rpc_do_wake_up_task_on_wq(wq, queue, task); } } +/* + * Wake up a queued task while the queue lock is being held + */ +static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task); +} + /* * Wake up a task on a specific queue */ @@ -519,7 +532,8 @@ static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue) /* * Wake up the first task on the wait queue. */ -struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, +struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, bool (*func)(struct rpc_task *, void *), void *data) { struct rpc_task *task = NULL; @@ -530,7 +544,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, task = __rpc_find_next_queued(queue); if (task != NULL) { if (func(task, data)) - rpc_wake_up_task_queue_locked(queue, task); + rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); else task = NULL; } @@ -538,6 +552,15 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, return task; } + +/* + * Wake up the first task on the wait queue. + */ +struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, + bool (*func)(struct rpc_task *, void *), void *data) +{ + return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data); +} EXPORT_SYMBOL_GPL(rpc_wake_up_first); static bool rpc_wake_up_next_func(struct rpc_task *task, void *data) @@ -815,7 +838,7 @@ void rpc_execute(struct rpc_task *task) bool is_async = RPC_IS_ASYNC(task); rpc_set_active(task); - rpc_make_runnable(task); + rpc_make_runnable(rpciod_workqueue, task); if (!is_async) __rpc_execute(task); } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 71df082b84a9..8313960cac52 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -295,7 +295,8 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt)) + if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, + __xprt_lock_write_func, xprt)) return; xprt_clear_locked(xprt); } @@ -324,7 +325,8 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) return; if (RPCXPRT_CONGESTED(xprt)) goto out_unlock; - if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt)) + if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, + __xprt_lock_write_cong_func, xprt)) return; out_unlock: xprt_clear_locked(xprt); -- cgit v1.2.3 From 6b56a89833fa7903595c8d138bb4927187315cba Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Jun 2016 18:23:01 -0400 Subject: NFS: Kill NFS_INO_NFS_INO_FLUSHING: it is a performance killer filemap_datawrite() and friends already deal just fine with livelock. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 8 -------- fs/nfs/nfstrace.h | 1 - fs/nfs/write.c | 11 ----------- include/linux/nfs_fs.h | 1 - 4 files changed, 21 deletions(-) (limited to 'include') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2d39d9f9da7d..29d7477a62e8 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -359,14 +359,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file, mapping->host->i_ino, len, (long long) pos); start: - /* - * Prevent starvation issues if someone is doing a consistency - * sync-to-disk - */ - ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (ret) - return ret; /* * Wait for O_DIRECT to complete */ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 0b9e5cc9a747..fe80a1c26340 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -37,7 +37,6 @@ { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ { 1 << NFS_INO_STALE, "STALE" }, \ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ - { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e1c74d3db64d..980d44f3a84c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -657,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; - /* Stop dirtying of new pages while we sync */ - err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (err) - goto out_err; - nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, @@ -674,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); - clear_bit_unlock(NFS_INO_FLUSHING, bitlock); - smp_mb__after_atomic(); - wake_up_bit(bitlock, NFS_INO_FLUSHING); - if (err < 0) goto out_err; err = pgio.pg_error; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index d71278c3c5bd..120dd04b553c 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -205,7 +205,6 @@ struct nfs_inode { #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ -#define NFS_INO_FLUSHING (4) /* inode is flushing out data */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ -- cgit v1.2.3 From 5c6e5b60aae4347223f176966455010a5715b863 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 22 Jun 2016 14:13:12 -0400 Subject: NFS: Fix an Oops in the pNFS files and flexfiles connection setup to the DS Chris Worley reports: RIP: 0010:[] [] rpc_new_client+0x2a0/0x2e0 [sunrpc] RSP: 0018:ffff880158f6f548 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff880234f8bc00 RCX: 000000000000ea60 RDX: 0000000000074cc0 RSI: 000000000000ea60 RDI: ffff880234f8bcf0 RBP: ffff880158f6f588 R08: 000000000001ac80 R09: ffff880237003300 R10: ffff880201171000 R11: ffffea0000d75200 R12: ffffffffa03afc60 R13: ffff880230c18800 R14: 0000000000000000 R15: ffff880158f6f680 FS: 00007f0e32673740(0000) GS:ffff88023fc40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000008 CR3: 0000000234886000 CR4: 00000000001406e0 Stack: ffffffffa047a680 0000000000000000 ffff880158f6f598 ffff880158f6f680 ffff880158f6f680 ffff880234d11d00 ffff88023357f800 ffff880158f6f7d0 ffff880158f6f5b8 ffffffffa024660a ffff880158f6f5b8 ffffffffa02492ec Call Trace: [] rpc_create_xprt+0x1a/0xb0 [sunrpc] [] ? xprt_create_transport+0x13c/0x240 [sunrpc] [] rpc_create+0xc6/0x1a0 [sunrpc] [] nfs_create_rpc_client+0xf5/0x140 [nfs] [] nfs_init_client+0x3a/0xd0 [nfs] [] nfs_get_client+0x25f/0x310 [nfs] [] ? rpc_ntop+0xe8/0x100 [sunrpc] [] nfs3_set_ds_client+0xcc/0x100 [nfsv3] [] nfs4_pnfs_ds_connect+0x120/0x400 [nfsv4] [] nfs4_ff_layout_prepare_ds+0xe7/0x330 [nfs_layout_flexfiles] [] ff_layout_pg_init_write+0xcb/0x280 [nfs_layout_flexfiles] [] __nfs_pageio_add_request+0x12c/0x490 [nfs] [] nfs_pageio_add_request+0xc2/0x2a0 [nfs] [] ? nfs_pageio_init+0x75/0x120 [nfs] [] nfs_do_writepage+0x120/0x270 [nfs] [] nfs_writepage_locked+0x61/0xc0 [nfs] [] ? __percpu_counter_add+0x55/0x70 [] nfs_wb_single_page+0xef/0x1c0 [nfs] [] ? __dec_zone_page_state+0x33/0x40 [] nfs_launder_page+0x41/0x90 [nfs] [] invalidate_inode_pages2_range+0x340/0x3a0 [] invalidate_inode_pages2+0x17/0x20 [] nfs_release+0x9e/0xb0 [nfs] [] ? nfs_open+0x60/0x60 [nfs] [] nfs_file_release+0x3d/0x60 [nfs] [] __fput+0xdc/0x1e0 [] ____fput+0xe/0x10 [] task_work_run+0xc4/0xe0 [] do_exit+0x2e8/0xb30 [] ? do_audit_syscall_entry+0x6c/0x70 [] ? __audit_syscall_exit+0x1e6/0x280 [] do_group_exit+0x3f/0xa0 [] SyS_exit_group+0x14/0x20 [] system_call_fastpath+0x12/0x71 Which seems to be due to a call to utsname() when in a task exit context in order to determine the hostname to set in rpc_new_client(). In reality, what we want here is not the hostname of the current task, but the hostname that was used to set up the metadata server. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 22 ++++++++++------------ fs/nfs/internal.h | 16 ++++++++-------- fs/nfs/nfs3client.c | 8 +++++--- fs/nfs/nfs4client.c | 20 ++++++++++++-------- include/linux/nfs_xdr.h | 5 ++--- 5 files changed, 37 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 0c96528db94a..4849d0f778dc 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -367,8 +367,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init, */ struct nfs_client * nfs_get_client(const struct nfs_client_initdata *cl_init, - const struct rpc_timeout *timeparms, - const char *ip_addr, rpc_authflavor_t authflavour) { struct nfs_client *clp, *new = NULL; @@ -399,7 +397,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); new->cl_flags = cl_init->init_flags; - return rpc_ops->init_client(new, timeparms, ip_addr); + return rpc_ops->init_client(new, cl_init); } spin_unlock(&nn->nfs_client_lock); @@ -470,7 +468,7 @@ EXPORT_SYMBOL_GPL(nfs_init_timeout_values); * Create an RPC client handle */ int nfs_create_rpc_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, + const struct nfs_client_initdata *cl_init, rpc_authflavor_t flavor) { struct rpc_clnt *clnt = NULL; @@ -479,8 +477,9 @@ int nfs_create_rpc_client(struct nfs_client *clp, .protocol = clp->cl_proto, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, - .timeout = timeparms, + .timeout = cl_init->timeparms, .servername = clp->cl_hostname, + .nodename = cl_init->nodename, .program = &nfs_program, .version = clp->rpc_ops->version, .authflavor = flavor, @@ -591,14 +590,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); * nfs_init_client - Initialise an NFS2 or NFS3 client * * @clp: nfs_client to initialise - * @timeparms: timeout parameters for underlying RPC transport - * @ip_addr: IP presentation address (not used) + * @cl_init: Initialisation parameters * * Returns pointer to an NFS client, or an ERR_PTR value. */ struct nfs_client *nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr) + const struct nfs_client_initdata *cl_init) { int error; @@ -612,7 +609,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp, * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); if (error < 0) goto error; nfs_mark_client_ready(clp, NFS_CS_READY); @@ -633,6 +630,7 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_parsed_mount_data *data, struct nfs_subversion *nfs_mod) { + struct rpc_timeout timeparms; struct nfs_client_initdata cl_init = { .hostname = data->nfs_server.hostname, .addr = (const struct sockaddr *)&data->nfs_server.address, @@ -640,8 +638,8 @@ static int nfs_init_server(struct nfs_server *server, .nfs_mod = nfs_mod, .proto = data->nfs_server.protocol, .net = data->net, + .timeparms = &timeparms, }; - struct rpc_timeout timeparms; struct nfs_client *clp; int error; @@ -653,7 +651,7 @@ static int nfs_init_server(struct nfs_server *server, set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); + clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5154fa65a2f2..fa88609f85e3 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -66,13 +66,16 @@ struct nfs_clone_mount { struct nfs_client_initdata { unsigned long init_flags; - const char *hostname; - const struct sockaddr *addr; + const char *hostname; /* Hostname of the server */ + const struct sockaddr *addr; /* Address of the server */ + const char *nodename; /* Hostname of the client */ + const char *ip_addr; /* IP address of the client */ size_t addrlen; struct nfs_subversion *nfs_mod; int proto; u32 minorversion; struct net *net; + const struct rpc_timeout *timeparms; }; /* @@ -147,9 +150,8 @@ extern void nfs_umount(const struct nfs_mount_request *info); extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); -int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t); +int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, - const struct rpc_timeout *, const char *, rpc_authflavor_t); int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); @@ -338,8 +340,7 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); extern struct nfs_client *nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr); + const struct nfs_client_initdata *); /* dir.c */ extern void nfs_force_use_readdirplus(struct inode *dir); @@ -521,8 +522,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_pgio_header *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr); + const struct nfs_client_initdata *); extern int nfs40_walk_client_list(struct nfs_client *clp, struct nfs_client **result, struct rpc_cred *cred); diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 9e9fa347a948..0457b4129421 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -81,14 +81,17 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, rpc_authflavor_t au_flavor) { + struct rpc_timeout ds_timeout; struct nfs_client_initdata cl_init = { .addr = ds_addr, .addrlen = ds_addrlen, + .nodename = mds_clp->cl_rpcclient->cl_nodename, + .ip_addr = mds_clp->cl_ipaddr, .nfs_mod = &nfs_v3, .proto = ds_proto, .net = mds_clp->cl_net, + .timeparms = &ds_timeout, }; - struct rpc_timeout ds_timeout; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; @@ -99,8 +102,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, /* Use the MDS nfs_client cl_ipaddr. */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, - au_flavor); + clp = nfs_get_client(&cl_init, au_flavor); return clp; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 10410e8b5853..5fc7fbbfdcef 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -349,10 +349,10 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) * Returns pointer to an NFS client, or an ERR_PTR value. */ struct nfs_client *nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr) + const struct nfs_client_initdata *cl_init) { char buf[INET6_ADDRSTRLEN + 1]; + const char *ip_addr = cl_init->ip_addr; struct nfs_client *old; int error; @@ -370,9 +370,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); if (error < 0) goto error; @@ -793,10 +793,12 @@ static int nfs4_set_client(struct nfs_server *server, .hostname = hostname, .addr = addr, .addrlen = addrlen, + .ip_addr = ip_addr, .nfs_mod = &nfs_v4, .proto = proto, .minorversion = minorversion, .net = net, + .timeparms = timeparms, }; struct nfs_client *clp; int error; @@ -809,7 +811,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); + clp = nfs_get_client(&cl_init, authflavour); if (IS_ERR(clp)) { error = PTR_ERR(clp); goto error; @@ -847,15 +849,18 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version, rpc_authflavor_t au_flavor) { + struct rpc_timeout ds_timeout; struct nfs_client_initdata cl_init = { .addr = ds_addr, .addrlen = ds_addrlen, + .nodename = mds_clp->cl_rpcclient->cl_nodename, + .ip_addr = mds_clp->cl_ipaddr, .nfs_mod = &nfs_v4, .proto = ds_proto, .minorversion = minor_version, .net = mds_clp->cl_net, + .timeparms = &ds_timeout, }; - struct rpc_timeout ds_timeout; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; @@ -869,8 +874,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, * (section 13.1 RFC 5661). */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, - au_flavor); + clp = nfs_get_client(&cl_init, au_flavor); dprintk("<-- %s %p\n", __func__, clp); return clp; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index c304a11b5b1a..82b81a1c2438 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1596,9 +1596,8 @@ struct nfs_rpc_ops { int (*have_delegation)(struct inode *, fmode_t); int (*return_delegation)(struct inode *); struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *); - struct nfs_client * - (*init_client) (struct nfs_client *, const struct rpc_timeout *, - const char *); + struct nfs_client *(*init_client) (struct nfs_client *, + const struct nfs_client_initdata *); void (*free_client) (struct nfs_client *); struct nfs_server *(*create_server)(struct nfs_mount_info *, struct nfs_subversion *); struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *, -- cgit v1.2.3 From a5864c999de6703f7ce908f72337568520c6cad3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 3 Jun 2016 17:07:19 -0400 Subject: NFS: Do not serialise O_DIRECT reads and writes Allow dio requests to be scheduled in parallel, but ensuring that they do not conflict with buffered I/O. Signed-off-by: Trond Myklebust --- fs/nfs/Makefile | 2 +- fs/nfs/direct.c | 41 +++----------- fs/nfs/file.c | 12 ++-- fs/nfs/internal.h | 8 +++ fs/nfs/io.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfs_fs.h | 1 + 6 files changed, 174 insertions(+), 37 deletions(-) create mode 100644 fs/nfs/io.c (limited to 'include') diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 8664417955a2..6abdda209642 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o CFLAGS_nfstrace.o += -I$(src) nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ - direct.o pagelist.o read.o symlink.o unlink.o \ + io.o direct.o pagelist.o read.o symlink.o unlink.o \ write.o namespace.o mount_clnt.o nfstrace.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0169eca8eb42..6d0e88096440 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -578,17 +578,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (!count) goto out; - inode_lock(inode); - result = nfs_sync_mapping(mapping); - if (result) - goto out_unlock; - task_io_account_read(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (dreq == NULL) - goto out_unlock; + goto out; dreq->inode = inode; dreq->bytes_left = dreq->max_count = count; @@ -603,10 +598,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + nfs_start_io_direct(inode); + NFS_I(inode)->read_io += count; result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - inode_unlock(inode); + nfs_end_io_direct(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -614,13 +611,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos += result; } - nfs_direct_req_release(dreq); - return result; - out_release: nfs_direct_req_release(dreq); -out_unlock: - inode_unlock(inode); out: return result; } @@ -1008,25 +1000,12 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; - inode_lock(inode); - - result = nfs_sync_mapping(mapping); - if (result) - goto out_unlock; - - if (mapping->nrpages) { - result = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - if (result) - goto out_unlock; - } - task_io_account_write(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (!dreq) - goto out_unlock; + goto out; dreq->inode = inode; dreq->bytes_left = dreq->max_count = count; @@ -1041,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + nfs_start_io_direct(inode); + result = nfs_direct_write_schedule_iovec(dreq, iter, pos); if (mapping->nrpages) { @@ -1048,7 +1029,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos >> PAGE_SHIFT, end); } - inode_unlock(inode); + nfs_end_io_direct(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -1058,13 +1039,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) generic_write_sync(iocb, result); } } - nfs_direct_req_release(dreq); - return result; - out_release: nfs_direct_req_release(dreq); -out_unlock: - inode_unlock(inode); +out: return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 46cf0afe3c0f..9f8da9e1b23f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); + nfs_start_io_read(inode); + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); if (result > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); } + nfs_end_io_read(inode); return result; } EXPORT_SYMBOL_GPL(nfs_file_read); @@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", filp, (unsigned long) count, (unsigned long long) *ppos); - res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); + nfs_start_io_read(inode); + res = nfs_revalidate_mapping(inode, filp->f_mapping); if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); if (res > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); } + nfs_end_io_read(inode); return res; } EXPORT_SYMBOL_GPL(nfs_file_splice_read); @@ -645,14 +649,14 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; } - inode_lock(inode); + nfs_start_io_write(inode); result = generic_write_checks(iocb, from); if (result > 0) { current->backing_dev_info = inode_to_bdi(inode); result = generic_perform_write(file, from, iocb->ki_pos); current->backing_dev_info = NULL; } - inode_unlock(inode); + nfs_end_io_write(inode); if (result <= 0) goto out; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 0eb5c924886d..159b64ede82a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -411,6 +411,14 @@ extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); +/* io.c */ +extern void nfs_start_io_read(struct inode *inode); +extern void nfs_end_io_read(struct inode *inode); +extern void nfs_start_io_write(struct inode *inode); +extern void nfs_end_io_write(struct inode *inode); +extern void nfs_start_io_direct(struct inode *inode); +extern void nfs_end_io_direct(struct inode *inode); + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, diff --git a/fs/nfs/io.c b/fs/nfs/io.c new file mode 100644 index 000000000000..1fc5d1ce327e --- /dev/null +++ b/fs/nfs/io.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016 Trond Myklebust + * + * I/O and data path helper functionality. + */ + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* Call with exclusively locked inode->i_rwsem */ +static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(inode); + } +} + +/** + * nfs_start_io_read - declare the file is being used for buffered reads + * @inode - file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * NFS_INO_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +void +nfs_start_io_read(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + nfs_block_o_direct(nfsi, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * nfs_end_io_read - declare that the buffered read operation is done + * @inode - file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +nfs_end_io_read(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} + +/** + * nfs_start_io_write - declare the file is being used for buffered writes + * @inode - file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + */ +void +nfs_start_io_write(struct inode *inode) +{ + down_write(&inode->i_rwsem); + nfs_block_o_direct(NFS_I(inode), inode); +} + +/** + * nfs_end_io_write - declare that the buffered write operation is done + * @inode - file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void +nfs_end_io_write(struct inode *inode) +{ + up_write(&inode->i_rwsem); +} + +/* Call with exclusively locked inode->i_rwsem */ +static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) +{ + if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + set_bit(NFS_INO_ODIRECT, &nfsi->flags); + nfs_wb_all(inode); + } +} + +/** + * nfs_end_io_direct - declare the file is being used for direct i/o + * @inode - file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the NFS_INO_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * NFS_INO_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +void +nfs_start_io_direct(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + nfs_block_buffered(nfsi, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * nfs_end_io_direct - declare that the direct i/o operation is done + * @inode - file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +nfs_end_io_direct(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 120dd04b553c..225d17d35277 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -210,6 +210,7 @@ struct nfs_inode { #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ +#define NFS_INO_ODIRECT (12) /* I/O setting is O_DIRECT */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { -- cgit v1.2.3 From be527494e02b89e03485955b30de6c1e976a07eb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 22 Jun 2016 08:19:36 -0400 Subject: NFS: Remove unused function nfs_revalidate_mapping_protected() Clean up... Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 38 ++++---------------------------------- include/linux/nfs_fs.h | 1 - 2 files changed, 4 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6c0618eb5d57..0e0500f2bb6b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1131,14 +1131,12 @@ out: } /** - * __nfs_revalidate_mapping - Revalidate the pagecache + * nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping - * @may_lock - take inode->i_mutex? */ -static int __nfs_revalidate_mapping(struct inode *inode, - struct address_space *mapping, - bool may_lock) +int nfs_revalidate_mapping(struct inode *inode, + struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; @@ -1187,12 +1185,7 @@ static int __nfs_revalidate_mapping(struct inode *inode, nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); - if (may_lock) { - inode_lock(inode); - ret = nfs_invalidate_mapping(inode, mapping); - inode_unlock(inode); - } else - ret = nfs_invalidate_mapping(inode, mapping); + ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); @@ -1202,29 +1195,6 @@ out: return ret; } -/** - * nfs_revalidate_mapping - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - */ -int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) -{ - return __nfs_revalidate_mapping(inode, mapping, false); -} - -/** - * nfs_revalidate_mapping_protected - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - * - * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex - * while invalidating the mapping. - */ -int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) -{ - return __nfs_revalidate_mapping(inode, mapping, true); -} - static bool nfs_file_has_writers(struct nfs_inode *nfsi) { struct inode *inode = &nfsi->vfs_inode; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 225d17d35277..810124b33327 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -351,7 +351,6 @@ extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *ino extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); extern int nfs_revalidate_mapping_rcu(struct inode *inode); -extern int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping); extern int nfs_setattr(struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, -- cgit v1.2.3 From 65b80179f9b8171b74625febf3457f41e792fa23 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 29 Jun 2016 13:55:06 -0400 Subject: xprtrdma: No direct data placement with krb5i and krb5p Direct data placement is not allowed when using flavors that guarantee integrity or privacy. When such security flavors are in effect, don't allow the use of Read and Write chunks for moving individual data items. All messages larger than the inline threshold are sent via Long Call or Long Reply. On my systems (CX-3 Pro on FDR), for small I/O operations, the use of Long messages adds only around 5 usecs of latency in each direction. Note that when integrity or encryption is used, the host CPU touches every byte in these messages. Even if it could be used, data movement offload doesn't buy much in this case. Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- include/linux/sunrpc/auth.h | 3 +++ include/linux/sunrpc/gss_api.h | 2 ++ net/sunrpc/auth_gss/auth_gss.c | 2 ++ net/sunrpc/auth_gss/gss_krb5_mech.c | 2 ++ net/sunrpc/auth_gss/gss_mech_switch.c | 12 ++++++++++++ net/sunrpc/xprtrdma/rpc_rdma.c | 12 ++++++++++-- 6 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 899791573a40..3a40287b4d27 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -107,6 +107,9 @@ struct rpc_auth { /* per-flavor data */ }; +/* rpc_auth au_flags */ +#define RPCAUTH_AUTH_DATATOUCH 0x00000002 + struct rpc_auth_create_args { rpc_authflavor_t pseudoflavor; const char *target_name; diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index 1f911ccb2a75..68ec78c1aa48 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -73,6 +73,7 @@ u32 gss_delete_sec_context( rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 qop, u32 service); u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor); +bool gss_pseudoflavor_to_datatouch(struct gss_api_mech *, u32 pseudoflavor); char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service); struct pf_desc { @@ -81,6 +82,7 @@ struct pf_desc { u32 service; char *name; char *auth_domain_name; + bool datatouch; }; /* Different mechanisms (e.g., krb5 or spkm3) may implement gss-api, and diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index e64ae93d5b4f..bca3537efffd 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1017,6 +1017,8 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) auth->au_rslack = GSS_VERF_SLACK >> 2; auth->au_ops = &authgss_ops; auth->au_flavor = flavor; + if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) + auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; atomic_set(&auth->au_count, 1); kref_init(&gss_auth->kref); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 65427492b1c9..60595835317a 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = { .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_INTEGRITY, .name = "krb5i", + .datatouch = true, }, [2] = { .pseudoflavor = RPC_AUTH_GSS_KRB5P, .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_PRIVACY, .name = "krb5p", + .datatouch = true, }, }; diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 7063d856a598..5fec3abbe19b 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) } EXPORT_SYMBOL(gss_pseudoflavor_to_service); +bool +gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) + return gm->gm_pfs[i].datatouch; + } + return false; +} + char * gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) { diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index dac2990ae2f7..a47f170b20ef 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -570,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpcrdma_req *req = rpcr_to_rdmar(rqst); enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; + bool ddp_allowed; ssize_t hdrlen; size_t rpclen; __be32 *iptr; @@ -586,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); headerp->rm_type = rdma_msg; + /* When the ULP employs a GSS flavor that guarantees integrity + * or privacy, direct data placement of individual data items + * is not allowed. + */ + ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & + RPCAUTH_AUTH_DATATOUCH); + /* * Chunks needed for results? * @@ -597,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) */ if (rpcrdma_results_inline(r_xprt, rqst)) wtype = rpcrdma_noch; - else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) + else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) wtype = rpcrdma_writech; else wtype = rpcrdma_replych; @@ -620,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) rtype = rpcrdma_noch; rpcrdma_inline_pullup(rqst); rpclen = rqst->rq_svec[0].iov_len; - } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { + } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { rtype = rpcrdma_readch; rpclen = rqst->rq_svec[0].iov_len; rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); -- cgit v1.2.3 From ce52914eb76efd62aa48d738cf845b37852bf920 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Tue, 7 Jun 2016 15:14:48 -0400 Subject: sunrpc: move NO_CRKEY_TIMEOUT to the auth->au_flags A generic_cred can be used to look up a unx_cred or a gss_cred, so it's not really safe to use the the generic_cred->acred->ac_flags to store the NO_CRKEY_TIMEOUT flag. A lookup for a unx_cred triggered while the KEY_EXPIRE_SOON flag is already set will cause both NO_CRKEY_TIMEOUT and KEY_EXPIRE_SOON to be set in the ac_flags, leaving the user associated with the auth_cred to be in a state where they're perpetually doing 4K NFS_FILE_SYNC writes. This can be reproduced as follows: 1. Mount two NFS filesystems, one with sec=krb5 and one with sec=sys. They do not need to be the same export, nor do they even need to be from the same NFS server. Also, v3 is fine. $ sudo mount -o v3,sec=krb5 server1:/export /mnt/krb5 $ sudo mount -o v3,sec=sys server2:/export /mnt/sys 2. As the normal user, before accessing the kerberized mount, kinit with a short lifetime (but not so short that renewing the ticket would leave you within the 4-minute window again by the time the original ticket expires), e.g. $ kinit -l 10m -r 60m 3. Do some I/O to the kerberized mount and verify that the writes are wsize, UNSTABLE: $ dd if=/dev/zero of=/mnt/krb5/file bs=1M count=1 4. Wait until you're within 4 minutes of key expiry, then do some more I/O to the kerberized mount to ensure that RPC_CRED_KEY_EXPIRE_SOON gets set. Verify that the writes are 4K, FILE_SYNC: $ dd if=/dev/zero of=/mnt/krb5/file bs=1M count=1 5. Now do some I/O to the sec=sys mount. This will cause RPC_CRED_NO_CRKEY_TIMEOUT to be set: $ dd if=/dev/zero of=/mnt/sys/file bs=1M count=1 6. Writes for that user will now be permanently 4K, FILE_SYNC for that user, regardless of which mount is being written to, until you reboot the client. Renewing the kerberos ticket (assuming it hasn't already expired) will have no effect. Grabbing a new kerberos ticket at this point will have no effect either. Move the flag to the auth->au_flags field (which is currently unused) and rename it slightly to reflect that it's no longer associated with the auth_cred->ac_flags. Add the rpc_auth to the arg list of rpcauth_cred_key_to_expire and check the au_flags there too. Finally, add the inode to the arg list of nfs_ctx_key_to_expire so we can determine the rpc_auth to pass to rpcauth_cred_key_to_expire. Signed-off-by: Scott Mayhew Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 4 ++-- fs/nfs/internal.h | 2 +- fs/nfs/write.c | 6 ++++-- include/linux/sunrpc/auth.h | 6 ++++-- net/sunrpc/auth.c | 4 +++- net/sunrpc/auth_generic.c | 9 +-------- net/sunrpc/auth_gss/auth_gss.c | 1 + net/sunrpc/auth_null.c | 1 + net/sunrpc/auth_unix.c | 1 + 9 files changed, 18 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 717a8d6af52d..6bcd8913e8a9 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -432,7 +432,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, return status; NFS_I(mapping->host)->write_io += copied; - if (nfs_ctx_key_to_expire(ctx)) { + if (nfs_ctx_key_to_expire(ctx, mapping->host)) { status = nfs_wb_all(mapping->host); if (status < 0) return status; @@ -645,7 +645,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode) ctx = nfs_file_open_context(filp); if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || - nfs_ctx_key_to_expire(ctx)) + nfs_ctx_key_to_expire(ctx, inode)) return 1; return 0; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index fa88609f85e3..d2260e67334f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -497,7 +497,7 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo, struct inode *inode, struct nfs_direct_req *dreq); int nfs_key_timeout_notify(struct file *filp, struct inode *inode); -bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); #ifdef CONFIG_MIGRATION diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e1c74d3db64d..0b949a06b297 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1195,9 +1195,11 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode) /* * Test if the open context credential key is marked to expire soon. */ -bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode) { - return rpcauth_cred_key_to_expire(ctx->cred); + struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth; + + return rpcauth_cred_key_to_expire(auth, ctx->cred); } /* diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 899791573a40..f890a295a7ff 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -37,7 +37,6 @@ struct rpcsec_gss_info; /* auth_cred ac_flags bits */ enum { - RPC_CRED_NO_CRKEY_TIMEOUT = 0, /* underlying cred has no key timeout */ RPC_CRED_KEY_EXPIRE_SOON = 1, /* underlying cred key will expire soon */ RPC_CRED_NOTIFY_TIMEOUT = 2, /* nofity generic cred when underlying key will expire soon */ @@ -82,6 +81,9 @@ struct rpc_cred { #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 +/* rpc_auth au_flags */ +#define RPCAUTH_AUTH_NO_CRKEY_TIMEOUT 0x0001 /* underlying cred has no key timeout */ + /* * Client authentication handle */ @@ -196,7 +198,7 @@ void rpcauth_destroy_credcache(struct rpc_auth *); void rpcauth_clear_credcache(struct rpc_cred_cache *); int rpcauth_key_timeout_notify(struct rpc_auth *, struct rpc_cred *); -bool rpcauth_cred_key_to_expire(struct rpc_cred *); +bool rpcauth_cred_key_to_expire(struct rpc_auth *, struct rpc_cred *); char * rpcauth_stringify_acceptor(struct rpc_cred *); static inline diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 040ff627c18a..696eb39fc1cb 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -359,8 +359,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred) EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); bool -rpcauth_cred_key_to_expire(struct rpc_cred *cred) +rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred) { + if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) + return false; if (!cred->cr_ops->crkey_to_expire) return false; return cred->cr_ops->crkey_to_expire(cred); diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 54dd3fdead54..168219535a34 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) /* Fast track for non crkey_timeout (no key) underlying credentials */ - if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags)) + if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) return 0; /* Fast track for the normal case */ @@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) if (IS_ERR(tcred)) return -EACCES; - if (!tcred->cr_ops->crkey_timeout) { - set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags); - ret = 0; - goto out_put; - } - /* Test for the almost error case */ ret = tcred->cr_ops->crkey_timeout(tcred); if (ret != 0) { @@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); } -out_put: put_rpccred(tcred); return ret; } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index e64ae93d5b4f..813a3cdfb573 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1015,6 +1015,7 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; auth->au_rslack = GSS_VERF_SLACK >> 2; + auth->au_flags = 0; auth->au_ops = &authgss_ops; auth->au_flavor = flavor; atomic_set(&auth->au_count, 1); diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 8d9eb4d5ddd8..4d17376b2acb 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -115,6 +115,7 @@ static struct rpc_auth null_auth = { .au_cslack = NUL_CALLSLACK, .au_rslack = NUL_REPLYSLACK, + .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authnull_ops, .au_flavor = RPC_AUTH_NULL, .au_count = ATOMIC_INIT(0), diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 9f65452b7cbc..a99278c984e8 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -228,6 +228,7 @@ static struct rpc_auth unix_auth = { .au_cslack = UNX_CALLSLACK, .au_rslack = NUL_REPLYSLACK, + .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, .au_count = ATOMIC_INIT(0), -- cgit v1.2.3