From fe943d50425b6646606f8ef1ef8b8d4975fdbee2 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 12 Apr 2018 12:04:55 +0800 Subject: libceph, rbd: add error handling for osd_req_op_cls_init() Add proper error handling for osd_req_op_cls_init() to replace BUG_ON statement when failing from memory allocation. Signed-off-by: Chengguang Xu Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index d2667e5dddc3..08b5fc1f90cc 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -767,7 +767,7 @@ void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_dup_last); -void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, +int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method) { struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, @@ -779,7 +779,9 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, BUG_ON(opcode != CEPH_OSD_OP_CALL); pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); - BUG_ON(!pagelist); + if (!pagelist) + return -ENOMEM; + ceph_pagelist_init(pagelist); op->cls.class_name = class; @@ -799,6 +801,7 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); op->indata_len = payload_len; + return 0; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -4928,7 +4931,10 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, if (ret) goto out_put_req; - osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method); + ret = osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method); + if (ret) + goto out_put_req; + if (req_page) osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len, 0, false, false); -- cgit v1.2.3 From d2935d6f758fa72290ed30bd7482675e04715b6f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 25 Apr 2018 12:17:13 +0200 Subject: libceph: get rid of more_kvec in try_write() All gotos to "more" are conditioned on con->state == OPEN, but the only thing "more" does is opening the socket if con->state == PREOPEN. Kill that label and rename "more_kvec" to "more". Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- net/ceph/messenger.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3b3d33ea9ed8..53d145637ed5 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2575,9 +2575,6 @@ static int try_write(struct ceph_connection *con) con->state != CON_STATE_OPEN) return 0; -more: - dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); - /* open the socket first? */ if (con->state == CON_STATE_PREOPEN) { BUG_ON(con->sock); @@ -2598,7 +2595,8 @@ more: } } -more_kvec: +more: + dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); BUG_ON(!con->sock); /* kvec data queued? */ @@ -2623,7 +2621,7 @@ more_kvec: ret = write_partial_message_data(con); if (ret == 1) - goto more_kvec; /* we need to send the footer, too! */ + goto more; /* we need to send the footer, too! */ if (ret == 0) goto out; if (ret < 0) { @@ -2659,8 +2657,6 @@ out: return ret; } - - /* * Read what we can from the socket. */ -- cgit v1.2.3 From e5c9388399e012ed919ad5dac818a11ed1391b18 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 27 Apr 2018 18:58:47 +0200 Subject: libceph: use MSG_TRUNC for discarding received bytes Avoid a copy into the "skip buffer". Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 53d145637ed5..c6413c360771 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -168,12 +168,6 @@ static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; static struct lock_class_key socket_class; #endif -/* - * When skipping (ignoring) a block of input we read it into a "skip - * buffer," which is this many bytes in size. - */ -#define SKIP_BUF_SIZE 1024 - static void queue_con(struct ceph_connection *con); static void cancel_con(struct ceph_connection *con); static void ceph_con_workfn(struct work_struct *); @@ -520,12 +514,18 @@ static int ceph_tcp_connect(struct ceph_connection *con) return 0; } +/* + * If @buf is NULL, discard up to @len bytes. + */ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) { struct kvec iov = {buf, len}; struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; int r; + if (!buf) + msg.msg_flags |= MSG_TRUNC; + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, len); r = sock_recvmsg(sock, &msg, msg.msg_flags); if (r == -EAGAIN) @@ -2717,16 +2717,11 @@ more: if (con->in_base_pos < 0) { /* * skipping + discarding content. - * - * FIXME: there must be a better way to do this! */ - static char buf[SKIP_BUF_SIZE]; - int skip = min((int) sizeof (buf), -con->in_base_pos); - - dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); - ret = ceph_tcp_recvmsg(con->sock, buf, skip); + ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); if (ret <= 0) goto out; + dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); con->in_base_pos += ret; if (con->in_base_pos) goto more; -- cgit v1.2.3 From 66850df58529eefc61cb96b895991508547503bf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 15 May 2018 15:47:58 +0200 Subject: libceph: introduce ceph_osdc_abort_requests() This will be used by the filesystem for "umount -f". Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 ++ net/ceph/osd_client.c | 67 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 64 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index b73dd7ebe585..874c31c01f80 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -347,6 +347,7 @@ struct ceph_osd_client { struct rb_root linger_map_checks; atomic_t num_requests; atomic_t num_homeless; + int abort_err; struct delayed_work timeout_work; struct delayed_work osds_timeout_work; #ifdef CONFIG_DEBUG_FS @@ -378,6 +379,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); +void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); extern void osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u32 flags); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 08b5fc1f90cc..a7e090d2c957 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1058,6 +1058,38 @@ EXPORT_SYMBOL(ceph_osdc_new_request); DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node) +/* + * Call @fn on each OSD request as long as @fn returns 0. + */ +static void for_each_request(struct ceph_osd_client *osdc, + int (*fn)(struct ceph_osd_request *req, void *arg), + void *arg) +{ + struct rb_node *n, *p; + + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + for (p = rb_first(&osd->o_requests); p; ) { + struct ceph_osd_request *req = + rb_entry(p, struct ceph_osd_request, r_node); + + p = rb_next(p); + if (fn(req, arg)) + return; + } + } + + for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) { + struct ceph_osd_request *req = + rb_entry(p, struct ceph_osd_request, r_node); + + p = rb_next(p); + if (fn(req, arg)) + return; + } +} + static bool osd_homeless(struct ceph_osd *osd) { return osd->o_osd == CEPH_HOMELESS_OSD; @@ -2165,9 +2197,9 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) struct ceph_osd_client *osdc = req->r_osdc; struct ceph_osd *osd; enum calc_target_result ct_res; + int err = 0; bool need_send = false; bool promoted = false; - bool need_abort = false; WARN_ON(req->r_tid); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); @@ -2183,7 +2215,10 @@ again: goto promote; } - if (osdc->osdmap->epoch < osdc->epoch_barrier) { + if (osdc->abort_err) { + dout("req %p abort_err %d\n", req, osdc->abort_err); + err = osdc->abort_err; + } else if (osdc->osdmap->epoch < osdc->epoch_barrier) { dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch, osdc->epoch_barrier); req->r_t.paused = true; @@ -2208,7 +2243,7 @@ again: req->r_t.paused = true; maybe_request_map(osdc); if (req->r_abort_on_full) - need_abort = true; + err = -ENOSPC; } else if (!osd_homeless(osd)) { need_send = true; } else { @@ -2225,8 +2260,8 @@ again: link_request(osd, req); if (need_send) send_request(req); - else if (need_abort) - complete_request(req, -ENOSPC); + else if (err) + complete_request(req, err); mutex_unlock(&osd->lock); if (ct_res == CALC_TARGET_POOL_DNE) @@ -2340,6 +2375,28 @@ static void abort_request(struct ceph_osd_request *req, int err) complete_request(req, err); } +static int abort_fn(struct ceph_osd_request *req, void *arg) +{ + int err = *(int *)arg; + + abort_request(req, err); + return 0; /* continue iteration */ +} + +/* + * Abort all in-flight requests with @err and arrange for all future + * requests to be failed immediately. + */ +void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err) +{ + dout("%s osdc %p err %d\n", __func__, osdc, err); + down_write(&osdc->lock); + for_each_request(osdc, abort_fn, &err); + osdc->abort_err = err; + up_write(&osdc->lock); +} +EXPORT_SYMBOL(ceph_osdc_abort_requests); + static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) { if (likely(eb > osdc->epoch_barrier)) { -- cgit v1.2.3 From 0d09c57d0846537332d3649eef7e01960ffdc574 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 18 May 2018 19:34:45 +0200 Subject: libceph: no need to call flush_workqueue() before destruction destroy_workqueue() drains the workqueue before proceeding with destruction. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a7e090d2c957..bcedeea80cd5 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5081,7 +5081,6 @@ out: void ceph_osdc_stop(struct ceph_osd_client *osdc) { - flush_workqueue(osdc->notify_wq); destroy_workqueue(osdc->notify_wq); cancel_delayed_work_sync(&osdc->timeout_work); cancel_delayed_work_sync(&osdc->osds_timeout_work); -- cgit v1.2.3 From 26df726bcdfacf69335de716e7b78c517bc1df65 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 21 May 2018 15:33:48 +0200 Subject: libceph: move more code into __complete_request() Move req->r_completion wake up and req->r_kref decrement into __complete_request(). Signed-off-by: Ilya Dryomov Acked-by: Jeff Layton Reviewed-by: "Yan, Zheng" --- net/ceph/osd_client.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index bcedeea80cd5..a78f578a2da7 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2320,11 +2320,13 @@ static void finish_request(struct ceph_osd_request *req) static void __complete_request(struct ceph_osd_request *req) { - if (req->r_callback) { - dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, - req->r_tid, req->r_callback, req->r_result); + dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, + req->r_tid, req->r_callback, req->r_result); + + if (req->r_callback) req->r_callback(req); - } + complete_all(&req->r_completion); + ceph_osdc_put_request(req); } /* @@ -2337,8 +2339,6 @@ static void complete_request(struct ceph_osd_request *req, int err) req->r_result = err; finish_request(req); __complete_request(req); - complete_all(&req->r_completion); - ceph_osdc_put_request(req); } static void cancel_map_check(struct ceph_osd_request *req) @@ -3602,8 +3602,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) up_read(&osdc->lock); __complete_request(req); - complete_all(&req->r_completion); - ceph_osdc_put_request(req); return; fail_request: -- cgit v1.2.3 From 88bc1922c273c95e84a8955e657401f9bc63a80b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 21 May 2018 16:00:29 +0200 Subject: libceph: defer __complete_request() to a workqueue In the common case, req->r_callback is called by handle_reply() on the ceph-msgr worker thread without any locks. If handle_reply() fails, it is called with both osd->lock and osdc->lock. In the map check case, it is called with just osdc->lock but held for write. Finally, if the request is aborted because of -ENOSPC or by ceph_osdc_abort_requests(), it is called directly on the submitter's thread, again with both locks. req->r_callback on the submitter's thread is relatively new (introduced in 4.12) and ripe for deadlocks -- e.g. writeback worker thread waiting on itself: inode_wait_for_writeback+0x26/0x40 evict+0xb5/0x1a0 iput+0x1d2/0x220 ceph_put_wrbuffer_cap_refs+0xe0/0x2c0 [ceph] writepages_finish+0x2d3/0x410 [ceph] __complete_request+0x26/0x60 [libceph] complete_request+0x2e/0x70 [libceph] __submit_request+0x256/0x330 [libceph] submit_request+0x2b/0x30 [libceph] ceph_osdc_start_request+0x25/0x40 [libceph] ceph_writepages_start+0xdfe/0x1320 [ceph] do_writepages+0x1f/0x70 __writeback_single_inode+0x45/0x330 writeback_sb_inodes+0x26a/0x600 __writeback_inodes_wb+0x92/0xc0 wb_writeback+0x274/0x330 wb_workfn+0x2d5/0x3b0 Defer __complete_request() to a workqueue in all failure cases so it's never on the same thread as ceph_osdc_start_request() and always called with no locks held. Link: http://tracker.ceph.com/issues/23978 Signed-off-by: Ilya Dryomov Acked-by: Jeff Layton Reviewed-by: "Yan, Zheng" --- include/linux/ceph/osd_client.h | 2 ++ net/ceph/osd_client.c | 19 ++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 874c31c01f80..d4191bde95a4 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -170,6 +170,7 @@ struct ceph_osd_request { u64 r_tid; /* unique for this client */ struct rb_node r_node; struct rb_node r_mc_node; /* map check */ + struct work_struct r_complete_work; struct ceph_osd *r_osd; struct ceph_osd_request_target r_t; @@ -360,6 +361,7 @@ struct ceph_osd_client { struct ceph_msgpool msgpool_op_reply; struct workqueue_struct *notify_wq; + struct workqueue_struct *completion_wq; }; static inline bool ceph_osdmap_flag(struct ceph_osd_client *osdc, int flag) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a78f578a2da7..a4c12c37aa90 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2329,6 +2329,14 @@ static void __complete_request(struct ceph_osd_request *req) ceph_osdc_put_request(req); } +static void complete_request_workfn(struct work_struct *work) +{ + struct ceph_osd_request *req = + container_of(work, struct ceph_osd_request, r_complete_work); + + __complete_request(req); +} + /* * This is open-coded in handle_reply(). */ @@ -2338,7 +2346,9 @@ static void complete_request(struct ceph_osd_request *req, int err) req->r_result = err; finish_request(req); - __complete_request(req); + + INIT_WORK(&req->r_complete_work, complete_request_workfn); + queue_work(req->r_osdc->completion_wq, &req->r_complete_work); } static void cancel_map_check(struct ceph_osd_request *req) @@ -5058,6 +5068,10 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) if (!osdc->notify_wq) goto out_msgpool_reply; + osdc->completion_wq = create_singlethread_workqueue("ceph-completion"); + if (!osdc->completion_wq) + goto out_notify_wq; + schedule_delayed_work(&osdc->timeout_work, osdc->client->options->osd_keepalive_timeout); schedule_delayed_work(&osdc->osds_timeout_work, @@ -5065,6 +5079,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) return 0; +out_notify_wq: + destroy_workqueue(osdc->notify_wq); out_msgpool_reply: ceph_msgpool_destroy(&osdc->msgpool_op_reply); out_msgpool: @@ -5079,6 +5095,7 @@ out: void ceph_osdc_stop(struct ceph_osd_client *osdc) { + destroy_workqueue(osdc->completion_wq); destroy_workqueue(osdc->notify_wq); cancel_delayed_work_sync(&osdc->timeout_work); cancel_delayed_work_sync(&osdc->osds_timeout_work); -- cgit v1.2.3 From 4eea0fefd7e60552b36a74f49bd7064d1a5aef2d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 16 May 2018 18:21:34 +0200 Subject: libceph: use for_each_request() in ceph_osdc_abort_on_full() Scanning the trees just to see if there is anything to abort is unnecessary -- all that is needed here is to update the epoch barrier first, before we start aborting. Simplify and do the update inside the loop before calling abort_request() for the first time. The switch to for_each_request() also fixes a bug: homeless requests weren't even considered for aborting. Signed-off-by: Ilya Dryomov Acked-by: Jeff Layton Reviewed-by: "Yan, Zheng" --- net/ceph/osd_client.c | 79 +++++++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a4c12c37aa90..be274ab43d01 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2433,6 +2433,30 @@ void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) } EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier); +/* + * We can end up releasing caps as a result of abort_request(). + * In that case, we probably want to ensure that the cap release message + * has an updated epoch barrier in it, so set the epoch barrier prior to + * aborting the first request. + */ +static int abort_on_full_fn(struct ceph_osd_request *req, void *arg) +{ + struct ceph_osd_client *osdc = req->r_osdc; + bool *victims = arg; + + if (req->r_abort_on_full && + (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || + pool_full(osdc, req->r_t.target_oloc.pool))) { + if (!*victims) { + update_epoch_barrier(osdc, osdc->osdmap->epoch); + *victims = true; + } + abort_request(req, -ENOSPC); + } + + return 0; /* continue iteration */ +} + /* * Drop all pending requests that are stalled waiting on a full condition to * clear, and complete them with ENOSPC as the return code. Set the @@ -2441,61 +2465,10 @@ EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier); */ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) { - struct rb_node *n; bool victims = false; - dout("enter abort_on_full\n"); - - if (!ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && !have_pool_full(osdc)) - goto out; - - /* Scan list and see if there is anything to abort */ - for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { - struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); - struct rb_node *m; - - m = rb_first(&osd->o_requests); - while (m) { - struct ceph_osd_request *req = rb_entry(m, - struct ceph_osd_request, r_node); - m = rb_next(m); - - if (req->r_abort_on_full) { - victims = true; - break; - } - } - if (victims) - break; - } - - if (!victims) - goto out; - - /* - * Update the barrier to current epoch if it's behind that point, - * since we know we have some calls to be aborted in the tree. - */ - update_epoch_barrier(osdc, osdc->osdmap->epoch); - - for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { - struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); - struct rb_node *m; - - m = rb_first(&osd->o_requests); - while (m) { - struct ceph_osd_request *req = rb_entry(m, - struct ceph_osd_request, r_node); - m = rb_next(m); - - if (req->r_abort_on_full && - (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || - pool_full(osdc, req->r_t.target_oloc.pool))) - abort_request(req, -ENOSPC); - } - } -out: - dout("return abort_on_full barrier=%u\n", osdc->epoch_barrier); + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc)) + for_each_request(osdc, abort_on_full_fn, &victims); } static void check_pool_dne(struct ceph_osd_request *req) -- cgit v1.2.3 From 29e878201ee635940ba018bce51f4ee0f0e47a5b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 17 May 2018 16:13:07 +0200 Subject: libceph: don't warn if req->r_abort_on_full is set The "FULL or reached pool quota" warning is there to explain paused requests. No need to emit it if pausing isn't going to occur. Signed-off-by: Ilya Dryomov Acked-by: Jeff Layton Reviewed-by: "Yan, Zheng" --- net/ceph/osd_client.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index be274ab43d01..34b5334548c3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2239,11 +2239,13 @@ again: (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || pool_full(osdc, req->r_t.base_oloc.pool))) { dout("req %p full/pool_full\n", req); - pr_warn_ratelimited("FULL or reached pool quota\n"); - req->r_t.paused = true; - maybe_request_map(osdc); - if (req->r_abort_on_full) + if (req->r_abort_on_full) { err = -ENOSPC; + } else { + pr_warn_ratelimited("FULL or reached pool quota\n"); + req->r_t.paused = true; + maybe_request_map(osdc); + } } else if (!osd_homeless(osd)) { need_send = true; } else { -- cgit v1.2.3 From 6001567c14eb8e93f8bceb35fc02158a3e1f20f8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 22 May 2018 16:26:51 +0200 Subject: libceph: avoid a use-after-free during map check Sending map check after complete_request() was called is not only useless, but can lead to a use-after-free as req->r_kref decrement in __complete_request() races with map check code. Signed-off-by: Ilya Dryomov Acked-by: Jeff Layton Reviewed-by: "Yan, Zheng" --- net/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 34b5334548c3..294320400c72 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2266,7 +2266,7 @@ again: complete_request(req, err); mutex_unlock(&osd->lock); - if (ct_res == CALC_TARGET_POOL_DNE) + if (!err && ct_res == CALC_TARGET_POOL_DNE) send_map_check(req); if (promoted) -- cgit v1.2.3 From 690f951d7eb8c0529f8a367a3db9cfbfde624db4 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 30 May 2018 14:58:25 +0200 Subject: libceph: don't abort reads in ceph_osdc_abort_on_full() Don't consider reads for aborting and use ->base_oloc instead of ->target_oloc, as done in __submit_request(). Strictly speaking, we shouldn't be aborting FULL_TRY/FULL_FORCE writes either. But, there is an inconsistency in FULL_TRY/FULL_FORCE handling on the OSD side [1], so given that neither of these is used in the kernel client, leave it for when the OSD behaviour is sorted out. [1] http://tracker.ceph.com/issues/24339 Signed-off-by: Ilya Dryomov Acked-by: Jeff Layton Reviewed-by: "Yan, Zheng" --- net/ceph/osd_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 294320400c72..3d055529189c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2447,8 +2447,9 @@ static int abort_on_full_fn(struct ceph_osd_request *req, void *arg) bool *victims = arg; if (req->r_abort_on_full && + (req->r_flags & CEPH_OSD_FLAG_WRITE) && (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || - pool_full(osdc, req->r_t.target_oloc.pool))) { + pool_full(osdc, req->r_t.base_oloc.pool))) { if (!*victims) { update_epoch_barrier(osdc, osdc->osdmap->epoch); *victims = true; -- cgit v1.2.3 From c843d13caefad9f2f182f38d6bfe492c9f00e086 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 30 May 2018 16:29:14 +0200 Subject: libceph: make abort_on_full a per-osdc setting The intent behind making it a per-request setting was that it would be set for writes, but not for reads. As it is, the flag is set for all fs/ceph requests except for pool perm check stat request (technically a read). ceph_osdc_abort_on_full() skips reads since the previous commit and I don't see a use case for marking individual requests. Signed-off-by: Ilya Dryomov Acked-by: Jeff Layton Reviewed-by: "Yan, Zheng" --- fs/ceph/addr.c | 1 - fs/ceph/file.c | 1 - fs/ceph/super.c | 2 ++ include/linux/ceph/osd_client.h | 2 +- net/ceph/osd_client.c | 9 ++++----- 5 files changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5f7ad3d0df2e..ca0d5510ed50 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1935,7 +1935,6 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); wr_req->r_mtime = ci->vfs_inode.i_mtime; - wr_req->r_abort_on_full = true; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index cf0e45b10121..6b9f7f3cd237 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -895,7 +895,6 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_callback = ceph_aio_complete_req; req->r_inode = inode; req->r_priv = aio_req; - req->r_abort_on_full = true; ret = ceph_osdc_start_request(req->r_osdc, req, false); out: diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a092cdb69288..cad046aa4fd0 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -616,7 +616,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, err = PTR_ERR(fsc->client); goto fail; } + fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->osdc.abort_on_full = true; if (!fsopt->mds_namespace) { ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d4191bde95a4..0d6ee04b4c41 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -202,7 +202,6 @@ struct ceph_osd_request { struct timespec r_mtime; /* ditto */ u64 r_data_offset; /* ditto */ bool r_linger; /* don't resend on failure */ - bool r_abort_on_full; /* return ENOSPC when full */ /* internal */ unsigned long r_stamp; /* jiffies, send or check time */ @@ -348,6 +347,7 @@ struct ceph_osd_client { struct rb_root linger_map_checks; atomic_t num_requests; atomic_t num_homeless; + bool abort_on_full; /* abort w/ ENOSPC when full */ int abort_err; struct delayed_work timeout_work; struct delayed_work osds_timeout_work; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3d055529189c..05c4d27d25fe 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1030,7 +1030,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size, truncate_seq); } - req->r_abort_on_full = true; req->r_flags = flags; req->r_base_oloc.pool = layout->pool_id; req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); @@ -2239,7 +2238,7 @@ again: (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || pool_full(osdc, req->r_t.base_oloc.pool))) { dout("req %p full/pool_full\n", req); - if (req->r_abort_on_full) { + if (osdc->abort_on_full) { err = -ENOSPC; } else { pr_warn_ratelimited("FULL or reached pool quota\n"); @@ -2446,8 +2445,7 @@ static int abort_on_full_fn(struct ceph_osd_request *req, void *arg) struct ceph_osd_client *osdc = req->r_osdc; bool *victims = arg; - if (req->r_abort_on_full && - (req->r_flags & CEPH_OSD_FLAG_WRITE) && + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || pool_full(osdc, req->r_t.base_oloc.pool))) { if (!*victims) { @@ -2470,7 +2468,8 @@ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) { bool victims = false; - if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc)) + if (osdc->abort_on_full && + (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc))) for_each_request(osdc, abort_on_full_fn, &victims); } -- cgit v1.2.3 From a86f009f106cba322c608785e09c8b5be8ffe8bb Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 23 May 2018 14:46:53 +0200 Subject: libceph: allocate the locator string with GFP_NOFAIL calc_target() isn't supposed to fail with anything but POOL_DNE, in which case we report that the pool doesn't exist and fail the request with -ENOENT. Doing this for -ENOMEM is at the very least confusing and also harmful -- as the preceding requests complete, a short-lived locator string allocation is likely to succeed after a wait. (We used to call ceph_object_locator_to_pg() for a pi lookup. In theory that could fail with -ENOENT, hence the "ret != -ENOENT" warning being removed.) Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 8 ++++---- net/ceph/osd_client.c | 10 +--------- net/ceph/osdmap.c | 19 ++++++++----------- 3 files changed, 13 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index e71fb222c7c3..5675b1f09bc5 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -279,10 +279,10 @@ bool ceph_osds_changed(const struct ceph_osds *old_acting, const struct ceph_osds *new_acting, bool any_change); -int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, - const struct ceph_object_id *oid, - const struct ceph_object_locator *oloc, - struct ceph_pg *raw_pgid); +void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, + const struct ceph_object_id *oid, + const struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid); int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, const struct ceph_object_id *oid, const struct ceph_object_locator *oloc, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 05c4d27d25fe..f2584fe1246f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1430,7 +1430,6 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, bool recovery_deletes = ceph_osdmap_flag(osdc, CEPH_OSDMAP_RECOVERY_DELETES); enum calc_target_result ct_res; - int ret; t->epoch = osdc->osdmap->epoch; pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); @@ -1466,14 +1465,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, } } - ret = __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, - &pgid); - if (ret) { - WARN_ON(ret != -ENOENT); - t->osd = CEPH_HOMELESS_OSD; - ct_res = CALC_TARGET_POOL_DNE; - goto out; - } + __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, &pgid); last_pgid.pool = pgid.pool; last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 9645ffd6acfb..a7494f623451 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2145,10 +2145,10 @@ bool ceph_osds_changed(const struct ceph_osds *old_acting, * Should only be called with target_oid and target_oloc (as opposed to * base_oid and base_oloc), since tiering isn't taken into account. */ -int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, - const struct ceph_object_id *oid, - const struct ceph_object_locator *oloc, - struct ceph_pg *raw_pgid) +void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, + const struct ceph_object_id *oid, + const struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid) { WARN_ON(pi->id != oloc->pool); @@ -2164,11 +2164,8 @@ int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, int nsl = oloc->pool_ns->len; size_t total = nsl + 1 + oid->name_len; - if (total > sizeof(stack_buf)) { - buf = kmalloc(total, GFP_NOIO); - if (!buf) - return -ENOMEM; - } + if (total > sizeof(stack_buf)) + buf = kmalloc(total, GFP_NOIO | __GFP_NOFAIL); memcpy(buf, oloc->pool_ns->str, nsl); buf[nsl] = '\037'; memcpy(buf + nsl + 1, oid->name, oid->name_len); @@ -2180,7 +2177,6 @@ int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, oid->name, nsl, oloc->pool_ns->str, raw_pgid->pool, raw_pgid->seed); } - return 0; } int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, @@ -2194,7 +2190,8 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, if (!pi) return -ENOENT; - return __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid); + __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid); + return 0; } EXPORT_SYMBOL(ceph_object_locator_to_pg); -- cgit v1.2.3