From 3ff5f385b1449a07372d51fb89ca94dbfb6a3be2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 15 Feb 2013 22:10:17 -0600 Subject: libceph: fix an osd request memory leak If an invalid layout is provided to ceph_osdc_new_request(), its call to calc_layout() might return an error. At that point in the function we've already allocated an osd request structure, so we need to free it (drop a reference) in the event such an error occurs. The only other value calc_layout() will return is 0, so make that explicit in the successful case. This resolves: http://tracker.ceph.com/issues/4240 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index d730dd4d8eb2..cf4e15bfe0db 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -109,7 +109,7 @@ static int calc_layout(struct ceph_vino vino, snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); - return r; + return 0; } /* @@ -470,8 +470,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, /* calculate max write size */ r = calc_layout(vino, layout, off, plen, req, ops); - if (r < 0) + if (r < 0) { + ceph_osdc_put_request(req); return ERR_PTR(r); + } req->r_file_layout = *layout; /* keep a copy */ /* in case it differs from natural (file) alignment that -- cgit v1.2.3 From 07c09b725543ff2958c11522d583f90f7fdba735 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 15 Feb 2013 22:10:17 -0600 Subject: libceph: make ceph_msg->bio_seg be unsigned The bio_seg field is used by the ceph messenger in iterating through a bio. It should never have a negative value, so make it an unsigned. (I contemplated making it unsigned short to match the struct bio definition, but it offered no benefit.) Change variables used to hold bio_seg values to all be unsigned as well. Change two variable names in init_bio_iter() to match the convention used everywhere else. 
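As a quick illustration of the rationale (a hedged userspace sketch, not kernel code; all names here are made up), an unsigned index matches the field's actual domain and avoids signed/unsigned comparison warnings against unsigned bounds:

#include <stdio.h>

int main(void)
{
	unsigned int bio_seg = 0;		/* current segment; can never go negative */
	const unsigned int seg_count = 4;	/* bounds are unsigned too */

	while (bio_seg < seg_count) {		/* no -Wsign-compare mismatch */
		printf("segment %u\n", bio_seg);
		bio_seg++;
	}
	return 0;
}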
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger.c | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 60903e0f665c..8297288a66e0 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -86,7 +86,7 @@ struct ceph_msg { #ifdef CONFIG_BLOCK struct bio *bio; /* instead of pages/pagelist */ struct bio *bio_iter; /* bio iterator */ - int bio_seg; /* current bio segment */ + unsigned int bio_seg; /* current bio segment */ #endif /* CONFIG_BLOCK */ struct ceph_pagelist *trail; /* the trailing part of the data */ bool front_is_vmalloc; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 2c0669fb54e3..c06f94009d73 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -697,18 +697,19 @@ static void con_out_kvec_add(struct ceph_connection *con, } #ifdef CONFIG_BLOCK -static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) +static void init_bio_iter(struct bio *bio, struct bio **bio_iter, + unsigned int *bio_seg) { if (!bio) { - *iter = NULL; - *seg = 0; + *bio_iter = NULL; + *bio_seg = 0; return; } - *iter = bio; - *seg = bio->bi_idx; + *bio_iter = bio; + *bio_seg = (unsigned int) bio->bi_idx; } -static void iter_bio_next(struct bio **bio_iter, int *seg) +static void iter_bio_next(struct bio **bio_iter, unsigned int *seg) { if (*bio_iter == NULL) return; @@ -1818,7 +1819,8 @@ static int read_partial_message_pages(struct ceph_connection *con, #ifdef CONFIG_BLOCK static int read_partial_message_bio(struct ceph_connection *con, - struct bio **bio_iter, int *bio_seg, + struct bio **bio_iter, + unsigned int *bio_seg, unsigned int data_len, bool do_datacrc) { struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); -- cgit v1.2.3 From 47a05811b656915789bdd4c7e8cc18007e09c56d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 15 Feb 2013 22:10:17 -0600 Subject: libceph: pass object number back to calc_layout() caller Have calc_layout() pass the computed object number back to its caller. (This is a small step to simplify review.) Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cf4e15bfe0db..f4bdb6a69588 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -67,16 +67,15 @@ static int calc_layout(struct ceph_vino vino, struct ceph_file_layout *layout, u64 off, u64 *plen, struct ceph_osd_request *req, - struct ceph_osd_req_op *op) + struct ceph_osd_req_op *op, u64 *bno) { u64 orig_len = *plen; - u64 bno = 0; u64 objoff = 0; u64 objlen = 0; int r; /* object extent? 
*/ - r = ceph_calc_file_object_mapping(layout, off, orig_len, &bno, + r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, &objoff, &objlen); if (r < 0) return r; @@ -104,9 +103,9 @@ static int calc_layout(struct ceph_vino vino, op->payload_len = *plen; dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", - bno, objoff, objlen, req->r_num_pages); + *bno, objoff, objlen, req->r_num_pages); - snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); + snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, *bno); req->r_oid_len = strlen(req->r_oid); return 0; @@ -449,6 +448,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_osd_req_op ops[2]; struct ceph_osd_request *req; unsigned int num_op = 1; + u64 bno = 0; int r; memset(&ops, 0, sizeof ops); @@ -469,11 +469,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_flags = flags; /* calculate max write size */ - r = calc_layout(vino, layout, off, plen, req, ops); + r = calc_layout(vino, layout, off, plen, req, ops, &bno); if (r < 0) { ceph_osdc_put_request(req); return ERR_PTR(r); } + req->r_file_layout = *layout; /* keep a copy */ /* in case it differs from natural (file) alignment that -- cgit v1.2.3 From dbe0fc4188ee568d6e26fe938a653f01e18d6f4e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 15 Feb 2013 22:10:17 -0600 Subject: libceph: format target object name in caller Move the formatting of the object name (oid) to use for an object request into the caller of calc_layout(). This makes the "vino" parameter no longer necessary, so get rid of it. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f4bdb6a69588..df72234e66e4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -63,9 +63,7 @@ static int op_has_extent(int op) * * fill osd op in request message. 
*/ -static int calc_layout(struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 *plen, +static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, struct ceph_osd_request *req, struct ceph_osd_req_op *op, u64 *bno) { @@ -105,9 +103,6 @@ static int calc_layout(struct ceph_vino vino, dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", *bno, objoff, objlen, req->r_num_pages); - snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, *bno); - req->r_oid_len = strlen(req->r_oid); - return 0; } @@ -469,7 +464,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_flags = flags; /* calculate max write size */ - r = calc_layout(vino, layout, off, plen, req, ops, &bno); + r = calc_layout(layout, off, plen, req, ops, &bno); if (r < 0) { ceph_osdc_put_request(req); return ERR_PTR(r); @@ -477,6 +472,9 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_file_layout = *layout; /* keep a copy */ + snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); + req->r_oid_len = strlen(req->r_oid); + /* in case it differs from natural (file) alignment that calc_layout filled in for us */ req->r_num_pages = calc_pages_for(page_align, *plen); -- cgit v1.2.3 From 60cf5992d96dd5b97baf74cd400d6e05f7f2c93e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 15 Feb 2013 22:10:17 -0600 Subject: libceph: don't pass request to calc_layout() The only remaining reason to pass the osd request to calc_layout() is to fill in its r_num_pages and r_page_alignment fields. Once it fills those in, it doesn't do anything more with them. We can therefore move those assignments into the caller, and get rid of the "req" parameter entirely. Note, however, that the only caller is ceph_osdc_new_request(), and that immediately overwrites those fields with values based on its passed-in page offset. So the assignment inside calc_layout() was redundant anyway. This resolves: http://tracker.ceph.com/issues/4262 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index df72234e66e4..29e4fe09e31a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -64,7 +64,6 @@ static int op_has_extent(int op) * fill osd op in request message. 
*/ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, - struct ceph_osd_request *req, struct ceph_osd_req_op *op, u64 *bno) { u64 orig_len = *plen; @@ -95,13 +94,10 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, op->extent.truncate_size = osize; } } - req->r_num_pages = calc_pages_for(off, *plen); - req->r_page_alignment = off & ~PAGE_MASK; if (op->op == CEPH_OSD_OP_WRITE) op->payload_len = *plen; - dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", - *bno, objoff, objlen, req->r_num_pages); + dout("calc_layout bno=%llx %llu~%llu\n", *bno, objoff, objlen); return 0; } @@ -464,7 +460,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_flags = flags; /* calculate max write size */ - r = calc_layout(layout, off, plen, req, ops, &bno); + r = calc_layout(layout, off, plen, ops, &bno); if (r < 0) { ceph_osdc_put_request(req); return ERR_PTR(r); @@ -475,8 +471,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); - /* in case it differs from natural (file) alignment that - calc_layout filled in for us */ + /* The alignment may differ from the natural (file) alignment */ + req->r_num_pages = calc_pages_for(page_align, *plen); req->r_page_alignment = page_align; -- cgit v1.2.3 From d4b515fa10dd52a2aef88df7299e9f3a8ab0957a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 25 Feb 2013 17:35:46 -0600 Subject: libceph: distinguish page array and pagelist count Use distinct fields for tracking the number of pages in a message's page array and in a message's page list. Currently only one or the other is used at a time, but that will be changing soon. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- fs/ceph/mds_client.c | 4 ++-- include/linux/ceph/messenger.h | 3 ++- net/ceph/messenger.c | 14 ++++++++------ net/ceph/osd_client.c | 4 ++-- 4 files changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 442880d099c9..5c17705f88b1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1719,7 +1719,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); msg->pages = req->r_pages; - msg->nr_pages = req->r_num_pages; + msg->page_count = req->r_num_pages; msg->hdr.data_len = cpu_to_le32(req->r_data_len); msg->hdr.data_off = cpu_to_le16(0); @@ -2600,10 +2600,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, } reply->pagelist = pagelist; + reply->pagelist_count = calc_pages_for(0, pagelist->length); if (recon_state.flock) reply->hdr.version = cpu_to_le16(2); reply->hdr.data_len = cpu_to_le32(pagelist->length); - reply->nr_pages = calc_pages_for(0, pagelist->length); ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 8297288a66e0..1b08349a413c 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -75,9 +75,10 @@ struct ceph_msg { struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; struct page **pages; /* data payload. NOT OWNER. 
*/ - unsigned nr_pages; /* size of page array */ + unsigned page_count; /* size of page array */ unsigned page_alignment; /* io offset in first page */ struct ceph_pagelist *pagelist; /* instead of pages */ + unsigned int pagelist_count; /* number of pages in pagelist */ struct ceph_connection *con; struct list_head list_head; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index c06f94009d73..9d8abb0a7cef 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -813,7 +813,7 @@ static void prepare_write_message(struct ceph_connection *con) m, con->out_seq, le16_to_cpu(m->hdr.type), le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), le32_to_cpu(m->hdr.data_len), - m->nr_pages); + m->page_count); BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); /* tag + hdr + front + middle */ @@ -1072,7 +1072,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) const size_t trail_off = data_len - trail_len; dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", - con, msg, con->out_msg_pos.page, msg->nr_pages, + con, msg, con->out_msg_pos.page, msg->page_count, con->out_msg_pos.page_pos); /* @@ -2715,9 +2715,10 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, m->middle = NULL; /* data */ - m->nr_pages = 0; + m->page_count = 0; m->page_alignment = 0; m->pages = NULL; + m->pagelist_count = 0; m->pagelist = NULL; #ifdef CONFIG_BLOCK m->bio = NULL; @@ -2890,13 +2891,14 @@ void ceph_msg_last_put(struct kref *kref) ceph_buffer_put(m->middle); m->middle = NULL; } - m->nr_pages = 0; + m->page_count = 0; m->pages = NULL; if (m->pagelist) { ceph_pagelist_release(m->pagelist); kfree(m->pagelist); m->pagelist = NULL; + m->pagelist_count = 0; } m->trail = NULL; @@ -2910,8 +2912,8 @@ EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { - pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg, - msg->front_max, msg->nr_pages); + pr_debug("msg_dump %p (front_max %d page_count %d)\n", msg, - msg->front_max, msg->page_count); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 29e4fe09e31a..c3d8c6904df3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1742,7 +1742,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, int rc = 0; req->r_request->pages = req->r_pages; - req->r_request->nr_pages = req->r_num_pages; + req->r_request->page_count = req->r_num_pages; #ifdef CONFIG_BLOCK req->r_request->bio = req->r_bio; #endif @@ -2093,7 +2093,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, goto out; } m->pages = req->r_pages; - m->nr_pages = req->r_num_pages; + m->page_count = req->r_num_pages; m->page_alignment = req->r_page_alignment; #ifdef CONFIG_BLOCK m->bio = req->r_bio; -- cgit v1.2.3 From f51a822c315e9d4c4c67247bea10e4b8eb795af1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: set page alignment in start_request() The page alignment field for a request is currently set in ceph_osdc_build_request(). It's not needed at that point, nor does either of its callers need that value assigned at any point before they call ceph_osdc_start_request(). So move that assignment into ceph_osdc_start_request(). 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index c3d8c6904df3..1d9ebf967b00 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -399,7 +399,6 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, data_len += len; } req->r_request->hdr.data_len = cpu_to_le32(data_len); - req->r_request->page_alignment = req->r_page_alignment; BUG_ON(p > msg->front.iov_base + msg->front.iov_len); msg_size = p - msg->front.iov_base; @@ -1743,6 +1742,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, req->r_request->pages = req->r_pages; req->r_request->page_count = req->r_num_pages; + req->r_request->page_alignment = req->r_page_alignment; #ifdef CONFIG_BLOCK req->r_request->bio = req->r_bio; #endif -- cgit v1.2.3 From 0d5af1643535508f82d6bcc2b9b93b180e8c3f4b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 27 Feb 2013 10:26:25 -0600 Subject: libceph: complete lingering requests only once An osd request marked to linger will be re-submitted in the event a connection to the target osd gets dropped. Currently, if there is a callback function associated with a request it will be called each time a request is submitted--which for lingering requests can be more than once. Change it so a request--including lingering ones--will get completed (from the perspective of the user of the osd client) exactly once. This resolves: http://tracker.ceph.com/issues/3967 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 1 + net/ceph/osd_client.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 1dd5d466b6f9..a79f833bba4a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -85,6 +85,7 @@ struct ceph_osd_request { s32 r_reply_op_result[CEPH_OSD_MAX_OP]; int r_got_reply; int r_linger; + int r_completed; struct ceph_osd_client *r_osdc; struct kref r_kref; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 1d9ebf967b00..a28c976ae3ae 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1174,6 +1174,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, u32 reassert_epoch; u64 reassert_version; u32 osdmap_epoch; + int already_completed; int i; tid = le64_to_cpu(msg->hdr.tid); @@ -1282,7 +1283,11 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, ((flags & CEPH_OSD_FLAG_WRITE) == 0)) __unregister_request(osdc, req); + already_completed = req->r_completed; + req->r_completed = 1; mutex_unlock(&osdc->request_mutex); + if (already_completed) + goto done; if (req->r_callback) req->r_callback(req, msg); -- cgit v1.2.3 From 8f63ca2d23c7922b24d7b95e54740ec29c859379 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 4 Mar 2013 11:08:29 -0600 Subject: libceph: fix wrong opcode use in osd_req_encode_op() The new cases added to osd_req_encode_op() caused a new sparse error, which highlighted an existing problem that had been overlooked since it was originally checked in. When an unsupported opcode is found the destination rather than the source opcode was being used in the error message. The two differ in their byte order, and we want to be using the one in the source. Fix the problem in both spots. 
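To see why the two copies differ, here is a hedged userspace sketch (htole16() from <endian.h> standing in for the kernel's cpu_to_le16(); the opcode value is made up). The destination holds the little-endian wire encoding, so on a big-endian host it no longer equals the native value and must not be fed to diagnostics:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t src_op = 0x0125;		/* native CPU byte order */
	uint16_t dst_op = htole16(src_op);	/* wire (little-endian) copy */

	/* On a big-endian machine dst_op reads back as 0x2501,
	 * so error messages must print src_op instead. */
	printf("src (native) = %#x, dst (wire) = %#x\n", src_op, dst_op);
	return 0;
}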
Reported-by: Fengguang Wu Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a28c976ae3ae..d7ce457c59d9 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -249,7 +249,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, dst->watch.flag = src->watch.flag; break; default: - pr_err("unrecognized osd opcode %d\n", dst->op); + pr_err("unrecognized osd opcode %d\n", src->op); WARN_ON(1); break; case CEPH_OSD_OP_MAPEXT: @@ -307,7 +307,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, case CEPH_OSD_OP_PGLS: case CEPH_OSD_OP_PGLS_FILTER: pr_err("unsupported osd opcode %s\n", - ceph_osd_op_name(dst->op)); + ceph_osd_op_name(src->op)); WARN_ON(1); break; } -- cgit v1.2.3 From ec02a2f2ffae13e038453ae89592a8c6210f7f4d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:15 -0600 Subject: libceph: kill ceph_msg->pagelist_count The pagelist_count field is never actually used, so get rid of it. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- fs/ceph/mds_client.c | 1 - include/linux/ceph/messenger.h | 1 - net/ceph/messenger.c | 2 -- 3 files changed, 4 deletions(-) (limited to 'net') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9811caae7be4..4efbc63e0bb6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2604,7 +2604,6 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, } reply->pagelist = pagelist; - reply->pagelist_count = calc_pages_for(0, pagelist->length); if (recon_state.flock) reply->hdr.version = cpu_to_le16(2); reply->hdr.data_len = cpu_to_le32(pagelist->length); diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 1b08349a413c..6c118748a7f8 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -78,7 +78,6 @@ struct ceph_msg { unsigned page_count; /* size of page array */ unsigned page_alignment; /* io offset in first page */ struct ceph_pagelist *pagelist; /* instead of pages */ - unsigned int pagelist_count; /* number of pages in pagelist */ struct ceph_connection *con; struct list_head list_head; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9d8abb0a7cef..0f9933a5a8b0 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2718,7 +2718,6 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, m->page_count = 0; m->page_alignment = 0; m->pages = NULL; - m->pagelist_count = 0; m->pagelist = NULL; #ifdef CONFIG_BLOCK m->bio = NULL; @@ -2898,7 +2897,6 @@ void ceph_msg_last_put(struct kref *kref) ceph_pagelist_release(m->pagelist); kfree(m->pagelist); m->pagelist = NULL; - m->pagelist_count = 0; } m->trail = NULL; -- cgit v1.2.3 From 41766f87f54cc8bef023b4b0550f48753959345a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:15 -0600 Subject: libceph: rename ceph_calc_object_layout() The purpose of ceph_calc_object_layout() is to fill in the pool number and seed for a ceph_pg structure provided, based on a given osd map and target object id. Currently that function takes a file layout parameter, but the only thing used out of that is its pool number. Change the function so it takes a pool number rather than the full file layout structure. Only update the ceph_pg if the pool is found in the osd map. Get rid of a few useless lines of code from the function while we're there. 
Since the function now very clearly just fills in the ceph_pg structure it's provided, rename it ceph_calc_ceph_pg(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- fs/ceph/ioctl.c | 5 +++-- include/linux/ceph/osdmap.h | 6 ++---- net/ceph/osd_client.c | 4 ++-- net/ceph/osdmap.c | 23 +++++++++-------------- 4 files changed, 16 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 4a989345b37b..e0b4ef31d3c8 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -208,8 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout, - osdc->osdmap); + + ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, + ceph_file_layout_pg_pool(ci->i_layout)); dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); if (dl.osd >= 0) { diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index c819190d1642..167daf60c4e8 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -131,10 +131,8 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 *bno, u64 *oxoff, u64 *oxlen); /* calculate mapping of object to a placement group */ -extern int ceph_calc_object_layout(struct ceph_pg *pg, - const char *oid, - struct ceph_file_layout *fl, - struct ceph_osdmap *osdmap); +extern int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, + struct ceph_osdmap *osdmap, uint64_t pool); extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, int *acting); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index d7ce457c59d9..38d09d13bb15 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -948,8 +948,8 @@ static int __map_request(struct ceph_osd_client *osdc, int err; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_calc_object_layout(&pgid, req->r_oid, - &req->r_file_layout, osdc->osdmap); + err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, + ceph_file_layout_pg_pool(req->r_file_layout)); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4543b9aba40c..09898711f2fd 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1111,27 +1111,22 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping); * calculate an object layout (i.e. 
pgid) from an oid, * file_layout, and osdmap */ -int ceph_calc_object_layout(struct ceph_pg *pg, - const char *oid, - struct ceph_file_layout *fl, - struct ceph_osdmap *osdmap) +int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, + struct ceph_osdmap *osdmap, uint64_t pool) { - unsigned int num, num_mask; - struct ceph_pg_pool_info *pool; + struct ceph_pg_pool_info *pool_info; BUG_ON(!osdmap); - pg->pool = le32_to_cpu(fl->fl_pg_pool); - pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool); - if (!pool) + pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool); + if (!pool_info) return -EIO; - pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid)); - num = pool->pg_num; - num_mask = pool->pg_num_mask; + pg->pool = pool; + pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid)); - dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed); + dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed); return 0; } -EXPORT_SYMBOL(ceph_calc_object_layout); +EXPORT_SYMBOL(ceph_calc_ceph_pg); /* * Calculate raw osd vector for the given pgid. Return pointer to osd -- cgit v1.2.3 From 1d866d1c31110db177cbd0636b95c4cb32ca2c6e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:14 -0600 Subject: libceph: drop mutex while allocating a message In ceph_con_in_msg_alloc(), if no alloc_msg method is defined for a connection a new message is allocated with ceph_msg_new(). Drop the mutex before making this call, and make sure we're still connected when we get it back again. This is preparing for the next patch, which ensures all connections define an alloc_msg method, and then handles them all the same way. Signed-off-by: Alex Elder Reviewed-by: Greg Farnum --- net/ceph/messenger.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 0f9933a5a8b0..6ec6051e1672 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2807,13 +2807,12 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) int type = le16_to_cpu(hdr->type); int front_len = le32_to_cpu(hdr->front_len); int middle_len = le32_to_cpu(hdr->middle_len); + struct ceph_msg *msg; int ret = 0; BUG_ON(con->in_msg != NULL); if (con->ops->alloc_msg) { - struct ceph_msg *msg; - mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); @@ -2838,12 +2837,19 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) } } if (!con->in_msg) { - con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false); - if (!con->in_msg) { + mutex_unlock(&con->mutex); + msg = ceph_msg_new(type, front_len, GFP_NOFS, false); + mutex_lock(&con->mutex); + if (!msg) { pr_err("unable to allocate msg type %d len %d\n", type, front_len); return -ENOMEM; } + if (con->state != CON_STATE_OPEN) { + ceph_msg_put(msg); + return -EAGAIN; + } + con->in_msg = msg; con->in_msg->con = con->ops->get(con); BUG_ON(con->in_msg->con == NULL); con->in_msg->page_alignment = le16_to_cpu(hdr->data_off); -- cgit v1.2.3 From 53ded495c6ac9f79d9a7f91bac92ba977944306c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:14 -0600 Subject: libceph: define mds_alloc_msg() method The only user of the ceph messenger that doesn't define an alloc_msg method is the mds client. Define one, such that it works just like it did before, and simplify ceph_con_in_msg_alloc() by assuming the alloc_msg method is always present. 
This and the next patch resolve: http://tracker.ceph.com/issues/4322 Signed-off-by: Alex Elder Reviewed-by: Greg Farnum --- fs/ceph/mds_client.c | 23 ++++++++++++++++++++ net/ceph/messenger.c | 59 +++++++++++++++++----------------------------------- 2 files changed, 42 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4efbc63e0bb6..b87b24fed4b2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3473,6 +3473,28 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } +static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) +{ + struct ceph_msg *msg; + int type = (int) le16_to_cpu(hdr->type); + int front_len = (int) le32_to_cpu(hdr->front_len); + + if (con->in_msg) + return con->in_msg; + + *skip = 0; + msg = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!msg) { + pr_err("unable to allocate msg type %d len %d\n", + type, front_len); + return NULL; + } + msg->page_alignment = (unsigned int) le16_to_cpu(hdr->data_off); + + return msg; +} + static const struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, @@ -3481,6 +3503,7 @@ static const struct ceph_connection_operations mds_con_ops = { .verify_authorizer_reply = verify_authorizer_reply, .invalidate_authorizer = invalidate_authorizer, .peer_reset = peer_reset, + .alloc_msg = mds_alloc_msg, }; /* eof */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 6ec6051e1672..c7d427876dbc 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2804,55 +2804,34 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) { struct ceph_msg_header *hdr = &con->in_hdr; - int type = le16_to_cpu(hdr->type); - int front_len = le32_to_cpu(hdr->front_len); int middle_len = le32_to_cpu(hdr->middle_len); struct ceph_msg *msg; int ret = 0; BUG_ON(con->in_msg != NULL); + BUG_ON(!con->ops->alloc_msg); - if (con->ops->alloc_msg) { - mutex_unlock(&con->mutex); - msg = con->ops->alloc_msg(con, hdr, skip); - mutex_lock(&con->mutex); - if (con->state != CON_STATE_OPEN) { - if (msg) - ceph_msg_put(msg); - return -EAGAIN; - } - con->in_msg = msg; - if (con->in_msg) { - con->in_msg->con = con->ops->get(con); - BUG_ON(con->in_msg->con == NULL); - } - if (*skip) { - con->in_msg = NULL; - return 0; - } - if (!con->in_msg) { - con->error_msg = - "error allocating memory for incoming message"; - return -ENOMEM; - } - } - if (!con->in_msg) { - mutex_unlock(&con->mutex); - msg = ceph_msg_new(type, front_len, GFP_NOFS, false); - mutex_lock(&con->mutex); - if (!msg) { - pr_err("unable to allocate msg type %d len %d\n", - type, front_len); - return -ENOMEM; - } - if (con->state != CON_STATE_OPEN) { + mutex_unlock(&con->mutex); + msg = con->ops->alloc_msg(con, hdr, skip); + mutex_lock(&con->mutex); + if (con->state != CON_STATE_OPEN) { + if (msg) ceph_msg_put(msg); - return -EAGAIN; - } - con->in_msg = msg; + return -EAGAIN; + } + con->in_msg = msg; + if (con->in_msg) { con->in_msg->con = con->ops->get(con); BUG_ON(con->in_msg->con == NULL); - con->in_msg->page_alignment = le16_to_cpu(hdr->data_off); + } + if (*skip) { + con->in_msg = NULL; + return 0; + } + if (!con->in_msg) { + con->error_msg = + "error allocating memory for incoming message"; + return -ENOMEM; } memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); -- cgit v1.2.3 
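The pattern the two patches above settle on in ceph_con_in_msg_alloc() is worth spelling out: drop the connection mutex around the allocation callback, which may sleep, then re-validate the connection state once the lock is held again, since it may have changed in the window. A hedged pthreads sketch with invented names (not the kernel's types or API):

#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

#define CON_STATE_OPEN 1

struct conn {
	pthread_mutex_t mutex;
	int state;
	void *in_msg;
};

/* Stand-in for an alloc_msg callback that may block. */
static void *alloc_msg(struct conn *c)
{
	(void)c;
	return malloc(64);
}

/* Called with c->mutex held; returns 0, -ENOMEM, or -EAGAIN. */
static int con_in_msg_alloc(struct conn *c)
{
	void *msg;

	pthread_mutex_unlock(&c->mutex);
	msg = alloc_msg(c);		/* slow path runs unlocked */
	pthread_mutex_lock(&c->mutex);

	if (c->state != CON_STATE_OPEN) {
		free(msg);		/* state changed while unlocked */
		return -EAGAIN;		/* let the caller retry */
	}
	if (!msg)
		return -ENOMEM;
	c->in_msg = msg;
	return 0;
}

Returning -EAGAIN rather than publishing the message mirrors the kernel code's choice: nothing observed under the stale lock is allowed to escape; the caller owns the retry.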
From 153e5167e0e237faaefb7adf82db5748c1452d73 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:15 -0600 Subject: libceph: don't assign page info in ceph_osdc_new_request() Currently ceph_osdc_new_request() assigns an osd request's r_num_pages and r_page_alignment fields. The only thing it does after that is call ceph_osdc_build_request(), and that doesn't need those fields to be assigned. Move the assignment of those fields out of ceph_osdc_new_request() and into its caller. As a result, the page_align parameter is no longer used, so get rid of it. Note that in ceph_sync_write(), the value for req->r_num_pages had already been calculated earlier (as num_pages, and fortunately it was computed the same way). So don't bother recomputing it, but because it's not needed earlier, move that calculation after the call to ceph_osdc_new_request(). Hold off making the assignment to r_page_alignment, doing it instead where r_pages and r_num_pages are getting set. Similarly, in start_read(), nr_pages already holds the number of pages in the array (and is calculated the same way), so there's no need to recompute it. Move the assignment of the page alignment down with the others there as well. This and the next few patches are preparation work for: http://tracker.ceph.com/issues/4127 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- fs/ceph/addr.c | 7 +++++-- fs/ceph/file.c | 9 +++++---- include/linux/ceph/osd_client.h | 2 +- net/ceph/osd_client.c | 19 ++++++++----------- 4 files changed, 19 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e53f24b15b12..e324222acc82 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -309,7 +309,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, ci->i_truncate_seq, ci->i_truncate_size, - NULL, false, 0); + NULL, false); if (IS_ERR(req)) return PTR_ERR(req); @@ -338,6 +338,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } req->r_pages = pages; req->r_num_pages = nr_pages; + req->r_page_alignment = 0; req->r_callback = finish_read; req->r_inode = inode; @@ -820,7 +821,7 @@ get_more_pages: snapc, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &inode->i_mtime, true, 0); + &inode->i_mtime, true); if (IS_ERR(req)) { rc = PTR_ERR(req); @@ -828,6 +829,8 @@ get_more_pages: break; } + req->r_num_pages = calc_pages_for(0, len); + req->r_page_alignment = 0; max_pages = req->r_num_pages; alloc_page_vec(fsc, req); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 146ac9040141..f2754cdb5a03 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -527,19 +527,19 @@ more: buf_align = (unsigned long)data & ~PAGE_MASK; len = left; - /* write from beginning of first page, regardless of io alignment */ - page_align = file->f_flags & O_DIRECT ? buf_align : io_align; - num_pages = calc_pages_for(page_align, len); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), pos, &len, CEPH_OSD_OP_WRITE, flags, ci->i_snap_realm->cached_context, do_sync, ci->i_truncate_seq, ci->i_truncate_size, &mtime, false); if (IS_ERR(req)) return PTR_ERR(req); + /* write from beginning of first page, regardless of io alignment */ + page_align = file->f_flags & O_DIRECT ? 
buf_align : io_align; + num_pages = calc_pages_for(page_align, len); if (file->f_flags & O_DIRECT) { pages = ceph_get_direct_page_vector(data, num_pages, false); if (IS_ERR(pages)) { @@ -573,6 +573,7 @@ more: } req->r_pages = pages; req->r_num_pages = num_pages; + req->r_page_alignment = page_align; req->r_inode = inode; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ec33588194ef..803a9db0b475 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -247,7 +247,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, int do_sync, u32 truncate_seq, u64 truncate_size, struct timespec *mtime, - bool use_mempool, int page_align); + bool use_mempool); extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, struct ceph_osd_request *req); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 38d09d13bb15..de427cc7f6d0 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -432,8 +432,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, u32 truncate_seq, u64 truncate_size, struct timespec *mtime, - bool use_mempool, - int page_align) + bool use_mempool) { struct ceph_osd_req_op ops[2]; struct ceph_osd_request *req; @@ -470,11 +469,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); - /* The alignment may differ from the natural (file) alignment */ - - req->r_num_pages = calc_pages_for(page_align, *plen); - req->r_page_alignment = page_align; - ceph_osdc_build_request(req, off, *plen, num_op, ops, snapc, vino.snap, mtime); @@ -1945,12 +1939,14 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, req = ceph_osdc_new_request(osdc, layout, vino, off, plen, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, truncate_seq, truncate_size, NULL, - false, page_align); + false); if (IS_ERR(req)) return PTR_ERR(req); /* it may be a short read due to an object boundary */ req->r_pages = pages; + req->r_num_pages = calc_pages_for(page_align, *plen); + req->r_page_alignment = page_align; dout("readpages final extent is %llu~%llu (%d pages align %d)\n", off, *plen, req->r_num_pages, page_align); @@ -1986,14 +1982,15 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, snapc, 0, truncate_seq, truncate_size, mtime, - true, page_align); + true); if (IS_ERR(req)) return PTR_ERR(req); /* it may be a short write due to an object boundary */ req->r_pages = pages; - dout("writepages %llu~%llu (%d pages)\n", off, len, - req->r_num_pages); + req->r_num_pages = calc_pages_for(page_align, len); + req->r_page_alignment = page_align; + dout("writepages %llu~%llu (%d pages)\n", off, len, req->r_num_pages); rc = ceph_osdc_start_request(osdc, req, true); if (!rc) -- cgit v1.2.3 From 2794a82a11cfeae0890741b18b0049ddb55ce646 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: separate osd request data info Pull the fields in an osd request structure that define the data for the request out into a separate structure. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 8 +++--- fs/ceph/addr.c | 55 +++++++++++++++++++++-------------------- fs/ceph/file.c | 8 +++--- include/linux/ceph/osd_client.h | 24 ++++++++++++------ net/ceph/osd_client.c | 44 ++++++++++++++++----------------- 5 files changed, 74 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b7b7a88d9f68..0e814dfda48e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1425,12 +1425,12 @@ static struct ceph_osd_request *rbd_osd_req_create( break; /* Nothing to do */ case OBJ_REQUEST_BIO: rbd_assert(obj_request->bio_list != NULL); - osd_req->r_bio = obj_request->bio_list; + osd_req->r_data.bio = obj_request->bio_list; break; case OBJ_REQUEST_PAGES: - osd_req->r_pages = obj_request->pages; - osd_req->r_num_pages = obj_request->page_count; - osd_req->r_page_alignment = offset & ~PAGE_MASK; + osd_req->r_data.pages = obj_request->pages; + osd_req->r_data.num_pages = obj_request->page_count; + osd_req->r_data.alignment = offset & ~PAGE_MASK; break; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e324222acc82..3a1a77b0ae9f 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -243,8 +243,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ - for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { - struct page *page = req->r_pages[i]; + for (i = 0; i < req->r_data.num_pages; i++, bytes -= PAGE_CACHE_SIZE) { + struct page *page = req->r_data.pages[i]; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ @@ -258,7 +258,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) unlock_page(page); page_cache_release(page); } - kfree(req->r_pages); + kfree(req->r_data.pages); } static void ceph_unlock_page_vector(struct page **pages, int num_pages) @@ -336,9 +336,9 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - req->r_pages = pages; - req->r_num_pages = nr_pages; - req->r_page_alignment = 0; + req->r_data.pages = pages; + req->r_data.num_pages = nr_pages; + req->r_data.alignment = 0; req->r_callback = finish_read; req->r_inode = inode; @@ -374,7 +374,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, + dout("readpages %p file %p nr_pages %d max %d\n", inode, + file, nr_pages, max); while (!list_empty(page_list)) { rc = start_read(inode, page_list, max); @@ -567,7 +568,7 @@ static void writepages_finish(struct ceph_osd_request *req, * raced with a truncation and was adjusted at the osd, * so don't believe the reply. 
*/ - wrote = req->r_num_pages; + wrote = req->r_data.num_pages; } else { wrote = 0; mapping_set_error(mapping, rc); @@ -576,8 +577,8 @@ static void writepages_finish(struct ceph_osd_request *req, inode, rc, bytes, wrote); /* clean all pages */ - for (i = 0; i < req->r_num_pages; i++) { - page = req->r_pages[i]; + for (i = 0; i < req->r_data.num_pages; i++) { + page = req->r_data.pages[i]; BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -606,31 +607,31 @@ static void writepages_finish(struct ceph_osd_request *req, unlock_page(page); } dout("%p wrote+cleaned %d pages\n", inode, wrote); - ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); + ceph_put_wrbuffer_cap_refs(ci, req->r_data.num_pages, snapc); - ceph_release_pages(req->r_pages, req->r_num_pages); - if (req->r_pages_from_pool) - mempool_free(req->r_pages, + ceph_release_pages(req->r_data.pages, req->r_data.num_pages); + if (req->r_data.pages_from_pool) + mempool_free(req->r_data.pages, ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); else - kfree(req->r_pages); + kfree(req->r_data.pages); ceph_osdc_put_request(req); } /* * allocate a page vec, either directly, or if necessary, via a the - * mempool. we avoid the mempool if we can because req->r_num_pages + * mempool. we avoid the mempool if we can because req->r_data.num_pages * may be less than the maximum write size. */ static void alloc_page_vec(struct ceph_fs_client *fsc, struct ceph_osd_request *req) { - req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, + req->r_data.pages = kmalloc(sizeof(struct page *) * req->r_data.num_pages, GFP_NOFS); - if (!req->r_pages) { - req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); - req->r_pages_from_pool = 1; - WARN_ON(!req->r_pages); + if (!req->r_data.pages) { + req->r_data.pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); + req->r_data.pages_from_pool = 1; + WARN_ON(!req->r_data.pages); } } @@ -829,9 +830,9 @@ get_more_pages: break; } - req->r_num_pages = calc_pages_for(0, len); - req->r_page_alignment = 0; - max_pages = req->r_num_pages; + req->r_data.num_pages = calc_pages_for(0, len); + req->r_data.alignment = 0; + max_pages = req->r_data.num_pages; alloc_page_vec(fsc, req); req->r_callback = writepages_finish; @@ -853,7 +854,7 @@ get_more_pages: } set_page_writeback(page); - req->r_pages[locked_pages] = page; + req->r_data.pages[locked_pages] = page; locked_pages++; next = page->index + 1; } @@ -883,14 +884,14 @@ get_more_pages: } /* submit the write */ - offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; + offset = req->r_data.pages[0]->index << PAGE_CACHE_SHIFT; len = min((snap_size ? 
snap_size : i_size_read(inode)) - offset, (u64)locked_pages << PAGE_CACHE_SHIFT); dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); /* revise final length, page count */ - req->r_num_pages = locked_pages; + req->r_data.num_pages = locked_pages; req->r_request_ops[0].extent.length = cpu_to_le64(len); req->r_request_ops[0].payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f2754cdb5a03..d35fc05af06f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -568,12 +568,12 @@ more: if ((file->f_flags & O_SYNC) == 0) { /* get a second commit callback */ req->r_safe_callback = sync_write_commit; - req->r_own_pages = 1; + req->r_data.own_pages = 1; } } - req->r_pages = pages; - req->r_num_pages = num_pages; - req->r_page_alignment = page_align; + req->r_data.pages = pages; + req->r_data.num_pages = num_pages; + req->r_data.alignment = page_align; req->r_inode = inode; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 803a9db0b475..600b8278d11e 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -50,6 +50,21 @@ struct ceph_osd { #define CEPH_OSD_MAX_OP 10 +struct ceph_osd_data { + struct { + struct { + struct page **pages; + u32 num_pages; + u32 alignment; + bool pages_from_pool; + bool own_pages; + }; +#ifdef CONFIG_BLOCK + struct bio *bio; +#endif /* CONFIG_BLOCK */ + }; +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ @@ -105,15 +120,8 @@ struct ceph_osd_request { struct ceph_file_layout r_file_layout; struct ceph_snap_context *r_snapc; /* snap context for writes */ - unsigned r_num_pages; /* size of page array (follows) */ - unsigned r_page_alignment; /* io offset in first page */ - struct page **r_pages; /* pages for data payload */ - int r_pages_from_pool; - int r_own_pages; /* if true, i own page list */ -#ifdef CONFIG_BLOCK - struct bio *r_bio; /* instead of pages */ -#endif + struct ceph_osd_data r_data; struct ceph_pagelist r_trail; /* trailing part of the data */ }; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index de427cc7f6d0..1f8c7a7c203b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -122,9 +122,9 @@ void ceph_osdc_release_request(struct kref *kref) } if (req->r_reply) ceph_msg_put(req->r_reply); - if (req->r_own_pages) - ceph_release_page_vector(req->r_pages, - req->r_num_pages); + if (req->r_data.own_pages) + ceph_release_page_vector(req->r_data.pages, + req->r_data.num_pages); ceph_put_snap_context(req->r_snapc); ceph_pagelist_release(&req->r_trail); if (req->r_mempool) @@ -1739,11 +1739,11 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, { int rc = 0; - req->r_request->pages = req->r_pages; - req->r_request->page_count = req->r_num_pages; - req->r_request->page_alignment = req->r_page_alignment; + req->r_request->pages = req->r_data.pages; + req->r_request->page_count = req->r_data.num_pages; + req->r_request->page_alignment = req->r_data.alignment; #ifdef CONFIG_BLOCK - req->r_request->bio = req->r_bio; + req->r_request->bio = req->r_data.bio; #endif req->r_request->trail = &req->r_trail; @@ -1944,12 +1944,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, return PTR_ERR(req); /* it may be a short read due to an object boundary */ - req->r_pages = pages; - req->r_num_pages = calc_pages_for(page_align, *plen); - req->r_page_alignment = page_align; 
+ req->r_data.pages = pages; + req->r_data.num_pages = calc_pages_for(page_align, *plen); + req->r_data.alignment = page_align; dout("readpages final extent is %llu~%llu (%d pages align %d)\n", - off, *plen, req->r_num_pages, page_align); + off, *plen, req->r_data.num_pages, page_align); rc = ceph_osdc_start_request(osdc, req, false); if (!rc) @@ -1987,10 +1987,10 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, return PTR_ERR(req); /* it may be a short write due to an object boundary */ - req->r_pages = pages; - req->r_num_pages = calc_pages_for(page_align, len); - req->r_page_alignment = page_align; - dout("writepages %llu~%llu (%d pages)\n", off, len, req->r_num_pages); + req->r_data.pages = pages; + req->r_data.num_pages = calc_pages_for(page_align, len); + req->r_data.alignment = page_align; + dout("writepages %llu~%llu (%d pages)\n", off, len, req->r_data.num_pages); rc = ceph_osdc_start_request(osdc, req, true); if (!rc) @@ -2083,22 +2083,22 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - int want = calc_pages_for(req->r_page_alignment, data_len); + int want = calc_pages_for(req->r_data.alignment, data_len); - if (req->r_pages && unlikely(req->r_num_pages < want)) { + if (req->r_data.pages && unlikely(req->r_data.num_pages < want)) { pr_warning("tid %lld reply has %d bytes %d pages, we" " had only %d pages ready\n", tid, data_len, - want, req->r_num_pages); + want, req->r_data.num_pages); *skip = 1; ceph_msg_put(m); m = NULL; goto out; } - m->pages = req->r_pages; - m->page_count = req->r_num_pages; - m->page_alignment = req->r_page_alignment; + m->pages = req->r_data.pages; + m->page_count = req->r_data.num_pages; + m->page_alignment = req->r_data.alignment; #ifdef CONFIG_BLOCK - m->bio = req->r_bio; + m->bio = req->r_data.bio; #endif } *skip = 0; -- cgit v1.2.3 From 2ac2b7a6d4976bd6b5dc0751aa77d12d48d3ac4c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: distinguish page and bio requests An osd request uses either pages or a bio list for its data. Use a union to record information about the two, and add a data type tag to select between them. 
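A hedged, self-contained sketch of the resulting idiom (names shortened, void * standing in for struct bio *; C11 anonymous members, as in the kernel struct): the tag records which union member is live, and every consumer dispatches on it before touching the data:

#include <stdio.h>

enum data_type { DATA_NONE, DATA_PAGES, DATA_BIO };

struct osd_data {
	enum data_type type;
	union {
		struct {
			void **pages;
			unsigned int num_pages;
		};
		void *bio;	/* stand-in for struct bio * */
	};
};

static void describe(const struct osd_data *d)
{
	switch (d->type) {
	case DATA_PAGES:
		printf("page array, %u pages\n", d->num_pages);
		break;
	case DATA_BIO:
		printf("bio list\n");
		break;
	default:
		printf("no data\n");
	}
}

int main(void)
{
	struct osd_data d = { .type = DATA_PAGES };

	d.num_pages = 4;	/* members of the anonymous struct */
	describe(&d);
	return 0;
}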
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 +++ fs/ceph/addr.c | 4 +++ fs/ceph/file.c | 1 + include/linux/ceph/osd_client.h | 11 +++++++- net/ceph/osd_client.c | 56 ++++++++++++++++++++++++++--------------- 5 files changed, 55 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0e814dfda48e..f189bc2909b0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1425,12 +1425,16 @@ static struct ceph_osd_request *rbd_osd_req_create( break; /* Nothing to do */ case OBJ_REQUEST_BIO: rbd_assert(obj_request->bio_list != NULL); + osd_req->r_data.type = CEPH_OSD_DATA_TYPE_BIO; osd_req->r_data.bio = obj_request->bio_list; break; case OBJ_REQUEST_PAGES: + osd_req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; osd_req->r_data.pages = obj_request->pages; osd_req->r_data.num_pages = obj_request->page_count; osd_req->r_data.alignment = offset & ~PAGE_MASK; + osd_req->r_data.pages_from_pool = false; + osd_req->r_data.own_pages = false; break; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3a1a77b0ae9f..276fe96f12e3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -243,6 +243,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ + BUG_ON(req->r_data.type != CEPH_OSD_DATA_TYPE_PAGES); for (i = 0; i < req->r_data.num_pages; i++, bytes -= PAGE_CACHE_SIZE) { struct page *page = req->r_data.pages[i]; @@ -336,6 +337,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } + req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data.pages = pages; req->r_data.num_pages = nr_pages; req->r_data.alignment = 0; @@ -561,6 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req, long writeback_stat; unsigned issued = ceph_caps_issued(ci); + BUG_ON(req->r_data.type != CEPH_OSD_DATA_TYPE_PAGES); if (rc >= 0) { /* * Assume we wrote the pages we originally sent. 
The @@ -830,6 +833,7 @@ get_more_pages: break; } + req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data.num_pages = calc_pages_for(0, len); req->r_data.alignment = 0; max_pages = req->r_data.num_pages; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d35fc05af06f..3643a386ab23 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -571,6 +571,7 @@ more: req->r_data.own_pages = 1; } } + req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data.pages = pages; req->r_data.num_pages = num_pages; req->r_data.alignment = page_align; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 600b8278d11e..56604b33dc3c 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -50,8 +50,17 @@ struct ceph_osd { #define CEPH_OSD_MAX_OP 10 +enum ceph_osd_data_type { + CEPH_OSD_DATA_TYPE_NONE, + CEPH_OSD_DATA_TYPE_PAGES, +#ifdef CONFIG_BLOCK + CEPH_OSD_DATA_TYPE_BIO, +#endif /* CONFIG_BLOCK */ +}; + struct ceph_osd_data { - struct { + enum ceph_osd_data_type type; + union { struct { struct page **pages; u32 num_pages; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 1f8c7a7c203b..591e1b0cccbe 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -122,7 +122,8 @@ void ceph_osdc_release_request(struct kref *kref) } if (req->r_reply) ceph_msg_put(req->r_reply); - if (req->r_data.own_pages) + if (req->r_data.type == CEPH_OSD_DATA_TYPE_PAGES && + req->r_data.own_pages) ceph_release_page_vector(req->r_data.pages, req->r_data.num_pages); ceph_put_snap_context(req->r_snapc); @@ -188,6 +189,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } req->r_reply = msg; + req->r_data.type = CEPH_OSD_DATA_TYPE_NONE; ceph_pagelist_init(&req->r_trail); /* create request message; allow space for oid */ @@ -1739,12 +1741,17 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, { int rc = 0; - req->r_request->pages = req->r_data.pages; - req->r_request->page_count = req->r_data.num_pages; - req->r_request->page_alignment = req->r_data.alignment; + if (req->r_data.type == CEPH_OSD_DATA_TYPE_PAGES) { + req->r_request->pages = req->r_data.pages; + req->r_request->page_count = req->r_data.num_pages; + req->r_request->page_alignment = req->r_data.alignment; #ifdef CONFIG_BLOCK - req->r_request->bio = req->r_data.bio; + } else if (req->r_data.type == CEPH_OSD_DATA_TYPE_BIO) { + req->r_request->bio = req->r_data.bio; #endif + } else { + pr_err("unknown request data type %d\n", req->r_data.type); + } req->r_request->trail = &req->r_trail; register_request(osdc, req); @@ -1944,6 +1951,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, return PTR_ERR(req); /* it may be a short read due to an object boundary */ + req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data.pages = pages; req->r_data.num_pages = calc_pages_for(page_align, *plen); req->r_data.alignment = page_align; @@ -1987,6 +1995,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, return PTR_ERR(req); /* it may be a short write due to an object boundary */ + req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data.pages = pages; req->r_data.num_pages = calc_pages_for(page_align, len); req->r_data.alignment = page_align; @@ -2083,23 +2092,30 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - int want = calc_pages_for(req->r_data.alignment, data_len); - - if (req->r_data.pages && unlikely(req->r_data.num_pages < want)) { - pr_warning("tid %lld 
reply has %d bytes %d pages, we" - " had only %d pages ready\n", tid, data_len, - want, req->r_data.num_pages); - *skip = 1; - ceph_msg_put(m); - m = NULL; - goto out; - } - m->pages = req->r_data.pages; - m->page_count = req->r_data.num_pages; - m->page_alignment = req->r_data.alignment; + if (req->r_data.type == CEPH_OSD_DATA_TYPE_PAGES) { + int want; + + want = calc_pages_for(req->r_data.alignment, data_len); + if (req->r_data.pages && + unlikely(req->r_data.num_pages < want)) { + + pr_warning("tid %lld reply has %d bytes %d " + "pages, we had only %d pages ready\n", + tid, data_len, want, + req->r_data.num_pages); + *skip = 1; + ceph_msg_put(m); + m = NULL; + goto out; + } + m->pages = req->r_data.pages; + m->page_count = req->r_data.num_pages; + m->page_alignment = req->r_data.alignment; #ifdef CONFIG_BLOCK - m->bio = req->r_data.bio; + } else if (req->r_data.type == CEPH_OSD_DATA_TYPE_BIO) { + m->bio = req->r_data.bio; #endif + } } *skip = 0; req->r_con_filling_msg = con->ops->get(con); -- cgit v1.2.3 From 0fff87ec798abdb4a99f01cbb0197266bb68c5dc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: separate read and write data An osd request defines information about where data to be read should be placed as well as where data to write comes from. Currently these are represented by common fields. Keep information about data for writing separate from data to be read by splitting these into data_in and data_out fields. This is the key patch in this whole series, in that it actually identifies which osd requests generate outgoing data and which generate incoming data. It's less obvious (currently) that an osd CALL op generates both outgoing and incoming data; that's the focus of some upcoming work. This resolves: http://tracker.ceph.com/issues/4127 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 18 +++++---- fs/ceph/addr.c | 67 ++++++++++++++++++--------------- fs/ceph/file.c | 10 ++--- include/linux/ceph/osd_client.h | 5 ++- net/ceph/osd_client.c | 83 +++++++++++++++++++++++++---------------- 5 files changed, 105 insertions(+), 78 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f189bc2909b0..3f69eb1bc656 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1398,6 +1398,7 @@ static struct ceph_osd_request *rbd_osd_req_create( struct ceph_snap_context *snapc = NULL; struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; + struct ceph_osd_data *osd_data; struct timespec now; struct timespec *mtime; u64 snap_id = CEPH_NOSNAP; @@ -1418,6 +1419,7 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); if (!osd_req) return NULL; /* ENOMEM */ + osd_data = write_request ? 
&osd_req->r_data_out : &osd_req->r_data_in; rbd_assert(obj_request_type_valid(obj_request->type)); switch (obj_request->type) { @@ -1425,16 +1427,16 @@ static struct ceph_osd_request *rbd_osd_req_create( break; /* Nothing to do */ case OBJ_REQUEST_BIO: rbd_assert(obj_request->bio_list != NULL); - osd_req->r_data.type = CEPH_OSD_DATA_TYPE_BIO; - osd_req->r_data.bio = obj_request->bio_list; + osd_data->type = CEPH_OSD_DATA_TYPE_BIO; + osd_data->bio = obj_request->bio_list; break; case OBJ_REQUEST_PAGES: - osd_req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; - osd_req->r_data.pages = obj_request->pages; - osd_req->r_data.num_pages = obj_request->page_count; - osd_req->r_data.alignment = offset & ~PAGE_MASK; - osd_req->r_data.pages_from_pool = false; - osd_req->r_data.own_pages = false; + osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; + osd_data->pages = obj_request->pages; + osd_data->num_pages = obj_request->page_count; + osd_data->alignment = offset & ~PAGE_MASK; + osd_data->pages_from_pool = false; + osd_data->own_pages = false; break; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 276fe96f12e3..c117c51741d5 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -243,9 +243,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ - BUG_ON(req->r_data.type != CEPH_OSD_DATA_TYPE_PAGES); - for (i = 0; i < req->r_data.num_pages; i++, bytes -= PAGE_CACHE_SIZE) { - struct page *page = req->r_data.pages[i]; + BUG_ON(req->r_data_in.type != CEPH_OSD_DATA_TYPE_PAGES); + for (i = 0; i < req->r_data_in.num_pages; i++) { + struct page *page = req->r_data_in.pages[i]; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ @@ -258,8 +258,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) SetPageUptodate(page); unlock_page(page); page_cache_release(page); + bytes -= PAGE_CACHE_SIZE; } - kfree(req->r_data.pages); + kfree(req->r_data_in.pages); } static void ceph_unlock_page_vector(struct page **pages, int num_pages) @@ -337,10 +338,10 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data.pages = pages; - req->r_data.num_pages = nr_pages; - req->r_data.alignment = 0; + req->r_data_in.type = CEPH_OSD_DATA_TYPE_PAGES; + req->r_data_in.pages = pages; + req->r_data_in.num_pages = nr_pages; + req->r_data_in.alignment = 0; req->r_callback = finish_read; req->r_inode = inode; @@ -563,7 +564,7 @@ static void writepages_finish(struct ceph_osd_request *req, long writeback_stat; unsigned issued = ceph_caps_issued(ci); - BUG_ON(req->r_data.type != CEPH_OSD_DATA_TYPE_PAGES); + BUG_ON(req->r_data_out.type != CEPH_OSD_DATA_TYPE_PAGES); if (rc >= 0) { /* * Assume we wrote the pages we originally sent. The @@ -571,7 +572,7 @@ static void writepages_finish(struct ceph_osd_request *req, * raced with a truncation and was adjusted at the osd, * so don't believe the reply. 
*/ - wrote = req->r_data.num_pages; + wrote = req->r_data_out.num_pages; } else { wrote = 0; mapping_set_error(mapping, rc); @@ -580,8 +581,8 @@ static void writepages_finish(struct ceph_osd_request *req, inode, rc, bytes, wrote); /* clean all pages */ - for (i = 0; i < req->r_data.num_pages; i++) { - page = req->r_data.pages[i]; + for (i = 0; i < req->r_data_out.num_pages; i++) { + page = req->r_data_out.pages[i]; BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -610,31 +611,34 @@ static void writepages_finish(struct ceph_osd_request *req, unlock_page(page); } dout("%p wrote+cleaned %d pages\n", inode, wrote); - ceph_put_wrbuffer_cap_refs(ci, req->r_data.num_pages, snapc); + ceph_put_wrbuffer_cap_refs(ci, req->r_data_out.num_pages, snapc); - ceph_release_pages(req->r_data.pages, req->r_data.num_pages); - if (req->r_data.pages_from_pool) - mempool_free(req->r_data.pages, + ceph_release_pages(req->r_data_out.pages, req->r_data_out.num_pages); + if (req->r_data_out.pages_from_pool) + mempool_free(req->r_data_out.pages, ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); else - kfree(req->r_data.pages); + kfree(req->r_data_out.pages); ceph_osdc_put_request(req); } /* * allocate a page vec, either directly, or if necessary, via a the - * mempool. we avoid the mempool if we can because req->r_data.num_pages + * mempool. we avoid the mempool if we can because req->r_data_out.num_pages * may be less than the maximum write size. */ static void alloc_page_vec(struct ceph_fs_client *fsc, struct ceph_osd_request *req) { - req->r_data.pages = kmalloc(sizeof(struct page *) * req->r_data.num_pages, - GFP_NOFS); - if (!req->r_data.pages) { - req->r_data.pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); - req->r_data.pages_from_pool = 1; - WARN_ON(!req->r_data.pages); + size_t size; + + size = sizeof (struct page *) * req->r_data_out.num_pages; + req->r_data_out.pages = kmalloc(size, GFP_NOFS); + if (!req->r_data_out.pages) { + req->r_data_out.pages = mempool_alloc(fsc->wb_pagevec_pool, + GFP_NOFS); + req->r_data_out.pages_from_pool = 1; + WARN_ON(!req->r_data_out.pages); } } @@ -833,10 +837,11 @@ get_more_pages: break; } - req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data.num_pages = calc_pages_for(0, len); - req->r_data.alignment = 0; - max_pages = req->r_data.num_pages; + req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; + req->r_data_out.num_pages = + calc_pages_for(0, len); + req->r_data_out.alignment = 0; + max_pages = req->r_data_out.num_pages; alloc_page_vec(fsc, req); req->r_callback = writepages_finish; @@ -858,7 +863,7 @@ get_more_pages: } set_page_writeback(page); - req->r_data.pages[locked_pages] = page; + req->r_data_out.pages[locked_pages] = page; locked_pages++; next = page->index + 1; } @@ -888,14 +893,14 @@ get_more_pages: } /* submit the write */ - offset = req->r_data.pages[0]->index << PAGE_CACHE_SHIFT; + offset = req->r_data_out.pages[0]->index << PAGE_CACHE_SHIFT; len = min((snap_size ? 
snap_size : i_size_read(inode)) - offset, (u64)locked_pages << PAGE_CACHE_SHIFT); dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); /* revise final length, page count */ - req->r_data.num_pages = locked_pages; + req->r_data_out.num_pages = locked_pages; req->r_request_ops[0].extent.length = cpu_to_le64(len); req->r_request_ops[0].payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3643a386ab23..501fb37b81a2 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -568,13 +568,13 @@ more: if ((file->f_flags & O_SYNC) == 0) { /* get a second commit callback */ req->r_safe_callback = sync_write_commit; - req->r_data.own_pages = 1; + req->r_data_out.own_pages = 1; } } - req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data.pages = pages; - req->r_data.num_pages = num_pages; - req->r_data.alignment = page_align; + req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; + req->r_data_out.pages = pages; + req->r_data_out.num_pages = num_pages; + req->r_data_out.alignment = page_align; req->r_inode = inode; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 56604b33dc3c..40e02603723d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -130,8 +130,9 @@ struct ceph_osd_request { struct ceph_file_layout r_file_layout; struct ceph_snap_context *r_snapc; /* snap context for writes */ - struct ceph_osd_data r_data; - struct ceph_pagelist r_trail; /* trailing part of the data */ + struct ceph_osd_data r_data_in; + struct ceph_osd_data r_data_out; + struct ceph_pagelist r_trail; /* trailing part of data out */ }; struct ceph_osd_event { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 591e1b0cccbe..f9cf44504484 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -122,10 +122,16 @@ void ceph_osdc_release_request(struct kref *kref) } if (req->r_reply) ceph_msg_put(req->r_reply); - if (req->r_data.type == CEPH_OSD_DATA_TYPE_PAGES && - req->r_data.own_pages) - ceph_release_page_vector(req->r_data.pages, - req->r_data.num_pages); + + if (req->r_data_in.type == CEPH_OSD_DATA_TYPE_PAGES && + req->r_data_in.own_pages) + ceph_release_page_vector(req->r_data_in.pages, + req->r_data_in.num_pages); + if (req->r_data_out.type == CEPH_OSD_DATA_TYPE_PAGES && + req->r_data_out.own_pages) + ceph_release_page_vector(req->r_data_out.pages, + req->r_data_out.num_pages); + ceph_put_snap_context(req->r_snapc); ceph_pagelist_release(&req->r_trail); if (req->r_mempool) @@ -189,7 +195,8 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } req->r_reply = msg; - req->r_data.type = CEPH_OSD_DATA_TYPE_NONE; + req->r_data_in.type = CEPH_OSD_DATA_TYPE_NONE; + req->r_data_out.type = CEPH_OSD_DATA_TYPE_NONE; ceph_pagelist_init(&req->r_trail); /* create request message; allow space for oid */ @@ -1740,17 +1747,21 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, bool nofail) { int rc = 0; + struct ceph_osd_data *osd_data; + + /* Set up outgoing data */ - if (req->r_data.type == CEPH_OSD_DATA_TYPE_PAGES) { - req->r_request->pages = req->r_data.pages; - req->r_request->page_count = req->r_data.num_pages; - req->r_request->page_alignment = req->r_data.alignment; + osd_data = &req->r_data_out; + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + req->r_request->pages = osd_data->pages; + req->r_request->page_count = osd_data->num_pages; + 
req->r_request->page_alignment = osd_data->alignment; #ifdef CONFIG_BLOCK - } else if (req->r_data.type == CEPH_OSD_DATA_TYPE_BIO) { - req->r_request->bio = req->r_data.bio; + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { + req->r_request->bio = osd_data->bio; #endif } else { - pr_err("unknown request data type %d\n", req->r_data.type); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); } req->r_request->trail = &req->r_trail; @@ -1939,6 +1950,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct page **pages, int num_pages, int page_align) { struct ceph_osd_request *req; + struct ceph_osd_data *osd_data; int rc = 0; dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, @@ -1951,13 +1963,15 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, return PTR_ERR(req); /* it may be a short read due to an object boundary */ - req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data.pages = pages; - req->r_data.num_pages = calc_pages_for(page_align, *plen); - req->r_data.alignment = page_align; + + osd_data = &req->r_data_in; + osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; + osd_data->pages = pages; + osd_data->num_pages = calc_pages_for(page_align, *plen); + osd_data->alignment = page_align; dout("readpages final extent is %llu~%llu (%d pages align %d)\n", - off, *plen, req->r_data.num_pages, page_align); + off, *plen, osd_data->num_pages, page_align); rc = ceph_osdc_start_request(osdc, req, false); if (!rc) @@ -1981,6 +1995,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, struct page **pages, int num_pages) { struct ceph_osd_request *req; + struct ceph_osd_data *osd_data; int rc = 0; int page_align = off & ~PAGE_MASK; @@ -1995,11 +2010,13 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, return PTR_ERR(req); /* it may be a short write due to an object boundary */ - req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data.pages = pages; - req->r_data.num_pages = calc_pages_for(page_align, len); - req->r_data.alignment = page_align; - dout("writepages %llu~%llu (%d pages)\n", off, len, req->r_data.num_pages); + osd_data = &req->r_data_out; + osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; + osd_data->pages = pages; + osd_data->num_pages = calc_pages_for(page_align, len); + osd_data->alignment = page_align; + dout("writepages %llu~%llu (%d pages)\n", off, len, + osd_data->num_pages); rc = ceph_osdc_start_request(osdc, req, true); if (!rc) @@ -2092,28 +2109,30 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - if (req->r_data.type == CEPH_OSD_DATA_TYPE_PAGES) { + struct ceph_osd_data *osd_data = &req->r_data_in; + + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { int want; - want = calc_pages_for(req->r_data.alignment, data_len); - if (req->r_data.pages && - unlikely(req->r_data.num_pages < want)) { + want = calc_pages_for(osd_data->alignment, data_len); + if (osd_data->pages && + unlikely(osd_data->num_pages < want)) { pr_warning("tid %lld reply has %d bytes %d " "pages, we had only %d pages ready\n", tid, data_len, want, - req->r_data.num_pages); + osd_data->num_pages); *skip = 1; ceph_msg_put(m); m = NULL; goto out; } - m->pages = req->r_data.pages; - m->page_count = req->r_data.num_pages; - m->page_alignment = req->r_data.alignment; + m->pages = osd_data->pages; + m->page_count = osd_data->num_pages; + m->page_alignment = osd_data->alignment; #ifdef CONFIG_BLOCK - } else if (req->r_data.type == CEPH_OSD_DATA_TYPE_BIO) { - m->bio = 
req->r_data.bio; + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { + m->bio = osd_data->bio; #endif } } -- cgit v1.2.3 From 4137577ae398837b0d5e47d4d9365320584efdad Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 5 Mar 2013 09:25:10 -0600 Subject: libceph: clean up skipped message logic In ceph_con_in_msg_alloc() it is possible for a connection's alloc_msg method to indicate an incoming message should be skipped. By default, read_partial_message() initializes the skip variable to 0 before it gets provided to ceph_con_in_msg_alloc(). The osd client, mon client, and mds client each supply an alloc_msg method. The mds client always sets skip to 0. The other two leave the skip value as-is or assign it zero, except: - when no (osd or mon) request with the given tid is found, in which case skip is set to 1 and NULL is returned; or - in the osd client, when the reply message's data area is too small to hold the message to be read, in which case skip is set to 1 and NULL is returned. So the returned message pointer will always be NULL if skip is ever non-zero. Clean up the logic a bit in ceph_con_in_msg_alloc() to make this state of affairs more obvious. Add a comment explaining how a null message pointer can mean either a message that should be skipped or a problem allocating a message. This resolves: http://tracker.ceph.com/issues/4324 Reported-by: Greg Farnum Signed-off-by: Alex Elder Reviewed-by: Greg Farnum --- net/ceph/messenger.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index c7d427876dbc..af0c35d40048 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2819,18 +2819,21 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) ceph_msg_put(msg); return -EAGAIN; } - con->in_msg = msg; - if (con->in_msg) { + if (msg) { + BUG_ON(*skip); + con->in_msg = msg; con->in_msg->con = con->ops->get(con); BUG_ON(con->in_msg->con == NULL); - } - if (*skip) { - con->in_msg = NULL; - return 0; - } - if (!con->in_msg) { - con->error_msg = - "error allocating memory for incoming message"; + } else { + /* + * Null message pointer means either we should skip + * this message or we couldn't allocate memory. The + * former is not an error. + */ + if (*skip) + return 0; + con->error_msg = "error allocating memory for incoming message"; + return -ENOMEM; } memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); -- cgit v1.2.3 From 7b11ba37585595034a91df8869414f732466b800 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 18:51:03 -0600 Subject: libceph: define CEPH_MSG_MAX_MIDDLE_LEN This is probably unnecessary, but the code in read_partial_message() read as if it were wrong.
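Since all three per-section limits are defined as 16 MiB, the corrected check accepts exactly the same headers as before; the change just bounds each header section by the constant named for it. A minimal sketch of the resulting validation (illustrative, not the literal patched code):

	/* Each section of an incoming header is bounded by its own limit. */
	if (le32_to_cpu(con->in_hdr.front_len) > CEPH_MSG_MAX_FRONT_LEN)
		return -EIO;
	if (le32_to_cpu(con->in_hdr.middle_len) > CEPH_MSG_MAX_MIDDLE_LEN)
		return -EIO;
	if (le32_to_cpu(con->in_hdr.data_len) > CEPH_MSG_MAX_DATA_LEN)
		return -EIO;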
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/libceph.h | 1 + net/ceph/messenger.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 29818fc3fa49..5493d7b86423 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -66,6 +66,7 @@ struct ceph_options { #define CEPH_OSD_IDLE_TTL_DEFAULT 60 #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) +#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) #define CEPH_AUTH_NAME_DEFAULT "guest" diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index af0c35d40048..b8d0da56d610 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1887,7 +1887,7 @@ static int read_partial_message(struct ceph_connection *con) if (front_len > CEPH_MSG_MAX_FRONT_LEN) return -EIO; middle_len = le32_to_cpu(con->in_hdr.middle_len); - if (middle_len > CEPH_MSG_MAX_DATA_LEN) + if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) return -EIO; data_len = le32_to_cpu(con->in_hdr.data_len); if (data_len > CEPH_MSG_MAX_DATA_LEN) -- cgit v1.2.3 From 6ebc8b32b327463f552d9d4499aba2ef1e02a600 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 18:51:03 -0600 Subject: libceph: minor byte order problems in read_partial_message() Some values printed are not (necessarily) in CPU order. We already have a copy of the converted versions, so use them. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b8d0da56d610..d9ace979adef 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1916,7 +1916,7 @@ static int read_partial_message(struct ceph_connection *con) int skip = 0; dout("got hdr type %d front %d data %d\n", con->in_hdr.type, - con->in_hdr.front_len, con->in_hdr.data_len); + front_len, data_len); ret = ceph_con_in_msg_alloc(con, &skip); if (ret < 0) return ret; -- cgit v1.2.3 From e1dcb128f88958e7212fdd7ceebba4f84d6bc47a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Mar 2013 23:39:38 -0600 Subject: libceph: change type of ceph_tcp_sendpage() "more" Change the type of the "more" parameter from int to bool. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d9ace979adef..962b2cd10f43 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -493,7 +493,7 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, } static int ceph_tcp_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int more) + int offset, size_t size, bool more) { int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? 
MSG_MORE : MSG_EOR); int ret; @@ -1132,7 +1132,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) } ret = ceph_tcp_sendpage(con->sock, page, con->out_msg_pos.page_pos + bio_offset, - len, 1); + len, true); if (ret <= 0) goto out; @@ -1161,7 +1161,7 @@ static int write_partial_skip(struct ceph_connection *con) while (con->out_skip > 0) { size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); - ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, 1); + ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true); if (ret <= 0) goto out; con->out_skip -= ret; -- cgit v1.2.3 From b3d56fab333bbb3ac7300843d69e52d7bd8a016b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 18:51:03 -0600 Subject: libceph: kill args in read_partial_message_bio() There is only one caller for read_partial_message_bio(), and it always passes &msg->bio_iter and &msg->bio_seg as the second and third arguments. Furthermore, the message in question is always the connection's in_msg, and we can get that inside the called function. So drop those two parameters and use their derived equivalents. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 962b2cd10f43..2017b8833baa 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1819,14 +1819,16 @@ static int read_partial_message_pages(struct ceph_connection *con, #ifdef CONFIG_BLOCK static int read_partial_message_bio(struct ceph_connection *con, - struct bio **bio_iter, - unsigned int *bio_seg, unsigned int data_len, bool do_datacrc) { struct ceph_msg *msg = con->in_msg; struct bio_vec *bv; void *p; int ret, left; + BUG_ON(!msg); + BUG_ON(!msg->bio_iter); + bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); left = min((int)(data_len - con->in_msg_pos.data_pos), (int)(bv->bv_len - con->in_msg_pos.page_pos)); @@ -1845,7 +1847,7 @@ static int read_partial_message_bio(struct ceph_connection *con, con->in_msg_pos.page_pos += ret; if (con->in_msg_pos.page_pos == bv->bv_len) { con->in_msg_pos.page_pos = 0; - iter_bio_next(bio_iter, bio_seg); + iter_bio_next(&msg->bio_iter, &msg->bio_seg); } return ret; @@ -1975,9 +1977,7 @@ static int read_partial_message(struct ceph_connection *con) return ret; #ifdef CONFIG_BLOCK } else if (m->bio) { - BUG_ON(!m->bio_iter); ret = read_partial_message_bio(con, - &m->bio_iter, &m->bio_seg, data_len, do_datacrc); if (ret <= 0) return ret; -- cgit v1.2.3 From e788182fa6c1a400076278a75d0efa0a8a08e4ec Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 18:51:04 -0600 Subject: libceph: define and use in_msg_pos_next() Define a new function in_msg_pos_next() to match out_msg_pos_next(), and use it in place of code at the end of read_partial_message_pages() and read_partial_message_bio(). Note that the page number is incremented and offset reset under slightly different conditions from before. The result is equivalent, however, as explained below. Each time an incoming message is going to arrive, we find out how much room is left--not surpassing the current page--and provide that as the number of bytes to receive. So the amount we'll use is the lesser of: all that's left of the entire request; and all that's left in the current page. If we received exactly as many bytes as were requested, we either reached the end of the request or the end of the page.
In the first case we're done; in the second, we move on to the next page in the array. In all cases but (possibly) on the last page, after adding the number of bytes received, page_pos == PAGE_SIZE. On the last page, it doesn't really matter whether we increment the page number and reset the page position, because we're done and we won't come back here again. The code previously just skipped over that last case. The new code handles that case the same as the others, incrementing and resetting. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 57 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 2017b8833baa..fb5f6e7d57a3 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1052,6 +1052,28 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, #endif } +static void in_msg_pos_next(struct ceph_connection *con, size_t len, + size_t received) +{ + struct ceph_msg *msg = con->in_msg; + + BUG_ON(!msg); + BUG_ON(!received); + + con->in_msg_pos.data_pos += received; + con->in_msg_pos.page_pos += received; + if (received < len) + return; + + BUG_ON(received != len); + con->in_msg_pos.page_pos = 0; + con->in_msg_pos.page++; +#ifdef CONFIG_BLOCK + if (msg->bio) + iter_bio_next(&msg->bio_iter, &msg->bio_seg); +#endif /* CONFIG_BLOCK */ +} + /* * Write as much message data payload as we can. If we finish, queue * up the footer. @@ -1789,6 +1811,7 @@ static int read_partial_message_pages(struct ceph_connection *con, struct page **pages, unsigned int data_len, bool do_datacrc) { + struct page *page; void *p; int ret; int left; @@ -1797,22 +1820,18 @@ static int read_partial_message_pages(struct ceph_connection *con, (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); /* (page) data */ BUG_ON(pages == NULL); - p = kmap(pages[con->in_msg_pos.page]); - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, - left); + page = pages[con->in_msg_pos.page]; + p = kmap(page); + ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, left); if (ret > 0 && do_datacrc) con->in_data_crc = crc32c(con->in_data_crc, p + con->in_msg_pos.page_pos, ret); - kunmap(pages[con->in_msg_pos.page]); + kunmap(page); if (ret <= 0) return ret; - con->in_msg_pos.data_pos += ret; - con->in_msg_pos.page_pos += ret; - if (con->in_msg_pos.page_pos == PAGE_SIZE) { - con->in_msg_pos.page_pos = 0; - con->in_msg_pos.page++; - } + + in_msg_pos_next(con, left, ret); return ret; } @@ -1823,32 +1842,30 @@ static int read_partial_message_bio(struct ceph_connection *con, { struct ceph_msg *msg = con->in_msg; struct bio_vec *bv; + struct page *page; void *p; int ret, left; BUG_ON(!msg); BUG_ON(!msg->bio_iter); bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); + left = min((int)(data_len - con->in_msg_pos.data_pos), (int)(bv->bv_len - con->in_msg_pos.page_pos)); - p = kmap(bv->bv_page) + bv->bv_offset; + page = bv->bv_page; + p = kmap(page) + bv->bv_offset; - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, - left); + ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, left); if (ret > 0 && do_datacrc) con->in_data_crc = crc32c(con->in_data_crc, p + con->in_msg_pos.page_pos, ret); - kunmap(bv->bv_page); + kunmap(page); if (ret <= 0) return ret; - con->in_msg_pos.data_pos += ret; - con->in_msg_pos.page_pos += ret; - if (con->in_msg_pos.page_pos == bv->bv_len) { - con->in_msg_pos.page_pos = 0; -
iter_bio_next(&msg->bio_iter, &msg->bio_seg); - } + + in_msg_pos_next(con, left, ret); return ret; } -- cgit v1.2.3 From 35c7bfbcd4fabded090e5ab316a1cbf053a0a980 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Mar 2013 23:39:38 -0600 Subject: libceph: advance pagelist with list_rotate_left() While processing an outgoing pagelist (either the data pagelist or trail) in a ceph message, the messenger cycles through each of the pages on the list. This is accomplished in out_msg_pos_next(): if the end of the first page on the list is reached, the first page is moved to the end of the list. There is a list operation, list_rotate_left(), which performs exactly this operation, and by using it, what's really going on becomes more obvious. So replace these two list_move_tail() calls with list_rotate_left(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index fb5f6e7d57a3..2734d0337f95 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1041,11 +1041,9 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, con->out_msg_pos.page++; con->out_msg_pos.did_page_crc = false; if (in_trail) - list_move_tail(&page->lru, - &msg->trail->head); + list_rotate_left(&msg->trail->head); else if (msg->pagelist) - list_move_tail(&page->lru, - &msg->pagelist->head); + list_rotate_left(&msg->pagelist->head); #ifdef CONFIG_BLOCK else if (msg->bio) iter_bio_next(&msg->bio_iter, &msg->bio_seg); -- cgit v1.2.3 From 9516e45b25d9967c35d2e798496ec5e590aaa24f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:16 -0600 Subject: libceph: simplify new message initialization Rather than explicitly initializing many fields to 0, NULL, or false in a newly-allocated message, just use kzalloc() for allocating new messages. This will become a much more convenient way of doing things anyway for upcoming patches that abstract the data field.
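The simplification relies on an all-zeroes ceph_msg being a valid initial state: every pointer is NULL, every count is 0, every bool is false. A minimal sketch of the pattern (illustrative; the actual fields are in the diff below):

	m = kzalloc(sizeof(*m), flags);	/* every field starts 0/NULL/false */
	if (m == NULL)
		goto out;
	/* only fields with nonzero initial values still need explicit setup */
	m->hdr.type = cpu_to_le16(type);
	m->front_max = front_len;
	INIT_LIST_HEAD(&m->list_head);
	kref_init(&m->kref);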
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 38 ++++---------------------------------- 1 file changed, 4 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 2734d0337f95..ce1669f75ca5 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2699,49 +2699,19 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, { struct ceph_msg *m; - m = kmalloc(sizeof(*m), flags); + m = kzalloc(sizeof(*m), flags); if (m == NULL) goto out; - kref_init(&m->kref); - - m->con = NULL; - INIT_LIST_HEAD(&m->list_head); - m->hdr.tid = 0; m->hdr.type = cpu_to_le16(type); m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); - m->hdr.version = 0; m->hdr.front_len = cpu_to_le32(front_len); - m->hdr.middle_len = 0; - m->hdr.data_len = 0; - m->hdr.data_off = 0; - m->hdr.reserved = 0; - m->footer.front_crc = 0; - m->footer.middle_crc = 0; - m->footer.data_crc = 0; - m->footer.flags = 0; - m->front_max = front_len; - m->front_is_vmalloc = false; - m->more_to_follow = false; - m->ack_stamp = 0; - m->pool = NULL; - /* middle */ - m->middle = NULL; - - /* data */ - m->page_count = 0; - m->page_alignment = 0; - m->pages = NULL; - m->pagelist = NULL; -#ifdef CONFIG_BLOCK - m->bio = NULL; - m->bio_iter = NULL; - m->bio_seg = 0; -#endif /* CONFIG_BLOCK */ - m->trail = NULL; + INIT_LIST_HEAD(&m->list_head); + kref_init(&m->kref); /* front */ + m->front_max = front_len; if (front_len) { if (front_len > PAGE_CACHE_SIZE) { m->front.iov_base = __vmalloc(front_len, flags, -- cgit v1.2.3 From e0c594878e3211b09208c779df5f996f0b831d9e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 7 Mar 2013 15:38:25 -0600 Subject: libceph: record byte count not page count Record the byte count for an osd request rather than the page count. The number of pages can always be derived from the byte count (and alignment/offset) but the reverse is not true. 
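To make the derivation concrete, here is a standalone sketch of the arithmetic performed by calc_pages_for() (this mirrors the existing libceph helper; PAGE_SIZE and PAGE_SHIFT stand in for the kernel's page-size macros, and pages_spanned is a hypothetical name):

	/* Number of pages spanned by 'len' bytes starting at byte
	 * offset 'off' within the first page. */
	static u64 pages_spanned(u64 off, u64 len)
	{
		return ((off + len + PAGE_SIZE - 1) >> PAGE_SHIFT) -
			(off >> PAGE_SHIFT);
	}

With 4 KiB pages, off = 1024 and len = 8192 span 3 pages; but 3 pages at that alignment could correspond to any length from 7169 to 11264 bytes, which is why the byte count is the value worth recording.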
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- fs/ceph/addr.c | 33 ++++++++++++++++----------- fs/ceph/file.c | 2 +- include/linux/ceph/osd_client.h | 2 +- net/ceph/osd_client.c | 50 ++++++++++++++++++++++++----------------- 5 files changed, 52 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3f69eb1bc656..04cd5fdfc8f3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1433,7 +1433,7 @@ static struct ceph_osd_request *rbd_osd_req_create( case OBJ_REQUEST_PAGES: osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; osd_data->pages = obj_request->pages; - osd_data->num_pages = obj_request->page_count; + osd_data->length = obj_request->length; osd_data->alignment = offset & ~PAGE_MASK; osd_data->pages_from_pool = false; osd_data->own_pages = false; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c117c51741d5..45745aae4786 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -238,13 +238,16 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) struct inode *inode = req->r_inode; int rc = req->r_result; int bytes = le32_to_cpu(msg->hdr.data_len); + int num_pages; int i; dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ BUG_ON(req->r_data_in.type != CEPH_OSD_DATA_TYPE_PAGES); - for (i = 0; i < req->r_data_in.num_pages; i++) { + num_pages = calc_pages_for((u64)req->r_data_in.alignment, + (u64)req->r_data_in.length); + for (i = 0; i < num_pages; i++) { struct page *page = req->r_data_in.pages[i]; if (bytes < (int)PAGE_CACHE_SIZE) { @@ -340,7 +343,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } req->r_data_in.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data_in.pages = pages; - req->r_data_in.num_pages = nr_pages; + req->r_data_in.length = len; req->r_data_in.alignment = 0; req->r_callback = finish_read; req->r_inode = inode; @@ -555,6 +558,7 @@ static void writepages_finish(struct ceph_osd_request *req, struct ceph_inode_info *ci = ceph_inode(inode); unsigned wrote; struct page *page; + int num_pages; int i; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; @@ -565,6 +569,8 @@ static void writepages_finish(struct ceph_osd_request *req, unsigned issued = ceph_caps_issued(ci); BUG_ON(req->r_data_out.type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)req->r_data_out.alignment, + (u64)req->r_data_out.length); if (rc >= 0) { /* * Assume we wrote the pages we originally sent. The @@ -572,7 +578,7 @@ static void writepages_finish(struct ceph_osd_request *req, * raced with a truncation and was adjusted at the osd, * so don't believe the reply. 
*/ - wrote = req->r_data_out.num_pages; + wrote = num_pages; } else { wrote = 0; mapping_set_error(mapping, rc); @@ -581,7 +587,7 @@ static void writepages_finish(struct ceph_osd_request *req, inode, rc, bytes, wrote); /* clean all pages */ - for (i = 0; i < req->r_data_out.num_pages; i++) { + for (i = 0; i < num_pages; i++) { page = req->r_data_out.pages[i]; BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -611,9 +617,9 @@ static void writepages_finish(struct ceph_osd_request *req, unlock_page(page); } dout("%p wrote+cleaned %d pages\n", inode, wrote); - ceph_put_wrbuffer_cap_refs(ci, req->r_data_out.num_pages, snapc); + ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); - ceph_release_pages(req->r_data_out.pages, req->r_data_out.num_pages); + ceph_release_pages(req->r_data_out.pages, num_pages); if (req->r_data_out.pages_from_pool) mempool_free(req->r_data_out.pages, ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); @@ -624,15 +630,18 @@ static void writepages_finish(struct ceph_osd_request *req, /* * allocate a page vec, either directly, or if necessary, via a the - * mempool. we avoid the mempool if we can because req->r_data_out.num_pages + * mempool. we avoid the mempool if we can because req->r_data_out.length * may be less than the maximum write size. */ static void alloc_page_vec(struct ceph_fs_client *fsc, struct ceph_osd_request *req) { size_t size; + int num_pages; - size = sizeof (struct page *) * req->r_data_out.num_pages; + num_pages = calc_pages_for((u64)req->r_data_out.alignment, + (u64)req->r_data_out.length); + size = sizeof (struct page *) * num_pages; req->r_data_out.pages = kmalloc(size, GFP_NOFS); if (!req->r_data_out.pages) { req->r_data_out.pages = mempool_alloc(fsc->wb_pagevec_pool, @@ -838,11 +847,9 @@ get_more_pages: } req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data_out.num_pages = - calc_pages_for(0, len); + req->r_data_out.length = len; req->r_data_out.alignment = 0; - max_pages = req->r_data_out.num_pages; - + max_pages = calc_pages_for(0, (u64)len); alloc_page_vec(fsc, req); req->r_callback = writepages_finish; req->r_inode = inode; @@ -900,7 +907,7 @@ get_more_pages: locked_pages, offset, len); /* revise final length, page count */ - req->r_data_out.num_pages = locked_pages; + req->r_data_out.length = len; req->r_request_ops[0].extent.length = cpu_to_le64(len); req->r_request_ops[0].payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 501fb37b81a2..0ac6e159bdc6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -573,7 +573,7 @@ more: } req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data_out.pages = pages; - req->r_data_out.num_pages = num_pages; + req->r_data_out.length = len; req->r_data_out.alignment = page_align; req->r_inode = inode; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 40e02603723d..a8016dfbfdba 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -63,7 +63,7 @@ struct ceph_osd_data { union { struct { struct page **pages; - u32 num_pages; + u64 length; u32 alignment; bool pages_from_pool; bool own_pages; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f9cf44504484..202af14dc6dc 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -107,6 +107,7 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, */ void ceph_osdc_release_request(struct kref *kref) { + int num_pages; struct ceph_osd_request *req = container_of(kref, 
struct ceph_osd_request, r_kref); @@ -124,13 +125,17 @@ void ceph_osdc_release_request(struct kref *kref) ceph_msg_put(req->r_reply); if (req->r_data_in.type == CEPH_OSD_DATA_TYPE_PAGES && - req->r_data_in.own_pages) - ceph_release_page_vector(req->r_data_in.pages, - req->r_data_in.num_pages); + req->r_data_in.own_pages) { + num_pages = calc_pages_for((u64)req->r_data_in.alignment, + (u64)req->r_data_in.length); + ceph_release_page_vector(req->r_data_in.pages, num_pages); + } if (req->r_data_out.type == CEPH_OSD_DATA_TYPE_PAGES && - req->r_data_out.own_pages) - ceph_release_page_vector(req->r_data_out.pages, - req->r_data_out.num_pages); + req->r_data_out.own_pages) { + num_pages = calc_pages_for((u64)req->r_data_out.alignment, + (u64)req->r_data_out.length); + ceph_release_page_vector(req->r_data_out.pages, num_pages); + } ceph_put_snap_context(req->r_snapc); ceph_pagelist_release(&req->r_trail); @@ -1753,8 +1758,12 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, osd_data = &req->r_data_out; if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + unsigned int page_count; + req->r_request->pages = osd_data->pages; - req->r_request->page_count = osd_data->num_pages; + page_count = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + req->r_request->page_count = page_count; req->r_request->page_alignment = osd_data->alignment; #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { @@ -1967,11 +1976,11 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, osd_data = &req->r_data_in; osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; osd_data->pages = pages; - osd_data->num_pages = calc_pages_for(page_align, *plen); + osd_data->length = *plen; osd_data->alignment = page_align; - dout("readpages final extent is %llu~%llu (%d pages align %d)\n", - off, *plen, osd_data->num_pages, page_align); + dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", + off, *plen, osd_data->length, page_align); rc = ceph_osdc_start_request(osdc, req, false); if (!rc) @@ -2013,10 +2022,9 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, osd_data = &req->r_data_out; osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; osd_data->pages = pages; - osd_data->num_pages = calc_pages_for(page_align, len); + osd_data->length = len; osd_data->alignment = page_align; - dout("writepages %llu~%llu (%d pages)\n", off, len, - osd_data->num_pages); + dout("writepages %llu~%llu (%llu bytes)\n", off, len, osd_data->length); rc = ceph_osdc_start_request(osdc, req, true); if (!rc) @@ -2112,23 +2120,23 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_osd_data *osd_data = &req->r_data_in; if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { - int want; + unsigned int page_count; - want = calc_pages_for(osd_data->alignment, data_len); if (osd_data->pages && - unlikely(osd_data->num_pages < want)) { + unlikely(osd_data->length < data_len)) { - pr_warning("tid %lld reply has %d bytes %d " - "pages, we had only %d pages ready\n", - tid, data_len, want, - osd_data->num_pages); + pr_warning("tid %lld reply has %d bytes " + "we had only %llu bytes ready\n", + tid, data_len, osd_data->length); *skip = 1; ceph_msg_put(m); m = NULL; goto out; } + page_count = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); m->pages = osd_data->pages; - m->page_count = osd_data->num_pages; + m->page_count = page_count; m->page_alignment = osd_data->alignment; #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { -- cgit v1.2.3 
From 02afca6ca00b7972887c5cc77068356f33bdfc18 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: isolate message page field manipulation Define a function ceph_msg_data_set_pages(), which more clearly abstracts the assignment of page-related fields for data in a ceph message structure. Use this new function in the osd client and mds client. Ideally, these fields would never be set more than once (with BUG_ON() calls to guarantee that). At the moment, though, the osd client sets these every time it receives a message, and in the event of a communication problem this can happen more than once. (This will be resolved shortly, but setting up these helpers first makes it all a bit easier to work with.) Rearrange the field order in a ceph_msg structure to group those that are used to define the possible data payloads. This partially resolves: http://tracker.ceph.com/issues/4263 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- fs/ceph/mds_client.c | 4 ++-- include/linux/ceph/messenger.h | 22 +++++++++++++--------- net/ceph/messenger.c | 11 +++++++++++ net/ceph/osd_client.c | 10 ++++------ 4 files changed, 30 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ecfb738bca30..90198a407023 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1721,8 +1721,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - msg->pages = req->r_pages; - msg->page_count = req->r_num_pages; + ceph_msg_data_set_pages(msg, req->r_pages, req->r_num_pages, 0); + msg->hdr.data_len = cpu_to_le32(req->r_data_len); msg->hdr.data_off = cpu_to_le16(0); diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 6c118748a7f8..aa463b9b30af 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -74,21 +74,22 @@ struct ceph_msg { struct ceph_msg_footer footer; /* footer */ struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; - struct page **pages; /* data payload. NOT OWNER. */ - unsigned page_count; /* size of page array */ - unsigned page_alignment; /* io offset in first page */ - struct ceph_pagelist *pagelist; /* instead of pages */ - - struct ceph_connection *con; - struct list_head list_head; - struct kref kref; + struct page **pages; /* data payload. NOT OWNER.
*/ + unsigned int page_alignment; /* io offset in first page */ + unsigned int page_count; /* # pages in array or list */ + struct ceph_pagelist *pagelist; /* instead of pages */ #ifdef CONFIG_BLOCK + unsigned int bio_seg; /* current bio segment */ struct bio *bio; /* instead of pages/pagelist */ struct bio *bio_iter; /* bio iterator */ - unsigned int bio_seg; /* current bio segment */ #endif /* CONFIG_BLOCK */ struct ceph_pagelist *trail; /* the trailing part of the data */ + + struct ceph_connection *con; + struct list_head list_head; /* links for connection lists */ + + struct kref kref; bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; @@ -218,6 +219,9 @@ extern void ceph_msg_revoke_incoming(struct ceph_msg *msg); extern void ceph_con_keepalive(struct ceph_connection *con); +extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, + unsigned int page_count, size_t alignment); + extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); extern void ceph_msg_kfree(struct ceph_msg *m); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index ce1669f75ca5..cec39cb623f0 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2689,6 +2689,17 @@ void ceph_con_keepalive(struct ceph_connection *con) } EXPORT_SYMBOL(ceph_con_keepalive); +void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, + unsigned int page_count, size_t alignment) +{ + /* BUG_ON(msg->pages); */ + /* BUG_ON(msg->page_count); */ + + msg->pages = pages; + msg->page_count = page_count; + msg->page_alignment = alignment & ~PAGE_MASK; +} +EXPORT_SYMBOL(ceph_msg_data_set_pages); /* * construct a new message with given type, size diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 202af14dc6dc..a09d57134075 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1760,11 +1760,10 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { unsigned int page_count; - req->r_request->pages = osd_data->pages; page_count = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); - req->r_request->page_count = page_count; - req->r_request->page_alignment = osd_data->alignment; + ceph_msg_data_set_pages(req->r_request, osd_data->pages, + page_count, osd_data->alignment); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { req->r_request->bio = osd_data->bio; @@ -2135,9 +2134,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, } page_count = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); - m->pages = osd_data->pages; - m->page_count = page_count; - m->page_alignment = osd_data->alignment; + ceph_msg_data_set_pages(m, osd_data->pages, + osd_data->num_pages, osd_data->alignment); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { m->bio = osd_data->bio; -- cgit v1.2.3 From f1baeb2b9fc1c2c87ec02f1bf8cb88e108d4fbce Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 7 Mar 2013 15:38:26 -0600 Subject: libceph: set page info with byte length When setting page array information for message data, provide the byte length rather than the page count to ceph_msg_data_set_pages().
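A before/after sketch of a typical caller (hypothetical variable names), assuming a page array holding len bytes at byte offset align:

	/* before: the caller derived the page count itself */
	page_count = calc_pages_for(align, len);
	ceph_msg_data_set_pages(msg, pages, page_count, align);

	/* after: the caller passes the byte length and the helper
	 * derives the page count internally via calc_pages_for() */
	ceph_msg_data_set_pages(msg, pages, len, align);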
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- fs/ceph/mds_client.c | 2 +- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger.c | 4 ++-- net/ceph/osd_client.c | 14 ++++---------- 4 files changed, 8 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 90198a407023..03eb943ebce5 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1721,7 +1721,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - ceph_msg_data_set_pages(msg, req->r_pages, req->r_num_pages, 0); + ceph_msg_data_set_pages(msg, req->r_pages, req->r_data_len, 0); msg->hdr.data_len = cpu_to_le32(req->r_data_len); msg->hdr.data_off = cpu_to_le16(0); diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index aa463b9b30af..e6d20e892a88 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -220,7 +220,7 @@ extern void ceph_msg_revoke_incoming(struct ceph_msg *msg); extern void ceph_con_keepalive(struct ceph_connection *con); extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, - unsigned int page_count, size_t alignment); + size_t length, size_t alignment); extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index cec39cb623f0..fc59fcc9be77 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2690,13 +2690,13 @@ void ceph_con_keepalive(struct ceph_connection *con) EXPORT_SYMBOL(ceph_con_keepalive); void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, - unsigned int page_count, size_t alignment) + size_t length, size_t alignment) { /* BUG_ON(msg->pages); */ /* BUG_ON(msg->page_count); */ msg->pages = pages; - msg->page_count = page_count; + msg->page_count = calc_pages_for((u64)alignment, (u64)length); msg->page_alignment = alignment & ~PAGE_MASK; } EXPORT_SYMBOL(ceph_msg_data_set_pages); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a09d57134075..f29bedac7310 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1758,12 +1758,9 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, osd_data = &req->r_data_out; if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { - unsigned int page_count; - - page_count = calc_pages_for((u64)osd_data->alignment, - (u64)osd_data->length); + BUG_ON(osd_data->length > (u64) SIZE_MAX); ceph_msg_data_set_pages(req->r_request, osd_data->pages, - page_count, osd_data->alignment); + osd_data->length, osd_data->alignment); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { req->r_request->bio = osd_data->bio; @@ -2119,8 +2116,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_osd_data *osd_data = &req->r_data_in; if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { - unsigned int page_count; - if (osd_data->pages && unlikely(osd_data->length < data_len)) { @@ -2132,10 +2127,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = NULL; goto out; } - page_count = calc_pages_for((u64)osd_data->alignment, - (u64)osd_data->length); + BUG_ON(osd_data->length > (u64) SIZE_MAX); ceph_msg_data_set_pages(m, osd_data->pages, - osd_data->num_pages, osd_data->alignment); + osd_data->length, osd_data->alignment); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { m->bio = osd_data->bio; 
-- cgit v1.2.3 From 27fa83852ba275361eaa1a1283cf6704fa8191a6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: isolate other message data fields Define ceph_msg_data_set_pagelist(), ceph_msg_data_set_bio(), and ceph_msg_data_set_trail() to clearly abstract the assignment of the remaining data-related fields in a ceph message structure. Use the new functions in the osd client and mds client. This partially resolves: http://tracker.ceph.com/issues/4263 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- fs/ceph/mds_client.c | 2 +- include/linux/ceph/messenger.h | 5 +++++ net/ceph/messenger.c | 28 ++++++++++++++++++++++++++++ net/ceph/osd_client.c | 6 +++--- 4 files changed, 37 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 03eb943ebce5..3b2aa8702ae0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2603,7 +2603,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail; } - reply->pagelist = pagelist; + ceph_msg_data_set_pagelist(reply, pagelist); if (recon_state.flock) reply->hdr.version = cpu_to_le16(2); reply->hdr.data_len = cpu_to_le32(pagelist->length); diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index e6d20e892a88..9d9be4682ac3 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -221,6 +221,11 @@ extern void ceph_con_keepalive(struct ceph_connection *con); extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment); +extern void ceph_msg_data_set_pagelist(struct ceph_msg *msg, + struct ceph_pagelist *pagelist); +extern void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio); +extern void ceph_msg_data_set_trail(struct ceph_msg *msg, + struct ceph_pagelist *trail); extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index fc59fcc9be77..d1183536d5a8 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2701,6 +2701,34 @@ void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, } EXPORT_SYMBOL(ceph_msg_data_set_pages); +void ceph_msg_data_set_pagelist(struct ceph_msg *msg, + struct ceph_pagelist *pagelist) +{ + /* BUG_ON(!pagelist); */ + /* BUG_ON(msg->pagelist); */ + + msg->pagelist = pagelist; +} +EXPORT_SYMBOL(ceph_msg_data_set_pagelist); + +void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio) +{ + /* BUG_ON(!bio); */ + /* BUG_ON(msg->bio); */ + + msg->bio = bio; +} +EXPORT_SYMBOL(ceph_msg_data_set_bio); + +void ceph_msg_data_set_trail(struct ceph_msg *msg, struct ceph_pagelist *trail) +{ + /* BUG_ON(!trail); */ + /* BUG_ON(msg->trail); */ + + msg->trail = trail; +} +EXPORT_SYMBOL(ceph_msg_data_set_trail); + /* * construct a new message with given type, size * the new msg has a ref count of 1. 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f29bedac7310..387e3123d1ed 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1763,12 +1763,12 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, osd_data->length, osd_data->alignment); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { - req->r_request->bio = osd_data->bio; + ceph_msg_data_set_bio(req->r_request, osd_data->bio); #endif } else { BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); } - req->r_request->trail = &req->r_trail; + ceph_msg_data_set_trail(req->r_request, &req->r_trail); register_request(osdc, req); @@ -2132,7 +2132,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, osd_data->length, osd_data->alignment); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { - m->bio = osd_data->bio; + ceph_msg_data_set_bio(m, osd_data->bio); #endif } } -- cgit v1.2.3 From ebf18f47093e968105767eed4a0aa155e86b224e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 4 Mar 2013 22:29:57 -0600 Subject: ceph: only set message data pointers if non-empty Change it so we only assign outgoing data information for messages if there is outgoing data to send. This then allows us to add a few more (currently commented-out) assertions. This is related to: http://tracker.ceph.com/issues/4284 Signed-off-by: Alex Elder Reviewed-by: Greg Farnum --- fs/ceph/mds_client.c | 13 ++++++++++--- net/ceph/messenger.c | 4 ++++ net/ceph/osd_client.c | 9 ++++++--- 3 files changed, 20 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3b2aa8702ae0..600d770d70f7 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1721,7 +1721,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - ceph_msg_data_set_pages(msg, req->r_pages, req->r_data_len, 0); + if (req->r_data_len) { + /* outbound data set only by ceph_sync_setxattr() */ + BUG_ON(!req->r_pages); + ceph_msg_data_set_pages(msg, req->r_pages, req->r_data_len, 0); + } msg->hdr.data_len = cpu_to_le32(req->r_data_len); msg->hdr.data_off = cpu_to_le16(0); @@ -2603,10 +2607,13 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail; } - ceph_msg_data_set_pagelist(reply, pagelist); if (recon_state.flock) reply->hdr.version = cpu_to_le16(2); - reply->hdr.data_len = cpu_to_le32(pagelist->length); + if (pagelist->length) { + /* set up outbound data if we have any */ + reply->hdr.data_len = cpu_to_le32(pagelist->length); + ceph_msg_data_set_pagelist(reply, pagelist); + } ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d1183536d5a8..1965d785cf83 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2692,6 +2692,8 @@ EXPORT_SYMBOL(ceph_con_keepalive); void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment) { + /* BUG_ON(!pages); */ + /* BUG_ON(!length); */ /* BUG_ON(msg->pages); */ /* BUG_ON(msg->page_count); */ @@ -2705,6 +2707,7 @@ void ceph_msg_data_set_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist) { /* BUG_ON(!pagelist); */ + /* BUG_ON(!pagelist->length); */ /* BUG_ON(msg->pagelist); */ msg->pagelist = pagelist; @@ -2723,6 +2726,7 @@ EXPORT_SYMBOL(ceph_msg_data_set_bio); void ceph_msg_data_set_trail(struct ceph_msg *msg, struct ceph_pagelist 
*trail) { /* BUG_ON(!trail); */ + /* BUG_ON(!trail->length); */ /* BUG_ON(msg->trail); */ msg->trail = trail; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 387e3123d1ed..4402e917b9b1 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1759,8 +1759,10 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, osd_data = &req->r_data_out; if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { BUG_ON(osd_data->length > (u64) SIZE_MAX); - ceph_msg_data_set_pages(req->r_request, osd_data->pages, - osd_data->length, osd_data->alignment); + if (osd_data->length) + ceph_msg_data_set_pages(req->r_request, + osd_data->pages, osd_data->length, + osd_data->alignment); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { ceph_msg_data_set_bio(req->r_request, osd_data->bio); @@ -1768,7 +1770,8 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, } else { BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); } - ceph_msg_data_set_trail(req->r_request, &req->r_trail); + if (req->r_trail.length) + ceph_msg_data_set_trail(req->r_request, &req->r_trail); register_request(osdc, req); -- cgit v1.2.3 From 4a73ef27ad04f1b8ea23eb55e50b20fcc0530a6f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 7 Mar 2013 15:38:26 -0600 Subject: libceph: record message data byte length Record the number of bytes of data in a page array rather than the number of pages in the array. It can be assumed that the page array is of sufficient size to hold the number of bytes indicated (and offset by the indicated alignment). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger.c | 20 +++++++++----------- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 9d9be4682ac3..1991a6f9dc90 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -77,7 +77,7 @@ struct ceph_msg { struct page **pages; /* data payload. NOT OWNER. */ unsigned int page_alignment; /* io offset in first page */ - unsigned int page_count; /* # pages in array or list */ + size_t length; /* # data bytes in array or list */ struct ceph_pagelist *pagelist; /* instead of pages */ #ifdef CONFIG_BLOCK unsigned int bio_seg; /* current bio segment */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 1965d785cf83..f48e2af95005 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -809,11 +809,10 @@ static void prepare_write_message(struct ceph_connection *con) m->bio_iter = NULL; #endif - dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", + dout("prepare_write_message %p seq %lld type %d len %d+%d+%d (%zd)\n", m, con->out_seq, le16_to_cpu(m->hdr.type), le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - le32_to_cpu(m->hdr.data_len), - m->page_count); + le32_to_cpu(m->hdr.data_len), m->length); BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); /* tag + hdr + front + middle */ @@ -1091,9 +1090,8 @@ static int write_partial_msg_pages(struct ceph_connection *con) const size_t trail_len = (msg->trail ? 
msg->trail->length : 0); const size_t trail_off = data_len - trail_len; - dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", - con, msg, con->out_msg_pos.page, msg->page_count, - con->out_msg_pos.page_pos); + dout("write_partial_msg_pages %p msg %p page %d offset %d\n", + con, msg, con->out_msg_pos.page, con->out_msg_pos.page_pos); /* * Iterate through each page that contains data to be @@ -2695,10 +2693,10 @@ void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, /* BUG_ON(!pages); */ /* BUG_ON(!length); */ /* BUG_ON(msg->pages); */ - /* BUG_ON(msg->page_count); */ + /* BUG_ON(msg->length); */ msg->pages = pages; - msg->page_count = calc_pages_for((u64)alignment, (u64)length); + msg->length = length; msg->page_alignment = alignment & ~PAGE_MASK; } EXPORT_SYMBOL(ceph_msg_data_set_pages); @@ -2906,7 +2904,7 @@ void ceph_msg_last_put(struct kref *kref) ceph_buffer_put(m->middle); m->middle = NULL; } - m->page_count = 0; + m->length = 0; m->pages = NULL; if (m->pagelist) { @@ -2926,8 +2924,8 @@ EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { - pr_debug("msg_dump %p (front_max %d page_count %d)\n", msg, - msg->front_max, msg->page_count); + pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, + msg->front_max, msg->length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); -- cgit v1.2.3 From 70636773b7c3c73677e1d653629dace7c21d14bf Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 4 Mar 2013 18:29:06 -0600 Subject: libceph: set response data fields earlier When an incoming message is destined for the osd client, the messenger calls the osd client's alloc_msg method. That function looks up which request has the tid matching the incoming message, and returns the request message that was preallocated to receive the response. The response message is therefore known before the request is even started. Between the start of the request and the receipt of the response, the request and its data fields will not change, so there's no reason we need to hold off setting them. In fact it's preferable to set them just once because it's more obvious that they're unchanging. So set up the fields describing where incoming data is to land in a response message at the beginning of ceph_osdc_start_request(). Define a helper function that sets these fields, and use it to set the fields for both outgoing data in the request message and incoming data in the response. This resolves: http://tracker.ceph.com/issues/4284 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 4402e917b9b1..37d89614a61b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1744,32 +1744,36 @@ bad: return; } -/* - * Register request, send initial attempt. 
- */ -int ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail) +static void ceph_osdc_msg_data_set(struct ceph_msg *msg, + struct ceph_osd_data *osd_data) { - int rc = 0; - struct ceph_osd_data *osd_data; - - /* Set up outgoing data */ - - osd_data = &req->r_data_out; if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { BUG_ON(osd_data->length > (u64) SIZE_MAX); if (osd_data->length) - ceph_msg_data_set_pages(req->r_request, - osd_data->pages, osd_data->length, - osd_data->alignment); + ceph_msg_data_set_pages(msg, osd_data->pages, + osd_data->length, osd_data->alignment); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { - ceph_msg_data_set_bio(req->r_request, osd_data->bio); + ceph_msg_data_set_bio(msg, osd_data->bio); #endif } else { BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); } +} + +/* + * Register request, send initial attempt. + */ +int ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req, + bool nofail) +{ + int rc = 0; + + /* Set up response incoming data and request outgoing data fields */ + + ceph_osdc_msg_data_set(req->r_reply, &req->r_data_in); + ceph_osdc_msg_data_set(req->r_request, &req->r_data_out); if (req->r_trail.length) ceph_msg_data_set_trail(req->r_request, &req->r_trail); @@ -2130,13 +2134,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = NULL; goto out; } - BUG_ON(osd_data->length > (u64) SIZE_MAX); - ceph_msg_data_set_pages(m, osd_data->pages, - osd_data->length, osd_data->alignment); -#ifdef CONFIG_BLOCK - } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { - ceph_msg_data_set_bio(m, osd_data->bio); -#endif } } *skip = 0; -- cgit v1.2.3 From 07aa155878499f599a709eeecfaa0ca9ea764a88 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 4 Mar 2013 18:29:06 -0600 Subject: libceph: activate message data assignment checks The mds client no longer tries to assign zero-length message data, and the osd client no longer sets its data info more than once. This allows us to activate assertions in the messenger to verify these things never happen. 
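A minimal sketch of the caller-side discipline that makes these now-active assertions safe, distilled from the mds and osd client changes earlier in the series (illustrative only, not part of the patch itself):

	/* set a message's data fields at most once, and only when there is data */
	if (osd_data->length)
		ceph_msg_data_set_pages(req->r_request, osd_data->pages,
					osd_data->length, osd_data->alignment);
	if (req->r_trail.length)
		ceph_msg_data_set_trail(req->r_request, &req->r_trail);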
This resolves both of these: http://tracker.ceph.com/issues/4263 http://tracker.ceph.com/issues/4284 Signed-off-by: Alex Elder Reviewed-by: Greg Farnum --- net/ceph/messenger.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f48e2af95005..e75a03d25c9f 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2690,10 +2690,10 @@ EXPORT_SYMBOL(ceph_con_keepalive); void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment) { - /* BUG_ON(!pages); */ - /* BUG_ON(!length); */ - /* BUG_ON(msg->pages); */ - /* BUG_ON(msg->length); */ + BUG_ON(!pages); + BUG_ON(!length); + BUG_ON(msg->pages); + BUG_ON(msg->length); msg->pages = pages; msg->length = length; @@ -2704,9 +2704,9 @@ EXPORT_SYMBOL(ceph_msg_data_set_pages); void ceph_msg_data_set_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist) { - /* BUG_ON(!pagelist); */ - /* BUG_ON(!pagelist->length); */ - /* BUG_ON(msg->pagelist); */ + BUG_ON(!pagelist); + BUG_ON(!pagelist->length); + BUG_ON(msg->pagelist); msg->pagelist = pagelist; } @@ -2714,8 +2714,8 @@ EXPORT_SYMBOL(ceph_msg_data_set_pagelist); void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio) { - /* BUG_ON(!bio); */ - /* BUG_ON(msg->bio); */ + BUG_ON(!bio); + BUG_ON(msg->bio); msg->bio = bio; } @@ -2723,9 +2723,9 @@ EXPORT_SYMBOL(ceph_msg_data_set_bio); void ceph_msg_data_set_trail(struct ceph_msg *msg, struct ceph_pagelist *trail) { - /* BUG_ON(!trail); */ - /* BUG_ON(!trail->length); */ - /* BUG_ON(msg->trail); */ + BUG_ON(!trail); + BUG_ON(!trail->length); + BUG_ON(msg->trail); msg->trail = trail; } -- cgit v1.2.3 From 98a0370898799895aa8f55109f54c33fcd8196b0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Mar 2013 23:39:39 -0600 Subject: libceph: don't clear bio_iter in prepare_write_message() At one time it was necessary to clear a message's bio_iter field to avoid a bad pointer dereference in write_partial_msg_pages(). That no longer seems to be the case. Here's why. The message's bio fields represent (in this case) outgoing data. Between where the bio_iter is made NULL in prepare_write_message() and the call in that function to prepare_message_data(), the bio fields are never used. In prepare_message_data(), init_bio_iter() is called, and the result of that overwrites the value in the message's bio_iter field. Because it gets overwritten anyway, there is no need to set it to NULL. So don't do it. This resolves: http://tracker.ceph.com/issues/4402 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e75a03d25c9f..17d9321b7134 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -804,10 +804,6 @@ static void prepare_write_message(struct ceph_connection *con) m->hdr.seq = cpu_to_le64(++con->out_seq); m->needs_out_seq = false; } -#ifdef CONFIG_BLOCK - else - m->bio_iter = NULL; -#endif dout("prepare_write_message %p seq %lld type %d len %d+%d+%d (%zd)\n", m, con->out_seq, le16_to_cpu(m->hdr.type), -- cgit v1.2.3 From bae6acd9c65cbfeffc66a9f48ae91dca6e3aec85 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Mar 2013 23:39:38 -0600 Subject: libceph: use local variables for message positions There are several places where a message's out_msg_pos or in_msg_pos field is used repeatedly within a function.
Use a local pointer variable for this purpose to unclutter the code. This and the upcoming cleanup patches are related to: http://tracker.ceph.com/issues/4403 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 85 ++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 17d9321b7134..7788170524e3 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -725,22 +725,23 @@ static void iter_bio_next(struct bio **bio_iter, unsigned int *seg) static void prepare_write_message_data(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; + struct ceph_msg_pos *msg_pos = &con->out_msg_pos; BUG_ON(!msg); BUG_ON(!msg->hdr.data_len); /* initialize page iterator */ - con->out_msg_pos.page = 0; + msg_pos->page = 0; if (msg->pages) - con->out_msg_pos.page_pos = msg->page_alignment; + msg_pos->page_pos = msg->page_alignment; else - con->out_msg_pos.page_pos = 0; + msg_pos->page_pos = 0; #ifdef CONFIG_BLOCK if (msg->bio) init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); #endif - con->out_msg_pos.data_pos = 0; - con->out_msg_pos.did_page_crc = false; + msg_pos->data_pos = 0; + msg_pos->did_page_crc = false; con->out_more = 1; /* data + footer will follow */ } @@ -1022,19 +1023,20 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, size_t len, size_t sent, bool in_trail) { struct ceph_msg *msg = con->out_msg; + struct ceph_msg_pos *msg_pos = &con->out_msg_pos; BUG_ON(!msg); BUG_ON(!sent); - con->out_msg_pos.data_pos += sent; - con->out_msg_pos.page_pos += sent; + msg_pos->data_pos += sent; + msg_pos->page_pos += sent; if (sent < len) return; BUG_ON(sent != len); - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.page++; - con->out_msg_pos.did_page_crc = false; + msg_pos->page_pos = 0; + msg_pos->page++; + msg_pos->did_page_crc = false; if (in_trail) list_rotate_left(&msg->trail->head); else if (msg->pagelist) @@ -1049,18 +1051,19 @@ static void in_msg_pos_next(struct ceph_connection *con, size_t len, size_t received) { struct ceph_msg *msg = con->in_msg; + struct ceph_msg_pos *msg_pos = &con->in_msg_pos; BUG_ON(!msg); BUG_ON(!received); - con->in_msg_pos.data_pos += received; - con->in_msg_pos.page_pos += received; + msg_pos->data_pos += received; + msg_pos->page_pos += received; if (received < len) return; BUG_ON(received != len); - con->in_msg_pos.page_pos = 0; - con->in_msg_pos.page++; + msg_pos->page_pos = 0; + msg_pos->page++; #ifdef CONFIG_BLOCK if (msg->bio) iter_bio_next(&msg->bio_iter, &msg->bio_seg); @@ -1077,6 +1080,7 @@ static void in_msg_pos_next(struct ceph_connection *con, size_t len, static int write_partial_msg_pages(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; + struct ceph_msg_pos *msg_pos = &con->out_msg_pos; unsigned int data_len = le32_to_cpu(msg->hdr.data_len); size_t len; bool do_datacrc = !con->msgr->nocrc; @@ -1087,7 +1091,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) const size_t trail_off = data_len - trail_len; dout("write_partial_msg_pages %p msg %p page %d offset %d\n", - con, msg, con->out_msg_pos.page, con->out_msg_pos.page_pos); + con, msg, msg_pos->page, msg_pos->page_pos); /* * Iterate through each page that contains data to be @@ -1097,22 +1101,22 @@ static int write_partial_msg_pages(struct ceph_connection *con) * need to map the page. If we have no pages, they have * been revoked, so use the zero page. 
*/ - while (data_len > con->out_msg_pos.data_pos) { + while (data_len > msg_pos->data_pos) { struct page *page = NULL; int max_write = PAGE_SIZE; int bio_offset = 0; - in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off; + in_trail = in_trail || msg_pos->data_pos >= trail_off; if (!in_trail) - total_max_write = trail_off - con->out_msg_pos.data_pos; + total_max_write = trail_off - msg_pos->data_pos; if (in_trail) { - total_max_write = data_len - con->out_msg_pos.data_pos; + total_max_write = data_len - msg_pos->data_pos; page = list_first_entry(&msg->trail->head, struct page, lru); } else if (msg->pages) { - page = msg->pages[con->out_msg_pos.page]; + page = msg->pages[msg_pos->page]; } else if (msg->pagelist) { page = list_first_entry(&msg->pagelist->head, struct page, lru); @@ -1128,24 +1132,24 @@ static int write_partial_msg_pages(struct ceph_connection *con) } else { page = zero_page; } - len = min_t(int, max_write - con->out_msg_pos.page_pos, + len = min_t(int, max_write - msg_pos->page_pos, total_max_write); - if (do_datacrc && !con->out_msg_pos.did_page_crc) { + if (do_datacrc && !msg_pos->did_page_crc) { void *base; u32 crc = le32_to_cpu(msg->footer.data_crc); char *kaddr; kaddr = kmap(page); BUG_ON(kaddr == NULL); - base = kaddr + con->out_msg_pos.page_pos + bio_offset; + base = kaddr + msg_pos->page_pos + bio_offset; crc = crc32c(crc, base, len); kunmap(page); msg->footer.data_crc = cpu_to_le32(crc); - con->out_msg_pos.did_page_crc = true; + msg_pos->did_page_crc = true; } ret = ceph_tcp_sendpage(con->sock, page, - con->out_msg_pos.page_pos + bio_offset, + msg_pos->page_pos + bio_offset, len, true); if (ret <= 0) goto out; @@ -1803,22 +1807,23 @@ static int read_partial_message_pages(struct ceph_connection *con, struct page **pages, unsigned int data_len, bool do_datacrc) { + struct ceph_msg_pos *msg_pos = &con->in_msg_pos; struct page *page; void *p; int ret; int left; - left = min((int)(data_len - con->in_msg_pos.data_pos), - (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); + left = min((int)(data_len - msg_pos->data_pos), + (int)(PAGE_SIZE - msg_pos->page_pos)); /* (page) data */ BUG_ON(pages == NULL); - page = pages[con->in_msg_pos.page]; + page = pages[msg_pos->page]; p = kmap(page); - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, left); + ret = ceph_tcp_recvmsg(con->sock, p + msg_pos->page_pos, left); if (ret > 0 && do_datacrc) con->in_data_crc = crc32c(con->in_data_crc, - p + con->in_msg_pos.page_pos, ret); + p + msg_pos->page_pos, ret); kunmap(page); if (ret <= 0) return ret; @@ -1833,6 +1838,7 @@ static int read_partial_message_bio(struct ceph_connection *con, unsigned int data_len, bool do_datacrc) { struct ceph_msg *msg = con->in_msg; + struct ceph_msg_pos *msg_pos = &con->in_msg_pos; struct bio_vec *bv; struct page *page; void *p; @@ -1842,17 +1848,17 @@ static int read_partial_message_bio(struct ceph_connection *con, BUG_ON(!msg->bio_iter); bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); - left = min((int)(data_len - con->in_msg_pos.data_pos), - (int)(bv->bv_len - con->in_msg_pos.page_pos)); + left = min((int)(data_len - msg_pos->data_pos), + (int)(bv->bv_len - msg_pos->page_pos)); page = bv->bv_page; p = kmap(page) + bv->bv_offset; - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, left); + ret = ceph_tcp_recvmsg(con->sock, p + msg_pos->page_pos, left); if (ret > 0 && do_datacrc) con->in_data_crc = crc32c(con->in_data_crc, - p + con->in_msg_pos.page_pos, ret); + p + msg_pos->page_pos, ret); kunmap(page); if (ret <= 0) return 
ret; @@ -1869,6 +1875,7 @@ static int read_partial_message_bio(struct ceph_connection *con, static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; + struct ceph_msg_pos *msg_pos = &con->in_msg_pos; int size; int end; int ret; @@ -1949,12 +1956,12 @@ static int read_partial_message(struct ceph_connection *con) if (m->middle) m->middle->vec.iov_len = 0; - con->in_msg_pos.page = 0; + msg_pos->page = 0; if (m->pages) - con->in_msg_pos.page_pos = m->page_alignment; + msg_pos->page_pos = m->page_alignment; else - con->in_msg_pos.page_pos = 0; - con->in_msg_pos.data_pos = 0; + msg_pos->page_pos = 0; + msg_pos->data_pos = 0; #ifdef CONFIG_BLOCK if (m->bio) @@ -1978,7 +1985,7 @@ static int read_partial_message(struct ceph_connection *con) } /* (page) data */ - while (con->in_msg_pos.data_pos < data_len) { + while (msg_pos->data_pos < data_len) { if (m->pages) { ret = read_partial_message_pages(con, m->pages, data_len, do_datacrc); -- cgit v1.2.3 From 78625051b524e104332e69a9079d0ee9a2100cf2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Mar 2013 23:39:39 -0600 Subject: libceph: consolidate message prep code In prepare_write_message_data(), various fields are initialized in preparation for writing message data out. Meanwhile, in read_partial_message(), there is essentially the same block of code, operating on message variables associated with an incoming message. Generalize prepare_write_message_data() so it works for both incoming and outgoing messages, and use it in both spots. The did_page_crc is not used for input (so it's harmless to initialize it). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 7788170524e3..e8fa4497f424 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -722,11 +722,9 @@ static void iter_bio_next(struct bio **bio_iter, unsigned int *seg) } #endif -static void prepare_write_message_data(struct ceph_connection *con) +static void prepare_message_data(struct ceph_msg *msg, + struct ceph_msg_pos *msg_pos) { - struct ceph_msg *msg = con->out_msg; - struct ceph_msg_pos *msg_pos = &con->out_msg_pos; - BUG_ON(!msg); BUG_ON(!msg->hdr.data_len); @@ -742,7 +740,6 @@ static void prepare_write_message_data(struct ceph_connection *con) #endif msg_pos->data_pos = 0; msg_pos->did_page_crc = false; - con->out_more = 1; /* data + footer will follow */ } /* @@ -840,11 +837,13 @@ static void prepare_write_message(struct ceph_connection *con) /* is there a data payload?
*/ con->out_msg->footer.data_crc = 0; - if (m->hdr.data_len) - prepare_write_message_data(con); - else + if (m->hdr.data_len) { + prepare_message_data(con->out_msg, &con->out_msg_pos); + con->out_more = 1; /* data + footer will follow */ + } else { /* no, queue up footer too and be done */ prepare_write_message_footer(con); + } con_flag_set(con, CON_FLAG_WRITE_PENDING); } @@ -1956,17 +1955,10 @@ static int read_partial_message(struct ceph_connection *con) if (m->middle) m->middle->vec.iov_len = 0; - msg_pos->page = 0; - if (m->pages) - msg_pos->page_pos = m->page_alignment; - else - msg_pos->page_pos = 0; - msg_pos->data_pos = 0; + /* prepare for data payload, if any */ -#ifdef CONFIG_BLOCK - if (m->bio) - init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); -#endif + if (data_len) + prepare_message_data(con->in_msg, &con->in_msg_pos); } /* front */ -- cgit v1.2.3 From e387d525b0ceeecf07b074781eab77414dc9697e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Mar 2013 23:39:38 -0600 Subject: libceph: small write_partial_msg_pages() refactor Define local variables page_offset and length to represent the range of bytes within a page that will be sent by ceph_tcp_sendpage() in write_partial_msg_pages(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e8fa4497f424..813c29924d56 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1081,7 +1081,6 @@ static int write_partial_msg_pages(struct ceph_connection *con) struct ceph_msg *msg = con->out_msg; struct ceph_msg_pos *msg_pos = &con->out_msg_pos; unsigned int data_len = le32_to_cpu(msg->hdr.data_len); - size_t len; bool do_datacrc = !con->msgr->nocrc; int ret; int total_max_write; @@ -1102,6 +1101,8 @@ static int write_partial_msg_pages(struct ceph_connection *con) */ while (data_len > msg_pos->data_pos) { struct page *page = NULL; + size_t page_offset; + size_t length; int max_write = PAGE_SIZE; int bio_offset = 0; @@ -1131,9 +1132,10 @@ static int write_partial_msg_pages(struct ceph_connection *con) } else { page = zero_page; } - len = min_t(int, max_write - msg_pos->page_pos, + length = min_t(int, max_write - msg_pos->page_pos, total_max_write); + page_offset = msg_pos->page_pos + bio_offset; if (do_datacrc && !msg_pos->did_page_crc) { void *base; u32 crc = le32_to_cpu(msg->footer.data_crc); @@ -1141,19 +1143,18 @@ static int write_partial_msg_pages(struct ceph_connection *con) kaddr = kmap(page); BUG_ON(kaddr == NULL); - base = kaddr + msg_pos->page_pos + bio_offset; - crc = crc32c(crc, base, len); + base = kaddr + page_offset; + crc = crc32c(crc, base, length); kunmap(page); msg->footer.data_crc = cpu_to_le32(crc); msg_pos->did_page_crc = true; } - ret = ceph_tcp_sendpage(con->sock, page, - msg_pos->page_pos + bio_offset, - len, true); + ret = ceph_tcp_sendpage(con->sock, page, page_offset, + length, true); if (ret <= 0) goto out; - out_msg_pos_next(con, page, len, (size_t) ret, in_trail); + out_msg_pos_next(con, page, length, (size_t) ret, in_trail); } dout("write_partial_msg_pages %p msg %p done\n", con, msg); -- cgit v1.2.3 From 34d2d2006cc82fd21f716e10568b8c8b4ef61c0e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 20:58:59 -0600 Subject: libceph: encapsulate reading message data Pull the code that reads the data portion into a message into a separate function read_partial_msg_data(). 
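With the loop pulled out, the call site in read_partial_message() reduces to a guarded dispatch, roughly (a sketch matching the hunk below):

	/* (page) data */
	if (data_len) {
		ret = read_partial_msg_data(con);
		if (ret <= 0)
			return ret;
	}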
Rename write_partial_msg_pages() to be write_partial_message_data() to match its read counterpart, and to reflect its more generic purpose. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 63 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 813c29924d56..6e0bd36d676a 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1076,7 +1076,7 @@ static void in_msg_pos_next(struct ceph_connection *con, size_t len, * 0 -> socket full, but more to do * <0 -> error */ -static int write_partial_msg_pages(struct ceph_connection *con) +static int write_partial_message_data(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; struct ceph_msg_pos *msg_pos = &con->out_msg_pos; @@ -1088,7 +1088,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) const size_t trail_len = (msg->trail ? msg->trail->length : 0); const size_t trail_off = data_len - trail_len; - dout("write_partial_msg_pages %p msg %p page %d offset %d\n", + dout("%s %p msg %p page %d offset %d\n", __func__, con, msg, msg_pos->page, msg_pos->page_pos); /* @@ -1157,7 +1157,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) out_msg_pos_next(con, page, length, (size_t) ret, in_trail); } - dout("write_partial_msg_pages %p msg %p done\n", con, msg); + dout("%s %p msg %p done\n", __func__, con, msg); /* prepare and queue up footer, too */ if (!do_datacrc) @@ -1869,13 +1869,44 @@ static int read_partial_message_bio(struct ceph_connection *con, } #endif +static int read_partial_msg_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + struct ceph_msg_pos *msg_pos = &con->in_msg_pos; + const bool do_datacrc = !con->msgr->nocrc; + unsigned int data_len; + int ret; + + BUG_ON(!msg); + + data_len = le32_to_cpu(con->in_hdr.data_len); + while (msg_pos->data_pos < data_len) { + if (msg->pages) { + ret = read_partial_message_pages(con, msg->pages, + data_len, do_datacrc); + if (ret <= 0) + return ret; +#ifdef CONFIG_BLOCK + } else if (msg->bio) { + ret = read_partial_message_bio(con, + data_len, do_datacrc); + if (ret <= 0) + return ret; +#endif + } else { + BUG_ON(1); + } + } + + return 1; /* must return > 0 to indicate success */ +} + /* * read (part of) a message. */ static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; - struct ceph_msg_pos *msg_pos = &con->in_msg_pos; int size; int end; int ret; @@ -1978,22 +2009,10 @@ static int read_partial_message(struct ceph_connection *con) } /* (page) data */ - while (msg_pos->data_pos < data_len) { - if (m->pages) { - ret = read_partial_message_pages(con, m->pages, - data_len, do_datacrc); - if (ret <= 0) - return ret; -#ifdef CONFIG_BLOCK - } else if (m->bio) { - ret = read_partial_message_bio(con, - data_len, do_datacrc); - if (ret <= 0) - return ret; -#endif - } else { - BUG_ON(1); - } + if (data_len) { + ret = read_partial_msg_data(con); + if (ret <= 0) + return ret; } /* footer */ @@ -2119,13 +2138,13 @@ more_kvec: goto do_next; } - ret = write_partial_msg_pages(con); + ret = write_partial_message_data(con); if (ret == 1) goto more_kvec; /* we need to send the footer, too! 
*/ if (ret == 0) goto out; if (ret < 0) { - dout("try_write write_partial_msg_pages err %d\n", + dout("try_write write_partial_message_data err %d\n", ret); goto out; } -- cgit v1.2.3 From afb3d90e205140415477d501ff9e2a33ff0b197f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 20:58:59 -0600 Subject: libceph: define and use ceph_tcp_recvpage() Define a new function ceph_tcp_recvpage() that behaves in a way comparable to ceph_tcp_sendpage(). Rearrange the code in both read_partial_message_pages() and read_partial_message_bio() so they have matching structure, (similar to what's in write_partial_msg_pages()), and use this new function. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 86 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 6e0bd36d676a..3120a6c81a76 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -471,6 +471,22 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) return r; } +static int ceph_tcp_recvpage(struct socket *sock, struct page *page, + int page_offset, size_t length) +{ + void *kaddr; + int ret; + + BUG_ON(page_offset + length > PAGE_SIZE); + + kaddr = kmap(page); + BUG_ON(!kaddr); + ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length); + kunmap(page); + + return ret; +} + /* * write something. @more is true if caller will be sending more data * shortly. @@ -1809,26 +1825,36 @@ static int read_partial_message_pages(struct ceph_connection *con, { struct ceph_msg_pos *msg_pos = &con->in_msg_pos; struct page *page; - void *p; + size_t page_offset; + size_t length; + unsigned int left; int ret; - int left; - left = min((int)(data_len - msg_pos->data_pos), - (int)(PAGE_SIZE - msg_pos->page_pos)); /* (page) data */ BUG_ON(pages == NULL); page = pages[msg_pos->page]; - p = kmap(page); - ret = ceph_tcp_recvmsg(con->sock, p + msg_pos->page_pos, left); - if (ret > 0 && do_datacrc) - con->in_data_crc = - crc32c(con->in_data_crc, - p + msg_pos->page_pos, ret); - kunmap(page); + page_offset = msg_pos->page_pos; + BUG_ON(msg_pos->data_pos >= data_len); + left = data_len - msg_pos->data_pos; + BUG_ON(page_offset >= PAGE_SIZE); + length = min_t(unsigned int, PAGE_SIZE - page_offset, left); + + ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); if (ret <= 0) return ret; - in_msg_pos_next(con, left, ret); + if (do_datacrc) { + void *kaddr; + void *base; + + kaddr = kmap(page); + BUG_ON(!kaddr); + base = kaddr + page_offset; + con->in_data_crc = crc32c(con->in_data_crc, base, ret); + kunmap(page); + } + + in_msg_pos_next(con, length, ret); return ret; } @@ -1841,29 +1867,37 @@ static int read_partial_message_bio(struct ceph_connection *con, struct ceph_msg_pos *msg_pos = &con->in_msg_pos; struct bio_vec *bv; struct page *page; - void *p; - int ret, left; + size_t page_offset; + size_t length; + unsigned int left; + int ret; BUG_ON(!msg); BUG_ON(!msg->bio_iter); bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); - - left = min((int)(data_len - msg_pos->data_pos), - (int)(bv->bv_len - msg_pos->page_pos)); - page = bv->bv_page; - p = kmap(page) + bv->bv_offset; + page_offset = bv->bv_offset + msg_pos->page_pos; + BUG_ON(msg_pos->data_pos >= data_len); + left = data_len - msg_pos->data_pos; + BUG_ON(msg_pos->page_pos >= bv->bv_len); + length = min_t(unsigned int, bv->bv_len - msg_pos->page_pos, left); - ret = ceph_tcp_recvmsg(con->sock, p + msg_pos->page_pos, 
left); - if (ret > 0 && do_datacrc) - con->in_data_crc = - crc32c(con->in_data_crc, - p + msg_pos->page_pos, ret); - kunmap(page); + ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); if (ret <= 0) return ret; - in_msg_pos_next(con, left, ret); + if (do_datacrc) { + void *kaddr; + void *base; + + kaddr = kmap(page); + BUG_ON(!kaddr); + base = kaddr + page_offset; + con->in_data_crc = crc32c(con->in_data_crc, base, ret); + kunmap(page); + } + + in_msg_pos_next(con, length, ret); return ret; } -- cgit v1.2.3 From 35b6280899424a0faf5410ce1ee86f9682528e6c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 20:59:00 -0600 Subject: libceph: define and use ceph_crc32c_page() Factor out a common block of code that updates a CRC calculation over a range of data in a page. This and the preceding patches are related to: http://tracker.ceph.com/issues/4403 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3120a6c81a76..f70bc92348d9 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1085,6 +1085,19 @@ static void in_msg_pos_next(struct ceph_connection *con, size_t len, #endif /* CONFIG_BLOCK */ } +static u32 ceph_crc32c_page(u32 crc, struct page *page, + unsigned int page_offset, + unsigned int length) +{ + char *kaddr; + + kaddr = kmap(page); + BUG_ON(kaddr == NULL); + crc = crc32c(crc, kaddr + page_offset, length); + kunmap(page); + + return crc; +} /* * Write as much message data payload as we can. If we finish, queue * up the footer. @@ -1153,15 +1166,9 @@ static int write_partial_message_data(struct ceph_connection *con) page_offset = msg_pos->page_pos + bio_offset; if (do_datacrc && !msg_pos->did_page_crc) { - void *base; u32 crc = le32_to_cpu(msg->footer.data_crc); - char *kaddr; - kaddr = kmap(page); - BUG_ON(kaddr == NULL); - base = kaddr + page_offset; - crc = crc32c(crc, base, length); - kunmap(page); + crc = ceph_crc32c_page(crc, page, page_offset, length); msg->footer.data_crc = cpu_to_le32(crc); msg_pos->did_page_crc = true; } @@ -1843,16 +1850,9 @@ static int read_partial_message_pages(struct ceph_connection *con, if (ret <= 0) return ret; - if (do_datacrc) { - void *kaddr; - void *base; - - kaddr = kmap(page); - BUG_ON(!kaddr); - base = kaddr + page_offset; - con->in_data_crc = crc32c(con->in_data_crc, base, ret); - kunmap(page); - } + if (do_datacrc) + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, page, + page_offset, ret); in_msg_pos_next(con, length, ret); @@ -1886,16 +1886,9 @@ static int read_partial_message_bio(struct ceph_connection *con, if (ret <= 0) return ret; - if (do_datacrc) { - void *kaddr; - void *base; - - kaddr = kmap(page); - BUG_ON(!kaddr); - base = kaddr + page_offset; - con->in_data_crc = crc32c(con->in_data_crc, base, ret); - kunmap(page); - } + if (do_datacrc) + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, page, + page_offset, ret); in_msg_pos_next(con, length, ret); -- cgit v1.2.3 From 97fb1c7f6637ee61c90b8bc186d464cfd426b063 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:16 -0600 Subject: libceph: define ceph_msg_has_*() data macros Define and use macros ceph_msg_has_*() to determine whether to operate on the pages, pagelist, bio, and trail fields of a message. 
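Each macro is just a null test on the corresponding field, so call sites ask the message what kind of data it carries instead of poking at the fields directly (simplified sketch; the real definitions are in the messenger.h hunk below):

	#define ceph_msg_has_pages(m)	((m)->pages != NULL)

	if (ceph_msg_has_pages(msg))
		page = msg->pages[msg_pos->page];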
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 7 +++++++ net/ceph/messenger.c | 44 ++++++++++++++++++++++++++---------------- 2 files changed, 34 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 1991a6f9dc90..889fe4720133 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -64,6 +64,13 @@ struct ceph_messenger { u32 required_features; }; +#define ceph_msg_has_pages(m) ((m)->pages != NULL) +#define ceph_msg_has_pagelist(m) ((m)->pagelist != NULL) +#ifdef CONFIG_BLOCK +#define ceph_msg_has_bio(m) ((m)->bio != NULL) +#endif /* CONFIG_BLOCK */ +#define ceph_msg_has_trail(m) ((m)->trail != NULL) + /* * a single message. it contains a header (src, dest, message type, etc.), * footer (crc values, mainly), a "front" message body, and possibly a diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f70bc92348d9..c74b5289778a 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -746,12 +746,12 @@ static void prepare_message_data(struct ceph_msg *msg, /* initialize page iterator */ msg_pos->page = 0; - if (msg->pages) + if (ceph_msg_has_pages(msg)) msg_pos->page_pos = msg->page_alignment; else msg_pos->page_pos = 0; #ifdef CONFIG_BLOCK - if (msg->bio) + if (ceph_msg_has_bio(msg)) init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); #endif msg_pos->data_pos = 0; @@ -1052,14 +1052,16 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, msg_pos->page_pos = 0; msg_pos->page++; msg_pos->did_page_crc = false; - if (in_trail) + if (in_trail) { + BUG_ON(!ceph_msg_has_trail(msg)); list_rotate_left(&msg->trail->head); - else if (msg->pagelist) + } else if (ceph_msg_has_pagelist(msg)) { list_rotate_left(&msg->pagelist->head); #ifdef CONFIG_BLOCK - else if (msg->bio) + } else if (ceph_msg_has_bio(msg)) { iter_bio_next(&msg->bio_iter, &msg->bio_seg); #endif + } } static void in_msg_pos_next(struct ceph_connection *con, size_t len, @@ -1114,8 +1116,13 @@ static int write_partial_message_data(struct ceph_connection *con) int ret; int total_max_write; bool in_trail = false; - const size_t trail_len = (msg->trail ? 
msg->trail->length : 0); - const size_t trail_off = data_len - trail_len; + size_t trail_len = 0; + size_t trail_off = data_len; + + if (ceph_msg_has_trail(msg)) { + trail_len = msg->trail->length; + trail_off -= trail_len; + } dout("%s %p msg %p page %d offset %d\n", __func__, con, msg, msg_pos->page, msg_pos->page_pos); @@ -1140,17 +1147,17 @@ static int write_partial_message_data(struct ceph_connection *con) total_max_write = trail_off - msg_pos->data_pos; if (in_trail) { + BUG_ON(!ceph_msg_has_trail(msg)); total_max_write = data_len - msg_pos->data_pos; - page = list_first_entry(&msg->trail->head, struct page, lru); - } else if (msg->pages) { + } else if (ceph_msg_has_pages(msg)) { page = msg->pages[msg_pos->page]; - } else if (msg->pagelist) { + } else if (ceph_msg_has_pagelist(msg)) { page = list_first_entry(&msg->pagelist->head, struct page, lru); #ifdef CONFIG_BLOCK - } else if (msg->bio) { + } else if (ceph_msg_has_bio(msg)) { struct bio_vec *bv; bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); @@ -1908,13 +1915,13 @@ static int read_partial_msg_data(struct ceph_connection *con) data_len = le32_to_cpu(con->in_hdr.data_len); while (msg_pos->data_pos < data_len) { - if (msg->pages) { + if (ceph_msg_has_pages(msg)) { ret = read_partial_message_pages(con, msg->pages, data_len, do_datacrc); if (ret <= 0) return ret; #ifdef CONFIG_BLOCK - } else if (msg->bio) { + } else if (ceph_msg_has_bio(msg)) { ret = read_partial_message_bio(con, data_len, do_datacrc); if (ret <= 0) @@ -2946,16 +2953,19 @@ void ceph_msg_last_put(struct kref *kref) ceph_buffer_put(m->middle); m->middle = NULL; } - m->length = 0; - m->pages = NULL; + if (ceph_msg_has_pages(m)) { + m->length = 0; + m->pages = NULL; + } - if (m->pagelist) { + if (ceph_msg_has_pagelist(m)) { ceph_pagelist_release(m->pagelist); kfree(m->pagelist); m->pagelist = NULL; } - m->trail = NULL; + if (ceph_msg_has_trail(m)) + m->trail = NULL; if (m->pool) ceph_msgpool_put(m->pool, m); -- cgit v1.2.3 From f9e15777afd87585f2222dfd446c2e52deb65eba Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:16 -0600 Subject: libceph: be explicit about message data representation A ceph message has a data payload portion. The memory for that data (either the source of data to send or the location to place data that is received) is specified in several ways. The ceph_msg structure includes fields for all of those ways, but this mispresents the fact that not all of them are used at a time. Specifically, the data in a message can be in: - an array of pages - a list of pages - a list of Linux bios - a second list of pages (the "trail") (The two page lists are currently only ever used for outgoing data.) Impose more structure on the ceph message, making the grouping of some of these fields explicit. Shorten the name of the "page_alignment" field. 
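The regrouped layout looks roughly like this (a sketch of the shape only; the complete definition is in the messenger.h hunk below):

	struct ceph_msg {
		...
		/* data payload */
		struct {
			struct page **pages;	/* NOT OWNER. */
			size_t length;		/* # data bytes in array */
			unsigned int alignment;	/* io offset in first page */
		} p;				/* page array */
		struct {
			struct ceph_pagelist *pagelist;
		} l;				/* pagelist */
		...
	};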
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 33 ++++++++++++-------- net/ceph/messenger.c | 68 +++++++++++++++++++++--------------------- 2 files changed, 55 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 889fe4720133..fb2b18a20c13 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -64,12 +64,12 @@ struct ceph_messenger { u32 required_features; }; -#define ceph_msg_has_pages(m) ((m)->pages != NULL) -#define ceph_msg_has_pagelist(m) ((m)->pagelist != NULL) +#define ceph_msg_has_pages(m) ((m)->p.pages != NULL) +#define ceph_msg_has_pagelist(m) ((m)->l.pagelist != NULL) #ifdef CONFIG_BLOCK -#define ceph_msg_has_bio(m) ((m)->bio != NULL) +#define ceph_msg_has_bio(m) ((m)->b.bio != NULL) #endif /* CONFIG_BLOCK */ -#define ceph_msg_has_trail(m) ((m)->trail != NULL) +#define ceph_msg_has_trail(m) ((m)->t.trail != NULL) /* * a single message. it contains a header (src, dest, message type, etc.), @@ -82,16 +82,25 @@ struct ceph_msg { struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; - struct page **pages; /* data payload. NOT OWNER. */ - unsigned int page_alignment; /* io offset in first page */ - size_t length; /* # data bytes in array or list */ - struct ceph_pagelist *pagelist; /* instead of pages */ + /* data payload */ + struct { + struct page **pages; /* NOT OWNER. */ + size_t length; /* # data bytes in array */ + unsigned int alignment; /* first page */ + } p; + struct { + struct ceph_pagelist *pagelist; + } l; #ifdef CONFIG_BLOCK - unsigned int bio_seg; /* current bio segment */ - struct bio *bio; /* instead of pages/pagelist */ - struct bio *bio_iter; /* bio iterator */ + struct { + struct bio *bio_iter; /* iterator */ + struct bio *bio; + unsigned int bio_seg; /* current seg in bio */ + } b; #endif /* CONFIG_BLOCK */ - struct ceph_pagelist *trail; /* the trailing part of the data */ + struct { + struct ceph_pagelist *trail; /* trailing part of data */ + } t; struct ceph_connection *con; struct list_head list_head; /* links for connection lists */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index c74b5289778a..f485455f05a8 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -747,12 +747,12 @@ static void prepare_message_data(struct ceph_msg *msg, /* initialize page iterator */ msg_pos->page = 0; if (ceph_msg_has_pages(msg)) - msg_pos->page_pos = msg->page_alignment; + msg_pos->page_pos = msg->p.alignment; else msg_pos->page_pos = 0; #ifdef CONFIG_BLOCK if (ceph_msg_has_bio(msg)) - init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); + init_bio_iter(msg->b.bio, &msg->b.bio_iter, &msg->b.bio_seg); #endif msg_pos->data_pos = 0; msg_pos->did_page_crc = false; @@ -822,7 +822,7 @@ static void prepare_write_message(struct ceph_connection *con) dout("prepare_write_message %p seq %lld type %d len %d+%d+%d (%zd)\n", m, con->out_seq, le16_to_cpu(m->hdr.type), le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - le32_to_cpu(m->hdr.data_len), m->length); + le32_to_cpu(m->hdr.data_len), m->p.length); BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); /* tag + hdr + front + middle */ @@ -1054,12 +1054,12 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, msg_pos->did_page_crc = false; if (in_trail) { BUG_ON(!ceph_msg_has_trail(msg)); - list_rotate_left(&msg->trail->head); + list_rotate_left(&msg->t.trail->head); } else if 
(ceph_msg_has_pagelist(msg)) { - list_rotate_left(&msg->pagelist->head); + list_rotate_left(&msg->l.pagelist->head); #ifdef CONFIG_BLOCK } else if (ceph_msg_has_bio(msg)) { - iter_bio_next(&msg->bio_iter, &msg->bio_seg); + iter_bio_next(&msg->b.bio_iter, &msg->b.bio_seg); #endif } } @@ -1082,8 +1082,8 @@ static void in_msg_pos_next(struct ceph_connection *con, size_t len, msg_pos->page_pos = 0; msg_pos->page++; #ifdef CONFIG_BLOCK - if (msg->bio) - iter_bio_next(&msg->bio_iter, &msg->bio_seg); + if (msg->b.bio) + iter_bio_next(&msg->b.bio_iter, &msg->b.bio_seg); #endif /* CONFIG_BLOCK */ } @@ -1120,7 +1120,7 @@ static int write_partial_message_data(struct ceph_connection *con) size_t trail_off = data_len; if (ceph_msg_has_trail(msg)) { - trail_len = msg->trail->length; + trail_len = msg->t.trail->length; trail_off -= trail_len; } @@ -1149,18 +1149,18 @@ static int write_partial_message_data(struct ceph_connection *con) if (in_trail) { BUG_ON(!ceph_msg_has_trail(msg)); total_max_write = data_len - msg_pos->data_pos; - page = list_first_entry(&msg->trail->head, + page = list_first_entry(&msg->t.trail->head, struct page, lru); } else if (ceph_msg_has_pages(msg)) { - page = msg->pages[msg_pos->page]; + page = msg->p.pages[msg_pos->page]; } else if (ceph_msg_has_pagelist(msg)) { - page = list_first_entry(&msg->pagelist->head, + page = list_first_entry(&msg->l.pagelist->head, struct page, lru); #ifdef CONFIG_BLOCK } else if (ceph_msg_has_bio(msg)) { struct bio_vec *bv; - bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); + bv = bio_iovec_idx(msg->b.bio_iter, msg->b.bio_seg); page = bv->bv_page; bio_offset = bv->bv_offset; max_write = bv->bv_len; @@ -1880,8 +1880,8 @@ static int read_partial_message_bio(struct ceph_connection *con, int ret; BUG_ON(!msg); - BUG_ON(!msg->bio_iter); - bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); + BUG_ON(!msg->b.bio_iter); + bv = bio_iovec_idx(msg->b.bio_iter, msg->b.bio_seg); page = bv->bv_page; page_offset = bv->bv_offset + msg_pos->page_pos; BUG_ON(msg_pos->data_pos >= data_len); @@ -1916,7 +1916,7 @@ static int read_partial_msg_data(struct ceph_connection *con) data_len = le32_to_cpu(con->in_hdr.data_len); while (msg_pos->data_pos < data_len) { if (ceph_msg_has_pages(msg)) { - ret = read_partial_message_pages(con, msg->pages, + ret = read_partial_message_pages(con, msg->p.pages, data_len, do_datacrc); if (ret <= 0) return ret; @@ -2741,12 +2741,12 @@ void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, { BUG_ON(!pages); BUG_ON(!length); - BUG_ON(msg->pages); - BUG_ON(msg->length); + BUG_ON(msg->p.pages); + BUG_ON(msg->p.length); - msg->pages = pages; - msg->length = length; - msg->page_alignment = alignment & ~PAGE_MASK; + msg->p.pages = pages; + msg->p.length = length; + msg->p.alignment = alignment & ~PAGE_MASK; } EXPORT_SYMBOL(ceph_msg_data_set_pages); @@ -2755,18 +2755,18 @@ void ceph_msg_data_set_pagelist(struct ceph_msg *msg, { BUG_ON(!pagelist); BUG_ON(!pagelist->length); - BUG_ON(msg->pagelist); + BUG_ON(msg->l.pagelist); - msg->pagelist = pagelist; + msg->l.pagelist = pagelist; } EXPORT_SYMBOL(ceph_msg_data_set_pagelist); void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio) { BUG_ON(!bio); - BUG_ON(msg->bio); + BUG_ON(msg->b.bio); - msg->bio = bio; + msg->b.bio = bio; } EXPORT_SYMBOL(ceph_msg_data_set_bio); @@ -2774,9 +2774,9 @@ void ceph_msg_data_set_trail(struct ceph_msg *msg, struct ceph_pagelist *trail) { BUG_ON(!trail); BUG_ON(!trail->length); - BUG_ON(msg->trail); + BUG_ON(msg->t.trail); - msg->trail = trail; + 
msg->t.trail = trail; } EXPORT_SYMBOL(ceph_msg_data_set_trail); @@ -2954,18 +2954,18 @@ void ceph_msg_last_put(struct kref *kref) m->middle = NULL; } if (ceph_msg_has_pages(m)) { - m->length = 0; - m->pages = NULL; + m->p.length = 0; + m->p.pages = NULL; } if (ceph_msg_has_pagelist(m)) { - ceph_pagelist_release(m->pagelist); - kfree(m->pagelist); - m->pagelist = NULL; + ceph_pagelist_release(m->l.pagelist); + kfree(m->l.pagelist); + m->l.pagelist = NULL; } if (ceph_msg_has_trail(m)) - m->trail = NULL; + m->t.trail = NULL; if (m->pool) ceph_msgpool_put(m->pool, m); @@ -2977,7 +2977,7 @@ EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, - msg->front_max, msg->length); + msg->front_max, msg->p.length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); -- cgit v1.2.3 From 437945094fed0deb1810e8da95465c8f26bc6f80 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:16 -0600 Subject: libceph: abstract message data Group the types of message data into an abstract structure with a type indicator and a union containing fields appropriate to the type of data it represents. Use this to represent the pages, pagelist, bio, and trail in a ceph message. Verify message data is of type NONE in ceph_msg_data_set_*() routines. Since information about message data of type NONE really should not be interpreted, get rid of the other assertions in those functions. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 71 ++++++++++++++++++++++++++++++------------ net/ceph/messenger.c | 33 ++++++++++++++------ 2 files changed, 74 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index fb2b18a20c13..5860dd0c2caf 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -64,12 +64,55 @@ struct ceph_messenger { u32 required_features; }; -#define ceph_msg_has_pages(m) ((m)->p.pages != NULL) -#define ceph_msg_has_pagelist(m) ((m)->l.pagelist != NULL) +#define ceph_msg_has_pages(m) ((m)->p.type == CEPH_MSG_DATA_PAGES) +#define ceph_msg_has_pagelist(m) ((m)->l.type == CEPH_MSG_DATA_PAGELIST) #ifdef CONFIG_BLOCK -#define ceph_msg_has_bio(m) ((m)->b.bio != NULL) +#define ceph_msg_has_bio(m) ((m)->b.type == CEPH_MSG_DATA_BIO) #endif /* CONFIG_BLOCK */ -#define ceph_msg_has_trail(m) ((m)->t.trail != NULL) +#define ceph_msg_has_trail(m) ((m)->t.type == CEPH_MSG_DATA_PAGELIST) + +enum ceph_msg_data_type { + CEPH_MSG_DATA_NONE, /* message contains no data payload */ + CEPH_MSG_DATA_PAGES, /* data source/destination is a page array */ + CEPH_MSG_DATA_PAGELIST, /* data source/destination is a pagelist */ +#ifdef CONFIG_BLOCK + CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */ +#endif /* CONFIG_BLOCK */ +}; + +static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) +{ + switch (type) { + case CEPH_MSG_DATA_NONE: + case CEPH_MSG_DATA_PAGES: + case CEPH_MSG_DATA_PAGELIST: +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: +#endif /* CONFIG_BLOCK */ + return true; + default: + return false; + } +} + +struct ceph_msg_data { + enum ceph_msg_data_type type; + union { +#ifdef CONFIG_BLOCK + struct { + struct bio *bio_iter; /* iterator */ + struct bio *bio; + unsigned int bio_seg; /* current seg in bio */ + }; +#endif /* CONFIG_BLOCK */ + struct { + struct page **pages; /* NOT OWNER. 
*/ + size_t length; /* total # bytes */ + unsigned int alignment; /* first page */ + }; + struct ceph_pagelist *pagelist; + }; +}; /* * a single message. it contains a header (src, dest, message type, etc.), @@ -83,24 +126,12 @@ struct ceph_msg { struct ceph_buffer *middle; /* data payload */ - struct { - struct page **pages; /* NOT OWNER. */ - size_t length; /* # data bytes in array */ - unsigned int alignment; /* first page */ - } p; - struct { - struct ceph_pagelist *pagelist; - } l; + struct ceph_msg_data p; /* pages */ + struct ceph_msg_data l; /* pagelist */ #ifdef CONFIG_BLOCK - struct { - struct bio *bio_iter; /* iterator */ - struct bio *bio; - unsigned int bio_seg; /* current seg in bio */ - } b; + struct ceph_msg_data b; /* bio */ #endif /* CONFIG_BLOCK */ - struct { - struct ceph_pagelist *trail; /* trailing part of data */ - } t; + struct ceph_msg_data t; /* trail */ struct ceph_connection *con; struct list_head list_head; /* links for connection lists */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f485455f05a8..f256b4b174ad 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1054,7 +1054,7 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, msg_pos->did_page_crc = false; if (in_trail) { BUG_ON(!ceph_msg_has_trail(msg)); - list_rotate_left(&msg->t.trail->head); + list_rotate_left(&msg->t.pagelist->head); } else if (ceph_msg_has_pagelist(msg)) { list_rotate_left(&msg->l.pagelist->head); #ifdef CONFIG_BLOCK @@ -1120,7 +1120,7 @@ static int write_partial_message_data(struct ceph_connection *con) size_t trail_off = data_len; if (ceph_msg_has_trail(msg)) { - trail_len = msg->t.trail->length; + trail_len = msg->t.pagelist->length; trail_off -= trail_len; } @@ -1149,7 +1149,7 @@ static int write_partial_message_data(struct ceph_connection *con) if (in_trail) { BUG_ON(!ceph_msg_has_trail(msg)); total_max_write = data_len - msg_pos->data_pos; - page = list_first_entry(&msg->t.trail->head, + page = list_first_entry(&msg->t.pagelist->head, struct page, lru); } else if (ceph_msg_has_pages(msg)) { page = msg->p.pages[msg_pos->page]; @@ -2736,14 +2736,19 @@ void ceph_con_keepalive(struct ceph_connection *con) } EXPORT_SYMBOL(ceph_con_keepalive); +static void ceph_msg_data_init(struct ceph_msg_data *data) +{ + data->type = CEPH_MSG_DATA_NONE; +} + void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment) { BUG_ON(!pages); BUG_ON(!length); - BUG_ON(msg->p.pages); - BUG_ON(msg->p.length); + BUG_ON(msg->p.type != CEPH_MSG_DATA_NONE); + msg->p.type = CEPH_MSG_DATA_PAGES; msg->p.pages = pages; msg->p.length = length; msg->p.alignment = alignment & ~PAGE_MASK; @@ -2755,8 +2760,9 @@ void ceph_msg_data_set_pagelist(struct ceph_msg *msg, { BUG_ON(!pagelist); BUG_ON(!pagelist->length); - BUG_ON(msg->l.pagelist); + BUG_ON(msg->l.type != CEPH_MSG_DATA_NONE); + msg->l.type = CEPH_MSG_DATA_PAGELIST; msg->l.pagelist = pagelist; } EXPORT_SYMBOL(ceph_msg_data_set_pagelist); @@ -2764,8 +2770,9 @@ EXPORT_SYMBOL(ceph_msg_data_set_pagelist); void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio) { BUG_ON(!bio); - BUG_ON(msg->b.bio); + BUG_ON(msg->b.type != CEPH_MSG_DATA_NONE); + msg->b.type = CEPH_MSG_DATA_BIO; msg->b.bio = bio; } EXPORT_SYMBOL(ceph_msg_data_set_bio); @@ -2774,9 +2781,10 @@ void ceph_msg_data_set_trail(struct ceph_msg *msg, struct ceph_pagelist *trail) { BUG_ON(!trail); BUG_ON(!trail->length); - BUG_ON(msg->t.trail); + BUG_ON(msg->b.type != CEPH_MSG_DATA_NONE); - msg->t.trail = 
trail; + msg->t.type = CEPH_MSG_DATA_PAGELIST; + msg->t.pagelist = trail; } EXPORT_SYMBOL(ceph_msg_data_set_trail); @@ -2800,6 +2808,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, INIT_LIST_HEAD(&m->list_head); kref_init(&m->kref); + ceph_msg_data_init(&m->p); + ceph_msg_data_init(&m->l); + ceph_msg_data_init(&m->b); + ceph_msg_data_init(&m->t); + /* front */ m->front_max = front_len; if (front_len) { @@ -2965,7 +2978,7 @@ void ceph_msg_last_put(struct kref *kref) } if (ceph_msg_has_trail(m)) - m->t.trail = NULL; + m->t.pagelist = NULL; if (m->pool) ceph_msgpool_put(m->pool, m); -- cgit v1.2.3 From fe38a2b67bc6b3a60da82a23e9082256a30e39d9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Mar 2013 23:39:39 -0600 Subject: libceph: start defining message data cursor This patch lays out the foundation for using generic routines to manage processing items of message data. For simplicity, we'll start with just the trail portion of a message, because it stands alone and is only present for outgoing data. First some basic concepts. We'll use the term "data item" to represent one of the ceph_msg_data structures associated with a message. There are currently four of those, with single-letter field names p, l, b, and t. A data item is further broken into "pieces" which always lie in a single page. A data item will include a "cursor" that will track state as the memory defined by the item is consumed by sending data from or receiving data into it. We define three routines to manipulate a data item's cursor: the "init" routine; the "next" routine; and the "advance" routine. The "init" routine initializes the cursor so it points at the beginning of the first piece in the item. The "next" routine returns the page, page offset, and length (limited by both the page and item size) of the next unconsumed piece in the item. It also indicates to the caller whether the piece being returned is the last one in the data item. The "advance" routine consumes the requested number of bytes in the item (advancing the cursor). This is used to record the number of bytes from the current piece that were actually sent or received by the network code. It returns an indication of whether the result means the current piece has been fully consumed. This is used by the message send code to determine whether it should calculate the CRC for the next piece processed. The trail of a message is implemented as a ceph pagelist. The routines defined for it will be usable for non-trail pagelist data as well. 
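Taken together, the three routines are meant to compose into a loop of this general shape (an illustrative sketch only; "remaining" and "need_crc" are hypothetical bookkeeping, and in this patch only the trail is actually driven through a cursor):

	ceph_msg_data_cursor_init(data);
	while (remaining > 0) {
		page = ceph_msg_data_next(data, &page_offset, &length,
					  &last_piece);
		ret = ceph_tcp_sendpage(con->sock, page, page_offset,
					length, last_piece);
		if (ret <= 0)
			break;		/* socket full, or an error */
		remaining -= ret;
		/* true means the piece was fully consumed, so the CRC
		 * for the next piece processed must be computed afresh */
		need_crc = ceph_msg_data_advance(data, (size_t) ret);
	}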
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 7 +++ net/ceph/messenger.c | 138 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 135 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 5860dd0c2caf..14862438faff 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -95,6 +95,12 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) } } +struct ceph_msg_data_cursor { + bool last_piece; /* now at last piece of data item */ + struct page *page; /* current page in pagelist */ + size_t offset; /* pagelist bytes consumed */ +}; + struct ceph_msg_data { enum ceph_msg_data_type type; union { @@ -112,6 +118,7 @@ struct ceph_msg_data { }; struct ceph_pagelist *pagelist; }; + struct ceph_msg_data_cursor cursor; /* pagelist only */ }; /* diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f256b4b174ad..b978cf8b27ff 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -21,6 +21,9 @@ #include #include +#define list_entry_next(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + /* * Ceph uses the messenger to exchange ceph_msg messages with other * hosts in the system. The messenger provides ordered and reliable @@ -738,6 +741,109 @@ static void iter_bio_next(struct bio **bio_iter, unsigned int *seg) } #endif +/* + * Message data is handled (sent or received) in pieces, where each + * piece resides on a single page. The network layer might not + * consume an entire piece at once. A data item's cursor keeps + * track of which piece is next to process and how much remains to + * be processed in that piece. It also tracks whether the current + * piece is the last one in the data item. + */ +static void ceph_msg_data_cursor_init(struct ceph_msg_data *data) +{ + struct ceph_msg_data_cursor *cursor = &data->cursor; + struct ceph_pagelist *pagelist; + struct page *page; + + if (data->type != CEPH_MSG_DATA_PAGELIST) + return; + + pagelist = data->pagelist; + BUG_ON(!pagelist); + if (!pagelist->length) + return; /* pagelist can be assigned but empty */ + + BUG_ON(list_empty(&pagelist->head)); + page = list_first_entry(&pagelist->head, struct page, lru); + + cursor->page = page; + cursor->offset = 0; + cursor->last_piece = pagelist->length <= PAGE_SIZE; +} + +/* + * Return the page containing the next piece to process for a given + * data item, and supply the page offset and length of that piece. + * Indicate whether this is the last piece in this data item. + */ +static struct page *ceph_msg_data_next(struct ceph_msg_data *data, + size_t *page_offset, + size_t *length, + bool *last_piece) +{ + struct ceph_msg_data_cursor *cursor = &data->cursor; + struct ceph_pagelist *pagelist; + size_t piece_end; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + + pagelist = data->pagelist; + BUG_ON(!pagelist); + + BUG_ON(!cursor->page); + BUG_ON(cursor->offset >= pagelist->length); + + *last_piece = cursor->last_piece; + if (*last_piece) { + /* pagelist offset is always 0 */ + piece_end = pagelist->length & ~PAGE_MASK; + if (!piece_end) + piece_end = PAGE_SIZE; + } else { + piece_end = PAGE_SIZE; + } + *page_offset = cursor->offset & ~PAGE_MASK; + *length = piece_end - *page_offset; + + return data->cursor.page; +} + +/* + * Returns true if the result moves the cursor on to the next piece + * (the next page) of the pagelist. 
+ */ +static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) +{ + struct ceph_msg_data_cursor *cursor = &data->cursor; + struct ceph_pagelist *pagelist; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + + pagelist = data->pagelist; + BUG_ON(!pagelist); + BUG_ON(!cursor->page); + BUG_ON(cursor->offset + bytes > pagelist->length); + BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE); + + /* Advance the cursor offset */ + + cursor->offset += bytes; + /* pagelist offset is always 0 */ + if (!bytes || cursor->offset & ~PAGE_MASK) + return false; /* more bytes to process in the current page */ + + /* Move on to the next page */ + + BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); + cursor->page = list_entry_next(cursor->page, lru); + + /* cursor offset is at page boundary; pagelist offset is always 0 */ + if (pagelist->length - cursor->offset <= PAGE_SIZE) + cursor->last_piece = true; + + return true; +} + static void prepare_message_data(struct ceph_msg *msg, struct ceph_msg_pos *msg_pos) { @@ -755,6 +861,12 @@ static void prepare_message_data(struct ceph_msg *msg, init_bio_iter(msg->b.bio, &msg->b.bio_iter, &msg->b.bio_seg); #endif msg_pos->data_pos = 0; + + /* If there's a trail, initialize its cursor */ + + if (ceph_msg_has_trail(msg)) + ceph_msg_data_cursor_init(&msg->t); + msg_pos->did_page_crc = false; } @@ -1045,6 +1157,12 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, msg_pos->data_pos += sent; msg_pos->page_pos += sent; + if (in_trail) { + bool need_crc; + + need_crc = ceph_msg_data_advance(&msg->t, sent); + BUG_ON(need_crc && sent != len); + } if (sent < len) return; @@ -1052,10 +1170,7 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, msg_pos->page_pos = 0; msg_pos->page++; msg_pos->did_page_crc = false; - if (in_trail) { - BUG_ON(!ceph_msg_has_trail(msg)); - list_rotate_left(&msg->t.pagelist->head); - } else if (ceph_msg_has_pagelist(msg)) { + if (ceph_msg_has_pagelist(msg)) { list_rotate_left(&msg->l.pagelist->head); #ifdef CONFIG_BLOCK } else if (ceph_msg_has_bio(msg)) { @@ -1141,6 +1256,8 @@ static int write_partial_message_data(struct ceph_connection *con) size_t length; int max_write = PAGE_SIZE; int bio_offset = 0; + bool use_cursor = false; + bool last_piece = true; /* preserve existing behavior */ in_trail = in_trail || msg_pos->data_pos >= trail_off; if (!in_trail) @@ -1148,9 +1265,9 @@ static int write_partial_message_data(struct ceph_connection *con) if (in_trail) { BUG_ON(!ceph_msg_has_trail(msg)); - total_max_write = data_len - msg_pos->data_pos; - page = list_first_entry(&msg->t.pagelist->head, - struct page, lru); + use_cursor = true; + page = ceph_msg_data_next(&msg->t, &page_offset, + &length, &last_piece); } else if (ceph_msg_has_pages(msg)) { page = msg->p.pages[msg_pos->page]; } else if (ceph_msg_has_pagelist(msg)) { @@ -1168,8 +1285,9 @@ static int write_partial_message_data(struct ceph_connection *con) } else { page = zero_page; } - length = min_t(int, max_write - msg_pos->page_pos, - total_max_write); + if (!use_cursor) + length = min_t(int, max_write - msg_pos->page_pos, + total_max_write); page_offset = msg_pos->page_pos + bio_offset; if (do_datacrc && !msg_pos->did_page_crc) { @@ -1180,7 +1298,7 @@ static int write_partial_message_data(struct ceph_connection *con) msg_pos->did_page_crc = true; } ret = ceph_tcp_sendpage(con->sock, page, page_offset, - length, true); + length, last_piece); if (ret <= 0) goto out; -- cgit v1.2.3 From 
dd236fcb65d7b6b80c408cb5f66aab55f4594284 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Mar 2013 23:39:39 -0600 Subject: libceph: prepare for other message data item types This just inserts some infrastructure in preparation for handling other types of ceph message data items. No functional changes, just trying to simplify review by separating out some noise. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 8 ++- net/ceph/messenger.c | 117 ++++++++++++++++++++++++++++++++--------- 2 files changed, 99 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 14862438faff..716c3fdeb257 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -97,8 +97,12 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) struct ceph_msg_data_cursor { bool last_piece; /* now at last piece of data item */ - struct page *page; /* current page in pagelist */ - size_t offset; /* pagelist bytes consumed */ + union { + struct { /* pagelist */ + struct page *page; /* page from list */ + size_t offset; /* bytes from list */ + }; + }; }; struct ceph_msg_data { diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b978cf8b27ff..4cc27a136e35 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -742,21 +742,16 @@ static void iter_bio_next(struct bio **bio_iter, unsigned int *seg) #endif /* - * Message data is handled (sent or received) in pieces, where each - * piece resides on a single page. The network layer might not - * consume an entire piece at once. A data item's cursor keeps - * track of which piece is next to process and how much remains to - * be processed in that piece. It also tracks whether the current - * piece is the last one in the data item. + * For a pagelist, a piece is whatever remains to be consumed in the + * first page in the list, or the front of the next page. */ -static void ceph_msg_data_cursor_init(struct ceph_msg_data *data) +static void ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data *data) { struct ceph_msg_data_cursor *cursor = &data->cursor; struct ceph_pagelist *pagelist; struct page *page; - if (data->type != CEPH_MSG_DATA_PAGELIST) - return; + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); pagelist = data->pagelist; BUG_ON(!pagelist); @@ -771,15 +766,9 @@ static void ceph_msg_data_cursor_init(struct ceph_msg_data *data) cursor->last_piece = pagelist->length <= PAGE_SIZE; } -/* - * Return the page containing the next piece to process for a given - * data item, and supply the page offset and length of that piece. - * Indicate whether this is the last piece in this data item. 
- */ -static struct page *ceph_msg_data_next(struct ceph_msg_data *data, +static struct page *ceph_msg_data_pagelist_next(struct ceph_msg_data *data, size_t *page_offset, - size_t *length, - bool *last_piece) + size_t *length) { struct ceph_msg_data_cursor *cursor = &data->cursor; struct ceph_pagelist *pagelist; @@ -793,8 +782,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data *data, BUG_ON(!cursor->page); BUG_ON(cursor->offset >= pagelist->length); - *last_piece = cursor->last_piece; - if (*last_piece) { + if (cursor->last_piece) { /* pagelist offset is always 0 */ piece_end = pagelist->length & ~PAGE_MASK; if (!piece_end) @@ -808,11 +796,8 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data *data, return data->cursor.page; } -/* - * Returns true if the result moves the cursor on to the next piece - * (the next page) of the pagelist. - */ -static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) +static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data *data, + size_t bytes) { struct ceph_msg_data_cursor *cursor = &data->cursor; struct ceph_pagelist *pagelist; @@ -844,6 +829,90 @@ static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) return true; } +/* + * Message data is handled (sent or received) in pieces, where each + * piece resides on a single page. The network layer might not + * consume an entire piece at once. A data item's cursor keeps + * track of which piece is next to process and how much remains to + * be processed in that piece. It also tracks whether the current + * piece is the last one in the data item. + */ +static void ceph_msg_data_cursor_init(struct ceph_msg_data *data) +{ + switch (data->type) { + case CEPH_MSG_DATA_PAGELIST: + ceph_msg_data_pagelist_cursor_init(data); + break; + case CEPH_MSG_DATA_NONE: + case CEPH_MSG_DATA_PAGES: +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: +#endif /* CONFIG_BLOCK */ + default: + /* BUG(); */ + break; + } +} + +/* + * Return the page containing the next piece to process for a given + * data item, and supply the page offset and length of that piece. + * Indicate whether this is the last piece in this data item. + */ +static struct page *ceph_msg_data_next(struct ceph_msg_data *data, + size_t *page_offset, + size_t *length, + bool *last_piece) +{ + struct page *page; + + switch (data->type) { + case CEPH_MSG_DATA_PAGELIST: + page = ceph_msg_data_pagelist_next(data, page_offset, length); + break; + case CEPH_MSG_DATA_NONE: + case CEPH_MSG_DATA_PAGES: +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: +#endif /* CONFIG_BLOCK */ + default: + page = NULL; + break; + } + BUG_ON(!page); + BUG_ON(*page_offset + *length > PAGE_SIZE); + BUG_ON(!*length); + if (last_piece) + *last_piece = data->cursor.last_piece; + + return page; +} + +/* + * Returns true if the result moves the cursor on to the next piece + * of the data item. 
+ */ +static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) +{ + bool new_piece; + + switch (data->type) { + case CEPH_MSG_DATA_PAGELIST: + new_piece = ceph_msg_data_pagelist_advance(data, bytes); + break; + case CEPH_MSG_DATA_NONE: + case CEPH_MSG_DATA_PAGES: +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: +#endif /* CONFIG_BLOCK */ + default: + BUG(); + break; + } + + return new_piece; +} + static void prepare_message_data(struct ceph_msg *msg, struct ceph_msg_pos *msg_pos) { -- cgit v1.2.3 From 7fe1e5e57b84eab98ff352519aa66e86dac5bf61 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Mar 2013 23:39:39 -0600 Subject: libceph: use data cursor for message pagelist Switch to using the message cursor for the (non-trail) outgoing pagelist data item in a message if present. Notes on the logic changes in out_msg_pos_next(): - only the mds client uses a ceph pagelist for message data; - if the mds client ever uses a pagelist, it never uses a page array (or anything else, for that matter) for data in the same message; - only the osd client uses the trail portion of a message's data, and when it does, it never uses any other data fields for outgoing data in the same message; and finally - only the rbd client uses bio message data (never pagelist). Therefore out_msg_pos_next() can assume: - if we're in the trail portion of a message, the message data pagelist, pages, and bio can be ignored; and - if there is a page list, there will never be any bio or page array data, and vice-versa. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4cc27a136e35..30c8792be180 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -931,8 +931,10 @@ static void prepare_message_data(struct ceph_msg *msg, #endif msg_pos->data_pos = 0; - /* If there's a trail, initialize its cursor */ + /* Initialize data cursors */ + if (ceph_msg_has_pagelist(msg)) + ceph_msg_data_cursor_init(&msg->l); if (ceph_msg_has_trail(msg)) ceph_msg_data_cursor_init(&msg->t); @@ -1220,18 +1222,19 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, { struct ceph_msg *msg = con->out_msg; struct ceph_msg_pos *msg_pos = &con->out_msg_pos; + bool need_crc = false; BUG_ON(!msg); BUG_ON(!sent); msg_pos->data_pos += sent; msg_pos->page_pos += sent; - if (in_trail) { - bool need_crc; - + if (in_trail) need_crc = ceph_msg_data_advance(&msg->t, sent); - BUG_ON(need_crc && sent != len); - } + else if (ceph_msg_has_pagelist(msg)) + need_crc = ceph_msg_data_advance(&msg->l, sent); + BUG_ON(need_crc && sent != len); + if (sent < len) return; @@ -1239,13 +1242,10 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, msg_pos->page_pos = 0; msg_pos->page++; msg_pos->did_page_crc = false; - if (ceph_msg_has_pagelist(msg)) { - list_rotate_left(&msg->l.pagelist->head); #ifdef CONFIG_BLOCK - } else if (ceph_msg_has_bio(msg)) { + if (ceph_msg_has_bio(msg)) iter_bio_next(&msg->b.bio_iter, &msg->b.bio_seg); #endif - } } static void in_msg_pos_next(struct ceph_connection *con, size_t len, @@ -1340,8 +1340,9 @@ static int write_partial_message_data(struct ceph_connection *con) } else if (ceph_msg_has_pages(msg)) { page = msg->p.pages[msg_pos->page]; } else if (ceph_msg_has_pagelist(msg)) { - page = list_first_entry(&msg->l.pagelist->head, - struct page, lru); + use_cursor = true; + page = 
ceph_msg_data_next(&msg->l, &page_offset, + &length, &last_piece); #ifdef CONFIG_BLOCK } else if (ceph_msg_has_bio(msg)) { struct bio_vec *bv; -- cgit v1.2.3 From 6aaa4511deb4b0fd776d1153dc63a89cdc024fb8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Mar 2013 23:39:39 -0600 Subject: libceph: implement bio message data item cursor Implement and use cursor routines for bio message data items for outbound message data. (See the previous commit for reasoning in support of the changes in out_msg_pos_next().) Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 7 +++ net/ceph/messenger.c | 137 ++++++++++++++++++++++++++++++++++------- 2 files changed, 123 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 716c3fdeb257..76b4645e2dff 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -98,6 +98,13 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) struct ceph_msg_data_cursor { bool last_piece; /* now at last piece of data item */ union { +#ifdef CONFIG_BLOCK + struct { /* bio */ + struct bio *bio; /* bio from list */ + unsigned int vector_index; /* vector from bio */ + unsigned int vector_offset; /* bytes from vector */ + }; +#endif /* CONFIG_BLOCK */ struct { /* pagelist */ struct page *page; /* page from list */ size_t offset; /* bytes from list */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 30c8792be180..209990a853e5 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -739,6 +739,95 @@ static void iter_bio_next(struct bio **bio_iter, unsigned int *seg) if (*seg == (*bio_iter)->bi_vcnt) init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); } + +/* + * For a bio data item, a piece is whatever remains of the next + * entry in the current bio iovec, or the first entry in the next + * bio in the list. 
+ */ +static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data *data) +{ + struct ceph_msg_data_cursor *cursor = &data->cursor; + struct bio *bio; + + BUG_ON(data->type != CEPH_MSG_DATA_BIO); + + bio = data->bio; + BUG_ON(!bio); + BUG_ON(!bio->bi_vcnt); + /* resid = bio->bi_size */ + + cursor->bio = bio; + cursor->vector_index = 0; + cursor->vector_offset = 0; + cursor->last_piece = !bio->bi_next && bio->bi_vcnt == 1; +} + +static struct page *ceph_msg_data_bio_next(struct ceph_msg_data *data, + size_t *page_offset, + size_t *length) +{ + struct ceph_msg_data_cursor *cursor = &data->cursor; + struct bio *bio; + struct bio_vec *bio_vec; + unsigned int index; + + BUG_ON(data->type != CEPH_MSG_DATA_BIO); + + bio = cursor->bio; + BUG_ON(!bio); + + index = cursor->vector_index; + BUG_ON(index >= (unsigned int) bio->bi_vcnt); + + bio_vec = &bio->bi_io_vec[index]; + BUG_ON(cursor->vector_offset >= bio_vec->bv_len); + *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset); + BUG_ON(*page_offset >= PAGE_SIZE); + *length = (size_t) (bio_vec->bv_len - cursor->vector_offset); + BUG_ON(*length > PAGE_SIZE); + + return bio_vec->bv_page; +} + +static bool ceph_msg_data_bio_advance(struct ceph_msg_data *data, size_t bytes) +{ + struct ceph_msg_data_cursor *cursor = &data->cursor; + struct bio *bio; + struct bio_vec *bio_vec; + unsigned int index; + + BUG_ON(data->type != CEPH_MSG_DATA_BIO); + + bio = cursor->bio; + BUG_ON(!bio); + + index = cursor->vector_index; + BUG_ON(index >= (unsigned int) bio->bi_vcnt); + bio_vec = &bio->bi_io_vec[index]; + BUG_ON(cursor->vector_offset + bytes > bio_vec->bv_len); + + /* Advance the cursor offset */ + + cursor->vector_offset += bytes; + if (cursor->vector_offset < bio_vec->bv_len) + return false; /* more bytes to process in this segment */ + + /* Move on to the next segment, and possibly the next bio */ + + if (++cursor->vector_index == (unsigned int) bio->bi_vcnt) { + bio = bio->bi_next; + cursor->bio = bio; + cursor->vector_index = 0; + } + cursor->vector_offset = 0; + + if (!cursor->last_piece && bio && !bio->bi_next) + if (cursor->vector_index == (unsigned int) bio->bi_vcnt - 1) + cursor->last_piece = true; + + return true; +} #endif /* @@ -843,11 +932,13 @@ static void ceph_msg_data_cursor_init(struct ceph_msg_data *data) case CEPH_MSG_DATA_PAGELIST: ceph_msg_data_pagelist_cursor_init(data); break; - case CEPH_MSG_DATA_NONE: - case CEPH_MSG_DATA_PAGES: #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: + ceph_msg_data_bio_cursor_init(data); + break; #endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + case CEPH_MSG_DATA_PAGES: default: /* BUG(); */ break; @@ -870,11 +961,13 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data *data, case CEPH_MSG_DATA_PAGELIST: page = ceph_msg_data_pagelist_next(data, page_offset, length); break; - case CEPH_MSG_DATA_NONE: - case CEPH_MSG_DATA_PAGES: #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: + page = ceph_msg_data_bio_next(data, page_offset, length); + break; #endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + case CEPH_MSG_DATA_PAGES: default: page = NULL; break; @@ -900,11 +993,13 @@ static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) case CEPH_MSG_DATA_PAGELIST: new_piece = ceph_msg_data_pagelist_advance(data, bytes); break; - case CEPH_MSG_DATA_NONE: - case CEPH_MSG_DATA_PAGES: #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: + new_piece = ceph_msg_data_bio_advance(data, bytes); + break; #endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + case CEPH_MSG_DATA_PAGES: default: 
BUG(); break; @@ -933,6 +1028,10 @@ static void prepare_message_data(struct ceph_msg *msg, /* Initialize data cursors */ +#ifdef CONFIG_BLOCK + if (ceph_msg_has_bio(msg)) + ceph_msg_data_cursor_init(&msg->b); +#endif /* CONFIG_BLOCK */ if (ceph_msg_has_pagelist(msg)) ceph_msg_data_cursor_init(&msg->l); if (ceph_msg_has_trail(msg)) @@ -1233,6 +1332,10 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, need_crc = ceph_msg_data_advance(&msg->t, sent); else if (ceph_msg_has_pagelist(msg)) need_crc = ceph_msg_data_advance(&msg->l, sent); +#ifdef CONFIG_BLOCK + else if (ceph_msg_has_bio(msg)) + need_crc = ceph_msg_data_advance(&msg->b, sent); +#endif /* CONFIG_BLOCK */ BUG_ON(need_crc && sent != len); if (sent < len) @@ -1242,10 +1345,6 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, msg_pos->page_pos = 0; msg_pos->page++; msg_pos->did_page_crc = false; -#ifdef CONFIG_BLOCK - if (ceph_msg_has_bio(msg)) - iter_bio_next(&msg->b.bio_iter, &msg->b.bio_seg); -#endif } static void in_msg_pos_next(struct ceph_connection *con, size_t len, @@ -1323,8 +1422,6 @@ static int write_partial_message_data(struct ceph_connection *con) struct page *page = NULL; size_t page_offset; size_t length; - int max_write = PAGE_SIZE; - int bio_offset = 0; bool use_cursor = false; bool last_piece = true; /* preserve existing behavior */ @@ -1345,21 +1442,19 @@ static int write_partial_message_data(struct ceph_connection *con) &length, &last_piece); #ifdef CONFIG_BLOCK } else if (ceph_msg_has_bio(msg)) { - struct bio_vec *bv; - - bv = bio_iovec_idx(msg->b.bio_iter, msg->b.bio_seg); - page = bv->bv_page; - bio_offset = bv->bv_offset; - max_write = bv->bv_len; + use_cursor = true; + page = ceph_msg_data_next(&msg->b, &page_offset, + &length, &last_piece); #endif } else { page = zero_page; } - if (!use_cursor) - length = min_t(int, max_write - msg_pos->page_pos, + if (!use_cursor) { + length = min_t(int, PAGE_SIZE - msg_pos->page_pos, total_max_write); - page_offset = msg_pos->page_pos + bio_offset; + page_offset = msg_pos->page_pos; + } if (do_datacrc && !msg_pos->did_page_crc) { u32 crc = le32_to_cpu(msg->footer.data_crc); -- cgit v1.2.3 From e766d7b55e10f93c7bab298135a4e90dcc46620d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 7 Mar 2013 15:38:28 -0600 Subject: libceph: implement pages array cursor Implement and use cursor routines for page array message data items for outbound message data. 
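The arithmetic behind the page array cursor can be shown standalone. In this sketch, calc_pages_for() is modeled on the libceph helper of the same name (the number of pages spanned by an aligned byte range); the alignment and length values are made up for illustration.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Number of pages spanned by len bytes starting at offset align. */
static unsigned long calc_pages_for(unsigned long align, unsigned long len)
{
	return ((align + len + PAGE_SIZE - 1) / PAGE_SIZE) -
	       (align / PAGE_SIZE);
}

int main(void)
{
	unsigned long alignment = 512;	/* data starts 512 bytes into page 0 */
	unsigned long length = 10000;
	unsigned long page_count = calc_pages_for(alignment, length);
	unsigned long page_offset = alignment & ~PAGE_MASK;
	unsigned long page_index = 0;
	unsigned long resid = length;

	printf("%lu bytes at alignment %lu span %lu pages\n",
	       length, alignment, page_count);
	while (resid) {
		/* the last piece ends at resid; others at the page boundary */
		unsigned long piece = page_index == page_count - 1 ?
				resid : PAGE_SIZE - page_offset;

		printf("page %lu: offset %lu, %lu bytes%s\n",
		       page_index, page_offset, piece,
		       page_index == page_count - 1 ? " (last piece)" : "");
		resid -= piece;
		page_offset = 0;
		page_index++;
	}
	return 0;
}

Only the first piece can start at a non-zero page offset; every later piece begins at the front of its page, which is why the cursor needs just resid, page_offset, and page_index to describe its position.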
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 6 +++ net/ceph/messenger.c | 93 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 95 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 76b4645e2dff..b53b9ef65009 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -105,6 +105,12 @@ struct ceph_msg_data_cursor { unsigned int vector_offset; /* bytes from vector */ }; #endif /* CONFIG_BLOCK */ + struct { /* pages */ + size_t resid; /* bytes from array */ + unsigned int page_offset; /* offset in page */ + unsigned short page_index; /* index in array */ + unsigned short page_count; /* pages in array */ + }; struct { /* pagelist */ struct page *page; /* page from list */ size_t offset; /* bytes from list */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 209990a853e5..d611156808b3 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -830,6 +830,79 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data *data, size_t bytes) } #endif +/* + * For a page array, a piece comes from the first page in the array + * that has not already been fully consumed. + */ +static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data *data) +{ + struct ceph_msg_data_cursor *cursor = &data->cursor; + int page_count; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGES); + + BUG_ON(!data->pages); + BUG_ON(!data->length); + + page_count = calc_pages_for(data->alignment, (u64)data->length); + BUG_ON(page_count > (int) USHRT_MAX); + cursor->resid = data->length; + cursor->page_offset = data->alignment & ~PAGE_MASK; + cursor->page_index = 0; + cursor->page_count = (unsigned short) page_count; + cursor->last_piece = cursor->page_count == 1; +} + +static struct page *ceph_msg_data_pages_next(struct ceph_msg_data *data, + size_t *page_offset, + size_t *length) +{ + struct ceph_msg_data_cursor *cursor = &data->cursor; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGES); + + BUG_ON(cursor->page_index >= cursor->page_count); + BUG_ON(cursor->page_offset >= PAGE_SIZE); + BUG_ON(!cursor->resid); + + *page_offset = cursor->page_offset; + if (cursor->last_piece) { + BUG_ON(*page_offset + cursor->resid > PAGE_SIZE); + *length = cursor->resid; + } else { + *length = PAGE_SIZE - *page_offset; + } + + return data->pages[cursor->page_index]; +} + +static bool ceph_msg_data_pages_advance(struct ceph_msg_data *data, + size_t bytes) +{ + struct ceph_msg_data_cursor *cursor = &data->cursor; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGES); + + BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); + BUG_ON(bytes > cursor->resid); + + /* Advance the cursor page offset */ + + cursor->resid -= bytes; + cursor->page_offset += bytes; + if (!bytes || cursor->page_offset & ~PAGE_MASK) + return false; /* more bytes to process in the current page */ + + /* Move on to the next page */ + + BUG_ON(cursor->page_index >= cursor->page_count); + cursor->page_offset = 0; + cursor->page_index++; + cursor->last_piece = cursor->page_index == cursor->page_count - 1; + + return true; +} + /* * For a pagelist, a piece is whatever remains to be consumed in the * first page in the list, or the front of the next page. 
@@ -932,13 +1005,15 @@ static void ceph_msg_data_cursor_init(struct ceph_msg_data *data) case CEPH_MSG_DATA_PAGELIST: ceph_msg_data_pagelist_cursor_init(data); break; + case CEPH_MSG_DATA_PAGES: + ceph_msg_data_pages_cursor_init(data); + break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: ceph_msg_data_bio_cursor_init(data); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_NONE: - case CEPH_MSG_DATA_PAGES: default: /* BUG(); */ break; @@ -961,13 +1036,15 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data *data, case CEPH_MSG_DATA_PAGELIST: page = ceph_msg_data_pagelist_next(data, page_offset, length); break; + case CEPH_MSG_DATA_PAGES: + page = ceph_msg_data_pages_next(data, page_offset, length); + break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: page = ceph_msg_data_bio_next(data, page_offset, length); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_NONE: - case CEPH_MSG_DATA_PAGES: default: page = NULL; break; @@ -993,13 +1070,15 @@ static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) case CEPH_MSG_DATA_PAGELIST: new_piece = ceph_msg_data_pagelist_advance(data, bytes); break; + case CEPH_MSG_DATA_PAGES: + new_piece = ceph_msg_data_pages_advance(data, bytes); + break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: new_piece = ceph_msg_data_bio_advance(data, bytes); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_NONE: - case CEPH_MSG_DATA_PAGES: default: BUG(); break; @@ -1032,6 +1111,8 @@ static void prepare_message_data(struct ceph_msg *msg, if (ceph_msg_has_bio(msg)) ceph_msg_data_cursor_init(&msg->b); #endif /* CONFIG_BLOCK */ + if (ceph_msg_has_pages(msg)) + ceph_msg_data_cursor_init(&msg->p); if (ceph_msg_has_pagelist(msg)) ceph_msg_data_cursor_init(&msg->l); if (ceph_msg_has_trail(msg)) @@ -1330,6 +1411,8 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, msg_pos->page_pos += sent; if (in_trail) need_crc = ceph_msg_data_advance(&msg->t, sent); + else if (ceph_msg_has_pages(msg)) + need_crc = ceph_msg_data_advance(&msg->p, sent); else if (ceph_msg_has_pagelist(msg)) need_crc = ceph_msg_data_advance(&msg->l, sent); #ifdef CONFIG_BLOCK @@ -1435,7 +1518,9 @@ static int write_partial_message_data(struct ceph_connection *con) page = ceph_msg_data_next(&msg->t, &page_offset, &length, &last_piece); } else if (ceph_msg_has_pages(msg)) { - page = msg->p.pages[msg_pos->page]; + use_cursor = true; + page = ceph_msg_data_next(&msg->p, &page_offset, + &length, &last_piece); } else if (ceph_msg_has_pagelist(msg)) { use_cursor = true; page = ceph_msg_data_next(&msg->l, &page_offset, -- cgit v1.2.3 From 175face2ba31025b0dcd6da4e711fca7764287fa Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 13:35:36 -0600 Subject: libceph: let osd ops determine request data length The length of outgoing data in an osd request is dependent on the osd ops that are embedded in that request. Each op is encoded into a request message using osd_req_encode_op(), so that should be used to determine the amount of outgoing data implied by the op as it is encoded. Have osd_req_encode_op() return the number of bytes of outgoing data implied by the op being encoded, and accumulate and use that in ceph_osdc_build_request(). As a result, ceph_osdc_build_request() no longer requires its "len" parameter, so get rid of it. 
Using the sum of the op lengths rather than the length provided is a valid change because: - The only callers of ceph_osdc_build_request() are rbd and the osd client (in ceph_osdc_new_request() on behalf of the file system). - When rbd calls it, the length provided is only non-zero for write requests, and in that case the single op has the same length value as what was passed here. - When called from ceph_osdc_new_request(), (it's not all that easy to see, but) the length passed is also always the same as the extent length encoded in its (single) write op if present. This resolves: http://tracker.ceph.com/issues/4406 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- include/linux/ceph/osd_client.h | 3 +-- net/ceph/osd_client.c | 33 +++++++++++++++++++-------------- 3 files changed, 21 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 04cd5fdfc8f3..dea4401c4f77 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1462,7 +1462,7 @@ static struct ceph_osd_request *rbd_osd_req_create( /* osd_req will get its own reference to snapc (if non-null) */ - ceph_osdc_build_request(osd_req, offset, length, 1, op, + ceph_osdc_build_request(osd_req, offset, 1, op, snapc, snap_id, mtime); return osd_req; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index a8016dfbfdba..bcf3f72ec3f8 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -249,8 +249,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * bool use_mempool, gfp_t gfp_flags); -extern void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 len, +extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, unsigned int num_op, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 37d89614a61b..ce34faaa453f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -222,10 +222,13 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_alloc_request); -static void osd_req_encode_op(struct ceph_osd_request *req, +static u64 osd_req_encode_op(struct ceph_osd_request *req, struct ceph_osd_op *dst, struct ceph_osd_req_op *src) { + u64 out_data_len = 0; + u64 tmp; + dst->op = cpu_to_le16(src->op); switch (src->op) { @@ -233,10 +236,10 @@ static void osd_req_encode_op(struct ceph_osd_request *req, break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: - dst->extent.offset = - cpu_to_le64(src->extent.offset); - dst->extent.length = - cpu_to_le64(src->extent.length); + if (src->op == CEPH_OSD_OP_WRITE) + out_data_len = src->extent.length; + dst->extent.offset = cpu_to_le64(src->extent.offset); + dst->extent.length = cpu_to_le64(src->extent.length); dst->extent.truncate_size = cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = @@ -247,12 +250,14 @@ static void osd_req_encode_op(struct ceph_osd_request *req, dst->cls.method_len = src->cls.method_len; dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); + tmp = req->r_trail.length; ceph_pagelist_append(&req->r_trail, src->cls.class_name, src->cls.class_len); ceph_pagelist_append(&req->r_trail, src->cls.method_name, src->cls.method_len); ceph_pagelist_append(&req->r_trail, src->cls.indata, src->cls.indata_len); + out_data_len = req->r_trail.length - tmp; break; case CEPH_OSD_OP_STARTSYNC: break; @@ -326,6 +331,8 @@ static void
osd_req_encode_op(struct ceph_osd_request *req, break; } dst->payload_len = cpu_to_le32(src->payload_len); + + return out_data_len; } /* @@ -333,7 +340,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, * */ void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 len, unsigned int num_ops, + u64 off, unsigned int num_ops, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, struct timespec *mtime) @@ -385,12 +392,13 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); p += req->r_oid_len; - /* ops */ + /* ops--can imply data */ ceph_encode_16(&p, num_ops); src_op = src_ops; req->r_request_ops = p; + data_len = 0; for (i = 0; i < num_ops; i++, src_op++) { - osd_req_encode_op(req, p, src_op); + data_len += osd_req_encode_op(req, p, src_op); p += sizeof(struct ceph_osd_op); } @@ -407,11 +415,9 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, req->r_request_attempts = p; p += 4; - data_len = req->r_trail.length; - if (flags & CEPH_OSD_FLAG_WRITE) { + /* data */ + if (flags & CEPH_OSD_FLAG_WRITE) req->r_request->hdr.data_off = cpu_to_le16(off); - data_len += len; - } req->r_request->hdr.data_len = cpu_to_le32(data_len); BUG_ON(p > msg->front.iov_base + msg->front.iov_len); @@ -477,13 +483,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, ceph_osdc_put_request(req); return ERR_PTR(r); } - req->r_file_layout = *layout; /* keep a copy */ snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); - ceph_osdc_build_request(req, off, *plen, num_op, ops, + ceph_osdc_build_request(req, off, num_op, ops, snapc, vino.snap, mtime); return req; -- cgit v1.2.3 From 9a5e6d09ddd0cd68ce64c3aa54095e4a0e85b089 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 13:35:36 -0600 Subject: libceph: have osd requests support pagelist data Add support for recording a ceph pagelist as data associated with an osd request. 
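The pattern being extended here is a tagged union: one descriptor records whichever kind of outbound data a request carries, and message setup dispatches on the tag. A compact userspace analogue (the names below are illustrative stand-ins, not the kernel definitions):

#include <stddef.h>
#include <stdio.h>

enum osd_data_type {
	OSD_DATA_NONE,
	OSD_DATA_PAGES,
	OSD_DATA_PAGELIST,
};

struct pagelist {
	size_t length;		/* total bytes queued on the list */
};

struct osd_data {
	enum osd_data_type type;
	union {
		struct {			/* page array */
			void **pages;
			size_t length;
		};
		struct pagelist *pagelist;	/* pagelist */
	};
};

/* Attach request data to an outgoing message based on its type. */
static void msg_data_set(const struct osd_data *data)
{
	switch (data->type) {
	case OSD_DATA_PAGES:
		printf("pages: %zu bytes\n", data->length);
		break;
	case OSD_DATA_PAGELIST:
		printf("pagelist: %zu bytes\n", data->pagelist->length);
		break;
	case OSD_DATA_NONE:
		break;
	}
}

int main(void)
{
	struct pagelist pl = { .length = 128 };
	struct osd_data out = { .type = OSD_DATA_PAGELIST, .pagelist = &pl };

	msg_data_set(&out);
	return 0;
}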
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 4 +++- net/ceph/osd_client.c | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index bcf3f72ec3f8..cf0ba93426da 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -53,6 +53,7 @@ struct ceph_osd { enum ceph_osd_data_type { CEPH_OSD_DATA_TYPE_NONE, CEPH_OSD_DATA_TYPE_PAGES, + CEPH_OSD_DATA_TYPE_PAGELIST, #ifdef CONFIG_BLOCK CEPH_OSD_DATA_TYPE_BIO, #endif /* CONFIG_BLOCK */ @@ -68,8 +69,9 @@ struct ceph_osd_data { bool pages_from_pool; bool own_pages; }; + struct ceph_pagelist *pagelist; #ifdef CONFIG_BLOCK - struct bio *bio; + struct bio *bio; #endif /* CONFIG_BLOCK */ }; }; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ce34faaa453f..4159df2d67af 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1757,6 +1757,9 @@ static void ceph_osdc_msg_data_set(struct ceph_msg *msg, if (osd_data->length) ceph_msg_data_set_pages(msg, osd_data->pages, osd_data->length, osd_data->alignment); + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { + BUG_ON(!osd_data->pagelist->length); + ceph_msg_data_set_pagelist(msg, osd_data->pagelist); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { ceph_msg_data_set_bio(msg, osd_data->bio); -- cgit v1.2.3 From 95e072eb38f99c724739d91a1f12bb8bfe1619b5 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 13:35:36 -0600 Subject: libceph: kill osd request r_trail The osd trail is a pagelist, used only for a CALL osd operation to hold the class and method names, along with any input data for the call. It is only currently used by the rbd client, and when it's used it is the only bit of outbound data in the osd request. Since we already support (non-trail) pagelist data in a message, we can just save this outbound CALL data in the "normal" pagelist rather than the trail, and get rid of the trail entirely. The existing pagelist support depends on the pagelist being dynamically allocated, and ownership of it is passed to the messenger once it's been attached to a message. (That is to say, the messenger releases and frees the pagelist when it's done with it). That means we need to dynamically allocate the pagelist also. Note that we simply assert that the allocation of a pagelist structure succeeds. Appending to a pagelist might require a dynamic allocation, so we're already assuming we won't run into trouble doing so (we're just ignoring any failures--and that should be fixed at some point).
This resolves: http://tracker.ceph.com/issues/4407 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 1 - net/ceph/osd_client.c | 23 ++++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cf0ba93426da..1dab291b2dc6 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -134,7 +134,6 @@ struct ceph_osd_request { struct ceph_osd_data r_data_in; struct ceph_osd_data r_data_out; - struct ceph_pagelist r_trail; /* trailing part of data out */ }; struct ceph_osd_event { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 4159df2d67af..cb14db8496bd 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -138,7 +138,6 @@ void ceph_osdc_release_request(struct kref *kref) } ceph_put_snap_context(req->r_snapc); - ceph_pagelist_release(&req->r_trail); if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else @@ -202,7 +201,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, req->r_data_in.type = CEPH_OSD_DATA_TYPE_NONE; req->r_data_out.type = CEPH_OSD_DATA_TYPE_NONE; - ceph_pagelist_init(&req->r_trail); /* create request message; allow space for oid */ if (use_mempool) @@ -227,7 +225,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, struct ceph_osd_req_op *src) { u64 out_data_len = 0; - u64 tmp; + struct ceph_pagelist *pagelist; dst->op = cpu_to_le16(src->op); @@ -246,18 +244,23 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, cpu_to_le32(src->extent.truncate_seq); break; case CEPH_OSD_OP_CALL: + pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); + BUG_ON(!pagelist); + ceph_pagelist_init(pagelist); + dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); - - tmp = req->r_trail.length; - ceph_pagelist_append(&req->r_trail, src->cls.class_name, + ceph_pagelist_append(pagelist, src->cls.class_name, src->cls.class_len); - ceph_pagelist_append(&req->r_trail, src->cls.method_name, + ceph_pagelist_append(pagelist, src->cls.method_name, src->cls.method_len); - ceph_pagelist_append(&req->r_trail, src->cls.indata, + ceph_pagelist_append(pagelist, src->cls.indata, src->cls.indata_len); - out_data_len = req->r_trail.length - tmp; + + req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGELIST; + req->r_data_out.pagelist = pagelist; + out_data_len = pagelist->length; break; case CEPH_OSD_OP_STARTSYNC: break; @@ -1782,8 +1785,6 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, ceph_osdc_msg_data_set(req->r_reply, &req->r_data_in); ceph_osdc_msg_data_set(req->r_request, &req->r_data_out); - if (req->r_trail.length) - ceph_msg_data_set_trail(req->r_request, &req->r_trail); register_request(osdc, req); -- cgit v1.2.3 From 9d2a06c2750177dca5f8d0e89884c1d409d64bbc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 13:35:36 -0600 Subject: libceph: kill message trail The wart that is the ceph message trail can now be removed, because its only user was the osd client, and the previous patch made that no longer the case. The result allows write_partial_msg_pages() to be simplified considerably. 
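The combined effect of this patch and the previous one can be modeled in userspace: a CALL op allocates a pagelist, appends its class name, method name, and input data, hands ownership of the pagelist to the message, and reports the outgoing data length it implies. In this sketch the pagelist is a growable byte buffer rather than the kernel ceph_pagelist, and the class/method names are made up.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct pagelist {
	char *buf;
	size_t length;
};

static struct pagelist *pagelist_alloc(void)
{
	struct pagelist *pl = calloc(1, sizeof(*pl));

	assert(pl);	/* mirrors the BUG_ON() on allocation failure */
	return pl;
}

/* Append ignores allocation failure, as the patch notes the kernel does. */
static void pagelist_append(struct pagelist *pl, const void *d, size_t len)
{
	pl->buf = realloc(pl->buf, pl->length + len);
	assert(pl->buf);
	memcpy(pl->buf + pl->length, d, len);
	pl->length += len;
}

/* Encode a CALL op; the caller-visible result is the implied data length. */
static size_t encode_call(struct pagelist **msg_data, const char *cls,
			  const char *method, const void *indata,
			  size_t indata_len)
{
	struct pagelist *pl = pagelist_alloc();

	pagelist_append(pl, cls, strlen(cls));
	pagelist_append(pl, method, strlen(method));
	pagelist_append(pl, indata, indata_len);
	*msg_data = pl;		/* the message now owns (and frees) it */
	return pl->length;
}

int main(void)
{
	struct pagelist *pl;
	size_t len = encode_call(&pl, "cls", "method", "\x01\x02", 2);

	printf("CALL implies %zu bytes of outgoing data\n", len);
	free(pl->buf);		/* standing in for the messenger's release */
	free(pl);
	return 0;
}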
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 4 ---- net/ceph/messenger.c | 44 +++++------------------------------------- 2 files changed, 5 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index b53b9ef65009..0e4536cc46f0 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -69,7 +69,6 @@ struct ceph_messenger { #ifdef CONFIG_BLOCK #define ceph_msg_has_bio(m) ((m)->b.type == CEPH_MSG_DATA_BIO) #endif /* CONFIG_BLOCK */ -#define ceph_msg_has_trail(m) ((m)->t.type == CEPH_MSG_DATA_PAGELIST) enum ceph_msg_data_type { CEPH_MSG_DATA_NONE, /* message contains no data payload */ @@ -155,7 +154,6 @@ struct ceph_msg { #ifdef CONFIG_BLOCK struct ceph_msg_data b; /* bio */ #endif /* CONFIG_BLOCK */ - struct ceph_msg_data t; /* trail */ struct ceph_connection *con; struct list_head list_head; /* links for connection lists */ @@ -295,8 +293,6 @@ extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, extern void ceph_msg_data_set_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist); extern void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio); -extern void ceph_msg_data_set_trail(struct ceph_msg *msg, - struct ceph_pagelist *trail); extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d611156808b3..ff58d3182754 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1115,8 +1115,6 @@ static void prepare_message_data(struct ceph_msg *msg, ceph_msg_data_cursor_init(&msg->p); if (ceph_msg_has_pagelist(msg)) ceph_msg_data_cursor_init(&msg->l); - if (ceph_msg_has_trail(msg)) - ceph_msg_data_cursor_init(&msg->t); msg_pos->did_page_crc = false; } @@ -1398,7 +1396,7 @@ out: } static void out_msg_pos_next(struct ceph_connection *con, struct page *page, - size_t len, size_t sent, bool in_trail) + size_t len, size_t sent) { struct ceph_msg *msg = con->out_msg; struct ceph_msg_pos *msg_pos = &con->out_msg_pos; @@ -1409,9 +1407,7 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, msg_pos->data_pos += sent; msg_pos->page_pos += sent; - if (in_trail) - need_crc = ceph_msg_data_advance(&msg->t, sent); - else if (ceph_msg_has_pages(msg)) + if (ceph_msg_has_pages(msg)) need_crc = ceph_msg_data_advance(&msg->p, sent); else if (ceph_msg_has_pagelist(msg)) need_crc = ceph_msg_data_advance(&msg->l, sent); @@ -1481,14 +1477,6 @@ static int write_partial_message_data(struct ceph_connection *con) bool do_datacrc = !con->msgr->nocrc; int ret; int total_max_write; - bool in_trail = false; - size_t trail_len = 0; - size_t trail_off = data_len; - - if (ceph_msg_has_trail(msg)) { - trail_len = msg->t.pagelist->length; - trail_off -= trail_len; - } dout("%s %p msg %p page %d offset %d\n", __func__, con, msg, msg_pos->page, msg_pos->page_pos); @@ -1508,16 +1496,9 @@ static int write_partial_message_data(struct ceph_connection *con) bool use_cursor = false; bool last_piece = true; /* preserve existing behavior */ - in_trail = in_trail || msg_pos->data_pos >= trail_off; - if (!in_trail) - total_max_write = trail_off - msg_pos->data_pos; + total_max_write = data_len - msg_pos->data_pos; - if (in_trail) { - BUG_ON(!ceph_msg_has_trail(msg)); - use_cursor = true; - page = ceph_msg_data_next(&msg->t, &page_offset, - &length, &last_piece); - } else if (ceph_msg_has_pages(msg)) { + if (ceph_msg_has_pages(msg)) { 
use_cursor = true; page = ceph_msg_data_next(&msg->p, &page_offset, &length, &last_piece); @@ -1552,7 +1533,7 @@ static int write_partial_message_data(struct ceph_connection *con) if (ret <= 0) goto out; - out_msg_pos_next(con, page, length, (size_t) ret, in_trail); + out_msg_pos_next(con, page, length, (size_t) ret); } dout("%s %p msg %p done\n", __func__, con, msg); @@ -3145,17 +3126,6 @@ void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio) } EXPORT_SYMBOL(ceph_msg_data_set_bio); -void ceph_msg_data_set_trail(struct ceph_msg *msg, struct ceph_pagelist *trail) -{ - BUG_ON(!trail); - BUG_ON(!trail->length); - BUG_ON(msg->b.type != CEPH_MSG_DATA_NONE); - - msg->t.type = CEPH_MSG_DATA_PAGELIST; - msg->t.pagelist = trail; -} -EXPORT_SYMBOL(ceph_msg_data_set_trail); - /* * construct a new message with given type, size * the new msg has a ref count of 1. @@ -3179,7 +3149,6 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, ceph_msg_data_init(&m->p); ceph_msg_data_init(&m->l); ceph_msg_data_init(&m->b); - ceph_msg_data_init(&m->t); /* front */ m->front_max = front_len; @@ -3345,9 +3314,6 @@ void ceph_msg_last_put(struct kref *kref) m->l.pagelist = NULL; } - if (ceph_msg_has_trail(m)) - m->t.pagelist = NULL; - if (m->pool) ceph_msgpool_put(m->pool, m); else -- cgit v1.2.3 From 8a166d05369f6a0369bb194a795e6e3928ac6e34 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 13:35:36 -0600 Subject: libceph: more cleanup of write_partial_msg_pages() Basically all cases in write_partial_msg_pages() use the cursor, and as a result we can simplify that function quite a bit. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index ff58d3182754..997daccf973a 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1476,7 +1476,6 @@ static int write_partial_message_data(struct ceph_connection *con) unsigned int data_len = le32_to_cpu(msg->hdr.data_len); bool do_datacrc = !con->msgr->nocrc; int ret; - int total_max_write; dout("%s %p msg %p page %d offset %d\n", __func__, con, msg, msg_pos->page, msg_pos->page_pos); @@ -1490,36 +1489,30 @@ static int write_partial_message_data(struct ceph_connection *con) * been revoked, so use the zero page. 
*/ while (data_len > msg_pos->data_pos) { - struct page *page = NULL; + struct page *page; size_t page_offset; size_t length; - bool use_cursor = false; - bool last_piece = true; /* preserve existing behavior */ - - total_max_write = data_len - msg_pos->data_pos; + bool last_piece; if (ceph_msg_has_pages(msg)) { - use_cursor = true; page = ceph_msg_data_next(&msg->p, &page_offset, &length, &last_piece); } else if (ceph_msg_has_pagelist(msg)) { - use_cursor = true; page = ceph_msg_data_next(&msg->l, &page_offset, &length, &last_piece); #ifdef CONFIG_BLOCK } else if (ceph_msg_has_bio(msg)) { - use_cursor = true; page = ceph_msg_data_next(&msg->b, &page_offset, &length, &last_piece); #endif } else { - page = zero_page; - } - if (!use_cursor) { - length = min_t(int, PAGE_SIZE - msg_pos->page_pos, - total_max_write); + size_t resid = data_len - msg_pos->data_pos; + page = zero_page; page_offset = msg_pos->page_pos; + length = PAGE_SIZE - page_offset; + length = min(resid, length); + last_piece = length == resid; } if (do_datacrc && !msg_pos->did_page_crc) { u32 crc = le32_to_cpu(msg->footer.data_crc); -- cgit v1.2.3 From 3a23083bda56850a1dc0e1c6d270b1f5dc789f07 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 25 Mar 2013 08:47:40 -0700 Subject: libceph: implement RECONNECT_SEQ feature This is an old protocol extension that allows the client and server to avoid resending old messages after a reconnect (following a socket error). Instead, they exchange their sequence numbers during the handshake. This avoids sending a bunch of useless data over the socket. It has been supported in the server code since v0.22 (Sep 2010). Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- include/linux/ceph/ceph_features.h | 2 ++ include/linux/ceph/msgr.h | 1 + net/ceph/messenger.c | 43 +++++++++++++++++++++++++++++++++----- 3 files changed, 41 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 76554cecaab2..4c42080347af 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -41,6 +41,7 @@ */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC | \ @@ -51,6 +52,7 @@ #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC) diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index 680d3d648cac..3d94a73b5f30 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -87,6 +87,7 @@ struct ceph_entity_inst { #define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ #define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ +#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */ /* diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 997daccf973a..e8491db43f5e 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1246,6 +1246,24 @@ static void prepare_write_ack(struct ceph_connection *con) con_flag_set(con, CON_FLAG_WRITE_PENDING); } +/* + * Prepare to share the seq during handshake + */ +static void prepare_write_seq(struct ceph_connection *con) +{ + dout("prepare_write_seq %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con->out_temp_ack = 
cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + + con_flag_set(con, CON_FLAG_WRITE_PENDING); +} + /* * Prepare to write keepalive byte. */ @@ -1582,6 +1600,13 @@ static void prepare_read_ack(struct ceph_connection *con) con->in_base_pos = 0; } +static void prepare_read_seq(struct ceph_connection *con) +{ + dout("prepare_read_seq %p\n", con); + con->in_base_pos = 0; + con->in_tag = CEPH_MSGR_TAG_SEQ; +} + static void prepare_read_tag(struct ceph_connection *con) { dout("prepare_read_tag %p\n", con); @@ -2059,6 +2084,7 @@ static int process_connect(struct ceph_connection *con) prepare_read_connect(con); break; + case CEPH_MSGR_TAG_SEQ: case CEPH_MSGR_TAG_READY: if (req_feat & ~server_feat) { pr_err("%s%lld %s protocol feature mismatch," @@ -2089,7 +2115,12 @@ static int process_connect(struct ceph_connection *con) con->delay = 0; /* reset backoff memory */ - prepare_read_tag(con); + if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { + prepare_write_seq(con); + prepare_read_seq(con); + } else { + prepare_read_tag(con); + } break; case CEPH_MSGR_TAG_WAIT: @@ -2123,7 +2154,6 @@ static int read_partial_ack(struct ceph_connection *con) return read_partial(con, end, size, &con->in_temp_ack); } - /* * We can finally discard anything that's been acked. */ @@ -2148,8 +2178,6 @@ static void process_ack(struct ceph_connection *con) } - - static int read_partial_message_section(struct ceph_connection *con, struct kvec *section, unsigned int sec_len, u32 *crc) @@ -2672,7 +2700,12 @@ more: prepare_read_tag(con); goto more; } - if (con->in_tag == CEPH_MSGR_TAG_ACK) { + if (con->in_tag == CEPH_MSGR_TAG_ACK || + con->in_tag == CEPH_MSGR_TAG_SEQ) { + /* + * the final handshake seq exchange is semantically + * equivalent to an ACK + */ ret = read_partial_ack(con); if (ret <= 0) goto out; -- cgit v1.2.3 From 20e55c4cc758e4dccdfd92ae8e9588dd624b2cd7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 25 Mar 2013 09:30:13 -0700 Subject: libceph: clear messenger auth_retry flag when we authenticate We maintain a counter of failed auth attempts to allow us to retry once before failing. However, if the second attempt succeeds, the flag isn't cleared, which makes us think auth failed again later when the connection resets for other reasons (like a socket error). This is one part of the sorry sequence of events in bug http://tracker.ceph.com/issues/4282 Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/messenger.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e8491db43f5e..2aecc4896a03 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2013,7 +2013,6 @@ static int process_connect(struct ceph_connection *con) con->error_msg = "connect authorization failure"; return -1; } - con->auth_retry = 1; con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) @@ -2099,7 +2098,7 @@ static int process_connect(struct ceph_connection *con) WARN_ON(con->state != CON_STATE_NEGOTIATING); con->state = CON_STATE_OPEN; - + con->auth_retry = 0; /* we authenticated; clear flag */ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; con->peer_features = server_feat; -- cgit v1.2.3 From 4b8e8b5d78b8322351d44487c1b76f7e9d3412bc Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 25 Mar 2013 10:25:49 -0700 Subject: libceph: fix authorizer invalidation We were invalidating the authorizer by removing the ticket handler entirely. 
This was effective in inducing us to request a new authorizer, but in the meantime it meant that any authorizer we generated would get a new and initialized handler with secret_id=0, which would always be rejected by the server side with a confusing error message: auth: could not find secret_id=0 cephx: verify_authorizer could not get service secret for service osd secret_id=0 Instead, simply clear the validity field. This will still induce the auth code to request a new secret, but will let us continue to use the old ticket in the meantime. The messenger code will probably continue to fail, but the exponential backoff will kick in, and eventually we will get a new (hopefully more valid) ticket from the mon and be able to continue. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/auth_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index a16bf14eb027..bd8758dbfded 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -630,7 +630,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, th = get_ticket_handler(ac, peer_type); if (!IS_ERR(th)) - remove_ticket_handler(ac, th); + memset(&th->validity, 0, sizeof(th->validity)); } -- cgit v1.2.3 From 0bed9b5c523d577378b6f83eab5835fe30c27208 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 25 Mar 2013 10:26:01 -0700 Subject: libceph: add update_authorizer auth method Currently the messenger calls out to a get_authorizer con op, which will create a new authorizer if it doesn't yet have one. In the meantime, when we rotate our service keys, the authorizer doesn't get updated. Eventually it will be rejected by the server on a new connection attempt and get invalidated, and we will then rebuild a new authorizer, but this is not ideal. Instead, if we do have an authorizer, call a new update_authorizer op that will verify that the current authorizer is using the latest secret. If it is not, we will build a new one that does. This avoids the transient failure.
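The refresh rule the new op implements is simple to state: rebuild the authorizer only when its recorded secret_id is older than the one carried by the current ticket. A userspace sketch (the types and values are illustrative stand-ins for the cephx structures):

#include <stdio.h>

struct ticket_handler {
	unsigned long long secret_id;	/* latest service secret we hold */
};

struct authorizer {
	unsigned long long secret_id;	/* secret the authorizer was built with */
};

/* Rebuild only if the authorizer's secret is older than the ticket's. */
static void update_authorizer(struct authorizer *au,
			      const struct ticket_handler *th)
{
	if (au->secret_id < th->secret_id) {
		printf("secret %llu < %llu: rebuilding authorizer\n",
		       au->secret_id, th->secret_id);
		au->secret_id = th->secret_id;	/* stands in for a rebuild */
	} else {
		printf("authorizer current (secret %llu)\n", au->secret_id);
	}
}

int main(void)
{
	struct ticket_handler th = { .secret_id = 7 };	/* keys rotated */
	struct authorizer au = { .secret_id = 5 };

	update_authorizer(&au, &th);	/* stale: rebuilt */
	update_authorizer(&au, &th);	/* already current: no-op */
	return 0;
}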
This fixes one part of the sorry sequence of events for bug http://tracker.ceph.com/issues/4282 Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- fs/ceph/mds_client.c | 7 ++++++- include/linux/ceph/auth.h | 3 +++ net/ceph/auth_x.c | 23 +++++++++++++++++++++++ net/ceph/auth_x.h | 1 + net/ceph/osd_client.c | 5 +++++ 5 files changed, 38 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0db6f5206d11..010ff83d640b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3445,7 +3445,12 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, } if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, - auth); + auth); if (ret) return ERR_PTR(ret); + } else if (ac->ops && ac->ops->update_authorizer) { + int ret = ac->ops->update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); if (ret) return ERR_PTR(ret); } diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index d4080f309b56..73e973e70026 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -52,6 +52,9 @@ struct ceph_auth_client_ops { */ int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth); + /* ensure that an existing authorizer is up to date */ + int (*update_authorizer)(struct ceph_auth_client *ac, int peer_type, + struct ceph_auth_handshake *auth); int (*verify_authorizer_reply)(struct ceph_auth_client *ac, struct ceph_authorizer *a, size_t len); void (*destroy_authorizer)(struct ceph_auth_client *ac, diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index bd8758dbfded..2d5981555cd6 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -298,6 +298,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, return -ENOMEM; } au->service = th->service; + au->secret_id = th->secret_id; msg_a = au->buf->vec.iov_base; msg_a->struct_v = 1; @@ -555,6 +556,27 @@ static int ceph_x_create_authorizer( return 0; } +static int ceph_x_update_authorizer( + struct ceph_auth_client *ac, int peer_type, + struct ceph_auth_handshake *auth) +{ + struct ceph_x_authorizer *au; + struct ceph_x_ticket_handler *th; + int ret; + + th = get_ticket_handler(ac, peer_type); + if (IS_ERR(th)) + return PTR_ERR(th); + + au = (struct ceph_x_authorizer *)auth->authorizer; + if (au->secret_id < th->secret_id) { + dout("ceph_x_update_authorizer service %u secret %llu < %llu\n", + au->service, au->secret_id, th->secret_id); + return ceph_x_build_authorizer(ac, th, au); + } + return 0; +} + static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, struct ceph_authorizer *a, size_t len) { @@ -641,6 +663,7 @@ static const struct ceph_auth_client_ops ceph_x_ops = { .build_request = ceph_x_build_request, .handle_reply = ceph_x_handle_reply, .create_authorizer = ceph_x_create_authorizer, + .update_authorizer = ceph_x_update_authorizer, .verify_authorizer_reply = ceph_x_verify_authorizer_reply, .destroy_authorizer = ceph_x_destroy_authorizer, .invalidate_authorizer = ceph_x_invalidate_authorizer, diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index f459e93b774f..c5a058da7ac8 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -29,6 +29,7 @@ struct ceph_x_authorizer { struct ceph_buffer *buf; unsigned int service; u64 nonce; + u64 secret_id; char reply_buf[128]; /* big enough for encrypted blob */ }; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cb14db8496bd..5ef24e3e1627 100644 ---
a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2220,6 +2220,11 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, auth); if (ret) return ERR_PTR(ret); + } else if (ac->ops && ac->ops->update_authorizer) { + int ret = ac->ops->update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, + auth); + if (ret) + return ERR_PTR(ret); } *proto = ac->protocol; -- cgit v1.2.3 From 27859f9773e4a0b2042435b13400ee2c891a61f4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 25 Mar 2013 10:26:14 -0700 Subject: libceph: wrap auth ops in wrapper functions Use wrapper functions that check whether the auth op exists so that callers do not need a bunch of conditional checks. Simplifies the external interface. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- fs/ceph/mds_client.c | 26 ++++++++++++-------------- include/linux/ceph/auth.h | 13 +++++++++++++ net/ceph/auth.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ net/ceph/auth_x.c | 1 - net/ceph/mon_client.c | 7 +++---- net/ceph/osd_client.c | 26 +++++++++----------------- 6 files changed, 84 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 010ff83d640b..13ae44eaa980 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -365,9 +365,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s) atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); if (atomic_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) - s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( - s->s_mdsc->fsc->client->monc.auth, - s->s_auth.authorizer); + ceph_auth_destroy_authorizer( + s->s_mdsc->fsc->client->monc.auth, + s->s_auth.authorizer); kfree(s); } } @@ -3439,18 +3439,17 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_auth_handshake *auth = &s->s_auth; if (force_new && auth->authorizer) { - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, auth->authorizer); + ceph_auth_destroy_authorizer(ac, auth->authorizer); auth->authorizer = NULL; } - if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { - int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, - auth); + if (!auth->authorizer) { + int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); if (ret) return ERR_PTR(ret); - } else if (ac->ops && ac->ops->update_authorizer) { - int ret = ac->ops->update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, - auth); + } else { + int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); if (ret) return ERR_PTR(ret); } @@ -3466,7 +3465,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - return ac->ops->verify_authorizer_reply(ac, s->s_auth.authorizer, len); + return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len); } static int invalidate_authorizer(struct ceph_connection *con) @@ -3475,8 +3474,7 @@ static int invalidate_authorizer(struct ceph_connection *con) struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - if (ac->ops->invalidate_authorizer) - ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); + ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 73e973e70026..c9c3b3abe4a3 100644 ---
b/include/linux/ceph/auth.h @@ -97,5 +97,18 @@ extern int ceph_build_auth(struct ceph_auth_client *ac, void *msg_buf, size_t msg_len); extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); +extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *auth); +extern void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a); +extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *a); +extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + size_t len); +extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, + int peer_type); #endif diff --git a/net/ceph/auth.c b/net/ceph/auth.c index b4bf4ac090f1..a22de543cedb 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -257,3 +257,50 @@ int ceph_auth_is_authenticated(struct ceph_auth_client *ac) return 0; return ac->ops->is_authenticated(ac); } +EXPORT_SYMBOL(ceph_auth_is_authenticated); + +int ceph_auth_create_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *auth) +{ + if (ac->ops && ac->ops->create_authorizer) + return ac->ops->create_authorizer(ac, peer_type, auth); + return 0; +} +EXPORT_SYMBOL(ceph_auth_create_authorizer); + +void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a) +{ + if (ac->ops && ac->ops->destroy_authorizer) + ac->ops->destroy_authorizer(ac, a); +} +EXPORT_SYMBOL(ceph_auth_destroy_authorizer); + +int ceph_auth_update_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *a) +{ + int ret = 0; + + if (ac->ops && ac->ops->update_authorizer) + ret = ac->ops->update_authorizer(ac, peer_type, a); + return ret; +} +EXPORT_SYMBOL(ceph_auth_update_authorizer); + +int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, size_t len) +{ + if (ac->ops && ac->ops->verify_authorizer_reply) + return ac->ops->verify_authorizer_reply(ac, a, len); + return 0; +} +EXPORT_SYMBOL(ceph_auth_verify_authorizer_reply); + +void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type) +{ + if (ac->ops && ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, peer_type); +} +EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 2d5981555cd6..96238ba95f2b 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -562,7 +562,6 @@ static int ceph_x_update_authorizer( { struct ceph_x_authorizer *au; struct ceph_x_ticket_handler *th; - int ret; th = get_ticket_handler(ac, peer_type); if (IS_ERR(th)) diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index aef5b1062bee..1fe25cd29d0e 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -737,7 +737,7 @@ static void delayed_work(struct work_struct *work) __validate_auth(monc); - if (monc->auth->ops->is_authenticated(monc->auth)) + if (ceph_auth_is_authenticated(monc->auth)) __send_subscribe(monc); } __schedule_delayed(monc); @@ -892,8 +892,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, mutex_lock(&monc->mutex); had_debugfs_info = have_debugfs_info(monc); - if (monc->auth->ops) - was_auth = monc->auth->ops->is_authenticated(monc->auth); + was_auth = ceph_auth_is_authenticated(monc->auth); monc->pending_auth = 0; ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, msg->front.iov_len, @@ -904,7 +903,7 @@ static 
void handle_auth_reply(struct ceph_mon_client *monc, wake_up_all(&monc->client->auth_wq); } else if (ret > 0) { __send_prepared_auth_request(monc, ret); - } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { + } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5ef24e3e1627..7041906a55a6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -666,8 +666,7 @@ static void put_osd(struct ceph_osd *osd) if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer); + ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); kfree(osd); } } @@ -2211,17 +2210,16 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_auth_handshake *auth = &o->o_auth; if (force_new && auth->authorizer) { - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, auth->authorizer); + ceph_auth_destroy_authorizer(ac, auth->authorizer); auth->authorizer = NULL; } - if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { - int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); + if (!auth->authorizer) { + int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, + auth); if (ret) return ERR_PTR(ret); - } else if (ac->ops && ac->ops->update_authorizer) { - int ret = ac->ops->update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, + } else { + int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, auth); if (ret) return ERR_PTR(ret); @@ -2238,11 +2236,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; - /* - * XXX If ac->ops or ac->ops->verify_authorizer_reply is null, - * XXX which do we do: succeed or fail? - */ - return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len); + return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len); } static int invalidate_authorizer(struct ceph_connection *con) @@ -2251,9 +2245,7 @@ static int invalidate_authorizer(struct ceph_connection *con) struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; - if (ac->ops && ac->ops->invalidate_authorizer) - ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); - + ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); return ceph_monc_validate_auth(&osdc->client->monc); } -- cgit v1.2.3 From e9966076cdd952e19f2dd4854cd719be0d7cbebc Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 25 Mar 2013 10:26:30 -0700 Subject: libceph: wrap auth methods in a mutex The auth code is called from a variety of contexts, including the mon_client (protected by the monc's mutex) and the messenger callbacks (currently protected by nothing). Avoid chaos by protecting all auth state with a mutex. Nothing is blocking, so this should be simple and lightweight.
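The locking pattern the patch introduces is uniform across the entry points. Here is a condensed user-space model of its shape (a sketch only: a pthread mutex stands in for the kernel's struct mutex, and the ops table is reduced to a single method):

  #include <pthread.h>
  #include <stdio.h>

  struct auth_ops { int (*is_authenticated)(void *priv); };

  struct auth_client {
          const struct auth_ops *ops; /* may be NULL while negotiating */
          void *priv;
          pthread_mutex_t mutex;
  };

  /* every wrapper: lock, check that the op exists, call it, unlock */
  static int auth_is_authenticated(struct auth_client *ac)
  {
          int ret = 0;

          pthread_mutex_lock(&ac->mutex);
          if (ac->ops && ac->ops->is_authenticated)
                  ret = ac->ops->is_authenticated(ac->priv);
          pthread_mutex_unlock(&ac->mutex);
          return ret;
  }

  int main(void)
  {
          struct auth_client ac = { .ops = NULL,
                                    .mutex = PTHREAD_MUTEX_INITIALIZER };

          printf("%d\n", auth_is_authenticated(&ac)); /* 0: no ops yet */
          return 0;
  }

Because the op-existence check and the call happen under the same lock, a caller can never observe a half-updated ops table.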
Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- include/linux/ceph/auth.h | 2 ++ net/ceph/auth.c | 78 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 58 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index c9c3b3abe4a3..5f3386844134 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -78,6 +78,8 @@ struct ceph_auth_client { u64 global_id; /* our unique id in system */ const struct ceph_crypto_key *key; /* our secret key */ unsigned want_keys; /* which services we want */ + + struct mutex mutex; }; extern struct ceph_auth_client *ceph_auth_init(const char *name, diff --git a/net/ceph/auth.c b/net/ceph/auth.c index a22de543cedb..6b923bcaa2a4 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -47,6 +47,7 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp if (!ac) goto out; + mutex_init(&ac->mutex); ac->negotiating = true; if (name) ac->name = name; @@ -73,10 +74,12 @@ void ceph_auth_destroy(struct ceph_auth_client *ac) */ void ceph_auth_reset(struct ceph_auth_client *ac) { + mutex_lock(&ac->mutex); dout("auth_reset %p\n", ac); if (ac->ops && !ac->negotiating) ac->ops->reset(ac); ac->negotiating = true; + mutex_unlock(&ac->mutex); } int ceph_entity_name_encode(const char *name, void **p, void *end) @@ -102,6 +105,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) int i, num; int ret; + mutex_lock(&ac->mutex); dout("auth_build_hello\n"); monhdr->have_version = 0; monhdr->session_mon = cpu_to_le16(-1); @@ -122,15 +126,19 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) ret = ceph_entity_name_encode(ac->name, &p, end); if (ret < 0) - return ret; + goto out; ceph_decode_need(&p, end, sizeof(u64), bad); ceph_encode_64(&p, ac->global_id); ceph_encode_32(&lenp, p - lenp - sizeof(u32)); - return p - buf; + ret = p - buf; +out: + mutex_unlock(&ac->mutex); + return ret; bad: - return -ERANGE; + ret = -ERANGE; + goto out; } static int ceph_build_auth_request(struct ceph_auth_client *ac, @@ -151,11 +159,13 @@ static int ceph_build_auth_request(struct ceph_auth_client *ac, if (ret < 0) { pr_err("error %d building auth method %s request\n", ret, ac->ops->name); - return ret; + goto out; } dout(" built request %d bytes\n", ret); ceph_encode_32(&p, ret); - return p + ret - msg_buf; + ret = p + ret - msg_buf; +out: + return ret; } /* @@ -176,6 +186,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, int result_msg_len; int ret = -EINVAL; + mutex_lock(&ac->mutex); dout("handle_auth_reply %p %p\n", p, end); ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad); protocol = ceph_decode_32(&p); @@ -227,35 +238,44 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ret = ac->ops->handle_reply(ac, result, payload, payload_end); if (ret == -EAGAIN) { - return ceph_build_auth_request(ac, reply_buf, reply_len); + ret = ceph_build_auth_request(ac, reply_buf, reply_len); } else if (ret) { pr_err("auth method '%s' error %d\n", ac->ops->name, ret); - return ret; } - return 0; -bad: - pr_err("failed to decode auth msg\n"); out: + mutex_unlock(&ac->mutex); return ret; + +bad: + pr_err("failed to decode auth msg\n"); + ret = -EINVAL; + goto out; } int ceph_build_auth(struct ceph_auth_client *ac, void *msg_buf, size_t msg_len) { + int ret = 0; + + mutex_lock(&ac->mutex); if (!ac->protocol) - return ceph_auth_build_hello(ac, msg_buf, msg_len); - BUG_ON(!ac->ops); - if 
(ac->ops->should_authenticate(ac)) - return ceph_build_auth_request(ac, msg_buf, msg_len); - return 0; + ret = ceph_auth_build_hello(ac, msg_buf, msg_len); + else if (ac->ops->should_authenticate(ac)) + ret = ceph_build_auth_request(ac, msg_buf, msg_len); + mutex_unlock(&ac->mutex); + return ret; } int ceph_auth_is_authenticated(struct ceph_auth_client *ac) { - if (!ac->ops) - return 0; - return ac->ops->is_authenticated(ac); + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops) + ret = ac->ops->is_authenticated(ac); + mutex_unlock(&ac->mutex); + return ret; } EXPORT_SYMBOL(ceph_auth_is_authenticated); @@ -263,17 +283,23 @@ int ceph_auth_create_authorizer(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth) { + int ret = 0; + + mutex_lock(&ac->mutex); if (ac->ops && ac->ops->create_authorizer) - return ac->ops->create_authorizer(ac, peer_type, auth); - return 0; + ret = ac->ops->create_authorizer(ac, peer_type, auth); + mutex_unlock(&ac->mutex); + return ret; } EXPORT_SYMBOL(ceph_auth_create_authorizer); void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, struct ceph_authorizer *a) { + mutex_lock(&ac->mutex); if (ac->ops && ac->ops->destroy_authorizer) ac->ops->destroy_authorizer(ac, a); + mutex_unlock(&ac->mutex); } EXPORT_SYMBOL(ceph_auth_destroy_authorizer); @@ -283,8 +309,10 @@ int ceph_auth_update_authorizer(struct ceph_auth_client *ac, { int ret = 0; + mutex_lock(&ac->mutex); if (ac->ops && ac->ops->update_authorizer) ret = ac->ops->update_authorizer(ac, peer_type, a); + mutex_unlock(&ac->mutex); return ret; } EXPORT_SYMBOL(ceph_auth_update_authorizer); @@ -292,15 +320,21 @@ EXPORT_SYMBOL(ceph_auth_update_authorizer); int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, struct ceph_authorizer *a, size_t len) { + int ret = 0; + + mutex_lock(&ac->mutex); if (ac->ops && ac->ops->verify_authorizer_reply) - return ac->ops->verify_authorizer_reply(ac, a, len); - return 0; + ret = ac->ops->verify_authorizer_reply(ac, a, len); + mutex_unlock(&ac->mutex); + return ret; } EXPORT_SYMBOL(ceph_auth_verify_authorizer_reply); void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type) { + mutex_lock(&ac->mutex); if (ac->ops && ac->ops->invalidate_authorizer) ac->ops->invalidate_authorizer(ac, peer_type); + mutex_unlock(&ac->mutex); } EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); -- cgit v1.2.3 From dc4b870c97a5006871c259f7e61ea6c79038f731 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 25 Mar 2013 18:16:11 -0500 Subject: libceph: slightly defer registering osd request One of the first things ceph_osdc_start_request() does is register the request. It then acquires the osd client's map semaphore and request mutex and proceeds to map and send the request. There is no reason the request has to be registered before acquiring the map semaphore. So hold off doing so until after the map semaphore is held. Since register_request() is nothing more than a wrapper around __register_request(), call the latter function instead, after acquiring the request mutex. That leaves register_request() unused, so get rid of it. 
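The resulting shape of the function can be sketched as follows; this is a user-space model with stand-in lock primitives and a stub __register_request(), not the kernel API. The point is simply that the request never becomes globally visible except while both locks are held.

  #include <pthread.h>

  struct osd_client {
          pthread_rwlock_t map_sem; /* stands in for the rw semaphore */
          pthread_mutex_t request_mutex;
  };

  static void __register_request(struct osd_client *osdc)
  {
          (void)osdc; /* assign tid, add to lists, ... */
  }

  static void start_request(struct osd_client *osdc)
  {
          pthread_rwlock_rdlock(&osdc->map_sem);
          pthread_mutex_lock(&osdc->request_mutex);
          __register_request(osdc); /* no unlocked window any more */
          /* ... map and send ... */
          pthread_mutex_unlock(&osdc->request_mutex);
          pthread_rwlock_unlock(&osdc->map_sem);
  }

  int main(void)
  {
          struct osd_client osdc;

          pthread_rwlock_init(&osdc.map_sem, NULL);
          pthread_mutex_init(&osdc.request_mutex, NULL);
          start_request(&osdc);
          return 0;
  }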
This partially resolves: http://tracker.ceph.com/issues/4392 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7041906a55a6..f9276cb26aa2 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -831,14 +831,6 @@ static void __register_request(struct ceph_osd_client *osdc, } } -static void register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - mutex_lock(&osdc->request_mutex); - __register_request(osdc, req); - mutex_unlock(&osdc->request_mutex); -} - /* * called under osdc->request_mutex */ @@ -1785,8 +1777,6 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, ceph_osdc_msg_data_set(req->r_reply, &req->r_data_in); ceph_osdc_msg_data_set(req->r_request, &req->r_data_out); - register_request(osdc, req); - down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); /* @@ -1794,6 +1784,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, * while we dropped request_mutex above, so only send now if * the request still han't been touched yet. */ + __register_request(osdc, req); if (req->r_sent == 0) { rc = __map_request(osdc, req, 0); if (rc < 0) { -- cgit v1.2.3 From 92451b4910895936cc05ce1d283644ffc44d7537 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 25 Mar 2013 18:16:11 -0500 Subject: libceph: no more kick_requests() race Since we no longer drop the request mutex between registering and mapping an osd request in ceph_osdc_start_request(), there is no chance of a race with kick_requests(). We can now therefore map and send the new request unconditionally (but we'll issue a warning should the race ever occur). Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f9276cb26aa2..3723a7f16afd 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1779,31 +1779,24 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); - /* - * a racing kick_requests() may have sent the message for us - * while we dropped request_mutex above, so only send now if - * the request still han't been touched yet.
- */ __register_request(osdc, req); - if (req->r_sent == 0) { - rc = __map_request(osdc, req, 0); - if (rc < 0) { - if (nofail) { - dout("osdc_start_request failed map, " - " will retry %lld\n", req->r_tid); - rc = 0; - } - goto out_unlock; - } - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); - } else { - __send_request(osdc, req); + WARN_ON(req->r_sent); + rc = __map_request(osdc, req, 0); + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; } - rc = 0; + goto out_unlock; } - + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } else { + __send_request(osdc, req); + } + rc = 0; out_unlock: mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); -- cgit v1.2.3 From e02493c07c4cb08106d0b3a4b5003c7c005010fb Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 25 Mar 2013 18:16:11 -0500 Subject: libceph: requeue only sent requests when kicking The osd expects incoming requests for a given object from a given client to arrive in order, with the tid for each request being greater than the tid for requests that have already arrived. This patch fixes two places the osd client might not maintain that ordering. For the osd client, the connection fault method is osd_reset(). That function calls __reset_osd() to close and re-open the connection, then calls __kick_osd_requests() to cause all outstanding requests for the affected osd to be re-sent after the connection has been re-established. When an osd is reset, any in-flight messages will need to be re-sent. An osd client maintains distinct lists for unsent and in-flight messages. Meanwhile, an osd maintains a single list of all its requests (both sent and unsent). (Each message is linked into two lists--one for the osd client and one for the osd.) To process an osd "kick" operation, the request list for the *osd* is traversed, and each request is moved off whichever osd *client* list it was on (unsent or sent) and placed onto the osd client's unsent list. (It remains where it is on the osd's request list.) When that is done, osd_reset() calls __send_queued() to cause each of the osd client's unsent messages to be sent. OK, with that background... As the osd request list is traversed, each request is prepended to the osd client's unsent list in the order they're seen. The effect of this is to reverse the order of these requests as they are put (back) onto the unsent list. Instead, build up a list of only the requests for an osd that have already been sent (by checking their r_sent flag values). Once an unsent request is found, stop examining requests and prepend the requests that need re-sending to the osd client's unsent list. Preserve the original order of requests in the process (previously re-queued requests were reversed). Because they have already been sent, they will have lower tids than any request already present on the unsent list. Just below that, traverse the linger list in forward order as before, but add them to the *tail* of the list rather than the head. These requests get re-registered, and in the process are given a new (higher) tid, so they should go at the end.
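A toy user-space model of the splice may help; plain arrays stand in for the kernel's linked lists, and the tid values are invented for illustration. The already-sent requests end up ahead of the never-sent ones, with the overall tid order preserved.

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          /* the osd's request list, in tid order; the first three were sent */
          unsigned long long osd_reqs[] = { 10, 11, 12, 20, 21 };
          int r_sent[] = { 1, 1, 1, 0, 0 };
          /* the osd client's unsent list already holds the unsent ones */
          unsigned long long unsent[8] = { 20, 21 };
          int n_unsent = 2;
          unsigned long long resend[8];
          int n_resend = 0;
          int i;

          /* collect sent requests in order; stop at the first unsent one */
          for (i = 0; i < 5; i++) {
                  if (!r_sent[i])
                          break;
                  resend[n_resend++] = osd_reqs[i];
          }

          /* splice them onto the *front* of the unsent list */
          memmove(unsent + n_resend, unsent, n_unsent * sizeof(unsent[0]));
          memcpy(unsent, resend, n_resend * sizeof(resend[0]));
          n_unsent += n_resend;

          for (i = 0; i < n_unsent; i++)
                  printf("%llu ", unsent[i]); /* 10 11 12 20 21 */
          printf("\n");
          return 0;
  }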
This partially resolves: http://tracker.ceph.com/issues/4392 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3723a7f16afd..8b84fb4980ba 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -570,21 +570,46 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd) { struct ceph_osd_request *req, *nreq; + LIST_HEAD(resend); int err; dout("__kick_osd_requests osd%d\n", osd->o_osd); err = __reset_osd(osdc, osd); if (err) return; - + /* + * Build up a list of requests to resend by traversing the + * osd's list of requests. Requests for a given object are + * sent in tid order, and that is also the order they're + * kept on this list. Therefore all requests that are in + * flight will be found first, followed by all requests that + * have not yet been sent. And to resend requests while + * preserving this order we will want to put any sent + * requests back on the front of the osd client's unsent + * list. + * + * So we build a separate ordered list of already-sent + * requests for the affected osd and splice it onto the + * front of the osd client's unsent list. Once we've seen a + * request that has not yet been sent we're done. Those + * requests are already sitting right where they belong. + */ list_for_each_entry(req, &osd->o_requests, r_osd_item) { - list_move(&req->r_req_lru_item, &osdc->req_unsent); - dout("requeued %p tid %llu osd%d\n", req, req->r_tid, + if (!req->r_sent) + break; + list_move_tail(&req->r_req_lru_item, &resend); + dout("requeueing %p tid %llu osd%d\n", req, req->r_tid, osd->o_osd); if (!req->r_linger) req->r_flags |= CEPH_OSD_FLAG_RETRY; } + list_splice(&resend, &osdc->req_unsent); + /* + * Linger requests are re-registered before sending, which + * sets up a new tid for each. We add them to the unsent + * list at the end to keep things in tid order. + */ list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, r_linger_osd) { /* @@ -593,7 +618,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, */ BUG_ON(!list_empty(&req->r_req_lru_item)); __register_request(osdc, req); - list_add(&req->r_req_lru_item, &osdc->req_unsent); + list_add_tail(&req->r_req_lru_item, &osdc->req_unsent); list_add(&req->r_osd_item, &req->r_osd->o_requests); __unregister_linger_request(osdc, req); dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, -- cgit v1.2.3 From ad885927dee2e72fbfab624c7599cb9d9352cc04 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 25 Mar 2013 18:16:11 -0500 Subject: libceph: keep request lists in tid order In __map_request(), when adding a request to an osd client's unsent list, add it to the tail rather than the head. That way the newest entries (with the highest tid value) will be last. Maintain an osd's request list in order of increasing tid also. Finally--to be consistent--maintain an osd client's "notarget" list in that order as well.
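The head-versus-tail distinction is easy to demonstrate in isolation. In this user-space toy (arrays modeling the effect of list_add() and list_add_tail() on a struct list_head), head insertion reverses arrival order while tail insertion preserves it:

  #include <stdio.h>

  int main(void)
  {
          unsigned long long tids[] = { 1, 2, 3 }; /* arrival (tid) order */
          unsigned long long head_order[3], tail_order[3];
          int i;

          for (i = 0; i < 3; i++) {
                  head_order[2 - i] = tids[i]; /* list_add(): prepend */
                  tail_order[i] = tids[i];     /* list_add_tail(): append */
          }

          for (i = 0; i < 3; i++)
                  printf("head=%llu tail=%llu\n",
                         head_order[i], tail_order[i]);
          /* head order comes out 3 2 1; tail order stays 1 2 3 */
          return 0;
  }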
This partially resolves: http://tracker.ceph.com/issues/4392 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 8b84fb4980ba..356f7bc4ae4b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -619,7 +619,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, BUG_ON(!list_empty(&req->r_req_lru_item)); __register_request(osdc, req); list_add_tail(&req->r_req_lru_item, &osdc->req_unsent); - list_add(&req->r_osd_item, &req->r_osd->o_requests); + list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); __unregister_linger_request(osdc, req); dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, osd->o_osd); @@ -1035,10 +1035,10 @@ static int __map_request(struct ceph_osd_client *osdc, if (req->r_osd) { __remove_osd_from_lru(req->r_osd); - list_add(&req->r_osd_item, &req->r_osd->o_requests); - list_move(&req->r_req_lru_item, &osdc->req_unsent); + list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); + list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); } else { - list_move(&req->r_req_lru_item, &osdc->req_notarget); + list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); } err = 1; /* osd or pg changed */ -- cgit v1.2.3 From 7e2766a1135544a2972d2767f3a41afd5f55067f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 25 Mar 2013 18:16:11 -0500 Subject: libceph: send queued requests when starting new one An osd expects the transaction ids of arriving request messages from a given client to a given osd to increase monotonically. So the osd client needs to send its requests in ascending tid order. The transaction id for a request is set at the time it is registered, in __register_request(). This is also where the request gets placed at the end of the osd client's unsent messages list. At the end of ceph_osdc_start_request(), the request message for a newly-mapped osd request is supplied to the messenger to be sent (via __send_request()). If any other messages were present in the osd client's unsent list at that point, they would be sent *after* this new request message. Because those unsent messages have already been registered, their tids would be lower than that of the newly-mapped request message, and sending that message first can violate the tid ordering rule. Rather than sending the new request only, send all queued requests (including the new one) at that point in ceph_osdc_start_request(). This ensures the tid ordering property is preserved. With this in place, all messages should now be sent in tid order regardless of whether they're being sent for the first time or re-sent as a result of a call to osd_reset().
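The hazard is easiest to see with concrete numbers (invented here for illustration): suppose tids 5 and 6 are already sitting on the unsent list when tid 7 is started. Sending only the new request would deliver 7 ahead of 5 and 6; draining the whole queue keeps arrival order monotonic. A trivial user-space sketch:

  #include <stdio.h>

  static void send_one(unsigned long long tid)
  {
          printf("send tid %llu\n", tid);
  }

  int main(void)
  {
          /* tids 5 and 6 were queued earlier; 7 was just registered */
          unsigned long long unsent[] = { 5, 6, 7 };
          int i;

          /* __send_queued() analogue: drain everything, in tid order */
          for (i = 0; i < 3; i++)
                  send_one(unsent[i]);
          return 0;
  }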
This resolves: http://tracker.ceph.com/issues/4392 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 356f7bc4ae4b..3b6657fe99b1 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1819,7 +1819,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, dout("send_request %p no up osds in pg\n", req); ceph_monc_request_next_osdmap(&osdc->client->monc); } else { - __send_request(osdc, req); + __send_queued(osdc); } rc = 0; out_unlock: -- cgit v1.2.3 From 888334f966fab232fe9158c2c2f0a935e356b583 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 25 Mar 2013 11:54:30 -0500 Subject: libceph: initialize data fields on last msg put When the last reference to a ceph message is dropped, ceph_msg_last_put() is called to clean things up. For "normal" messages (allocated via ceph_msg_new() rather than being allocated from a memory pool) it's sufficient to just release resources. But for a mempool-allocated message we actually have to re-initialize the data fields in the message back to initial state so they're ready to go in the event the message gets reused. Some of this was already done; this fleshes it out so it's done more completely. This resolves: http://tracker.ceph.com/issues/4540 Signed-off-by: Alex Elder Reviewed-by: Sage Weil Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 2aecc4896a03..0a9f6362d4d8 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3331,12 +3331,17 @@ void ceph_msg_last_put(struct kref *kref) if (ceph_msg_has_pages(m)) { m->p.length = 0; m->p.pages = NULL; + m->p.type = CEPH_OSD_DATA_TYPE_NONE; } - if (ceph_msg_has_pagelist(m)) { ceph_pagelist_release(m->l.pagelist); kfree(m->l.pagelist); m->l.pagelist = NULL; + m->l.type = CEPH_OSD_DATA_TYPE_NONE; + } + if (ceph_msg_has_bio(m)) { + m->b.bio = NULL; + m->b.type = CEPH_OSD_DATA_TYPE_NONE; } if (m->pool) -- cgit v1.2.3 From 28a89ddece39890c255a0c41baf622731a08c288 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:22 -0500 Subject: libceph: drop pages parameter The value passed for "pages" in read_partial_message_pages() is always the pages pointer from the incoming message, which can be derived inside that function. So just get rid of the parameter.
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 0a9f6362d4d8..95f90b01f753 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2203,10 +2203,11 @@ static int read_partial_message_section(struct ceph_connection *con, static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); static int read_partial_message_pages(struct ceph_connection *con, - struct page **pages, unsigned int data_len, bool do_datacrc) { + struct ceph_msg *msg = con->in_msg; struct ceph_msg_pos *msg_pos = &con->in_msg_pos; + struct page **pages; struct page *page; size_t page_offset; size_t length; @@ -2214,6 +2215,7 @@ static int read_partial_message_pages(struct ceph_connection *con, int ret; /* (page) data */ + pages = msg->p.pages; BUG_ON(pages == NULL); page = pages[msg_pos->page]; page_offset = msg_pos->page_pos; @@ -2285,8 +2287,8 @@ static int read_partial_msg_data(struct ceph_connection *con) data_len = le32_to_cpu(con->in_hdr.data_len); while (msg_pos->data_pos < data_len) { if (ceph_msg_has_pages(msg)) { - ret = read_partial_message_pages(con, msg->p.pages, - data_len, do_datacrc); + ret = read_partial_message_pages(con, data_len, + do_datacrc); if (ret <= 0) return ret; #ifdef CONFIG_BLOCK -- cgit v1.2.3 From 25aff7c559c8b54a810bc094d59fe037cfed6b18 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:22 -0500 Subject: libceph: record residual bytes for all message data types All of the data types can use this, not just the page array. Until now, only the bio type doesn't have it available, and only the initiator of the request (the rbd client) is able to supply the length of the full request without re-scanning the bio list. Change the cursor init routines so the length is supplied based on the message header "data_len" field, and use that length to initialize the "resid" field of the cursor. In addition, change the way "last_piece" is defined so it is based on the residual number of bytes in the original request. This is necessary (at least for bio messages) because it is possible for a read request to succeed without consuming all of the space available in the data buffer.
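The new bookkeeping can be modeled in a few lines. The following is a user-space sketch under an assumed 4 KB page size, not the kernel code: "last_piece" is derived purely from the residual byte count, so a short final piece falls out naturally.

  #include <stdio.h>
  #include <stddef.h>

  #define PAGE_SIZE 4096UL /* assumed for the sketch */

  struct cursor {
          size_t resid;   /* bytes not yet consumed */
          int last_piece; /* now at last piece of data item */
  };

  static void cursor_init(struct cursor *c, size_t length)
  {
          c->resid = length;
          c->last_piece = length <= PAGE_SIZE;
  }

  static void cursor_advance(struct cursor *c, size_t bytes)
  {
          c->resid -= bytes;
          c->last_piece = c->resid <= PAGE_SIZE;
  }

  int main(void)
  {
          struct cursor c;

          cursor_init(&c, 3 * PAGE_SIZE);
          cursor_advance(&c, PAGE_SIZE);
          cursor_advance(&c, PAGE_SIZE);
          printf("resid=%zu last_piece=%d\n", c.resid, c.last_piece);
          /* prints: resid=4096 last_piece=1 */
          return 0;
  }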
This resolves: http://tracker.ceph.com/issues/4427 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger.c | 111 +++++++++++++++++++++++------------------ 2 files changed, 63 insertions(+), 50 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 0e4536cc46f0..459e55280bf8 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -95,6 +95,7 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) } struct ceph_msg_data_cursor { + size_t resid; /* bytes not yet consumed */ bool last_piece; /* now at last piece of data item */ union { #ifdef CONFIG_BLOCK @@ -105,7 +106,6 @@ struct ceph_msg_data_cursor { }; #endif /* CONFIG_BLOCK */ struct { /* pages */ - size_t resid; /* bytes from array */ unsigned int page_offset; /* offset in page */ unsigned short page_index; /* index in array */ unsigned short page_count; /* pages in array */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 95f90b01f753..0ac4f6cb7339 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -745,7 +745,8 @@ static void iter_bio_next(struct bio **bio_iter, unsigned int *seg) * entry in the current bio iovec, or the first entry in the next * bio in the list. */ -static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data *data) +static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data *data, + size_t length) { struct ceph_msg_data_cursor *cursor = &data->cursor; struct bio *bio; @@ -755,12 +756,12 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data *data) bio = data->bio; BUG_ON(!bio); BUG_ON(!bio->bi_vcnt); - /* resid = bio->bi_size */ + cursor->resid = length; cursor->bio = bio; cursor->vector_index = 0; cursor->vector_offset = 0; - cursor->last_piece = !bio->bi_next && bio->bi_vcnt == 1; + cursor->last_piece = length <= bio->bi_io_vec[0].bv_len; } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data *data, @@ -784,8 +785,12 @@ static struct page *ceph_msg_data_bio_next(struct ceph_msg_data *data, BUG_ON(cursor->vector_offset >= bio_vec->bv_len); *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset); BUG_ON(*page_offset >= PAGE_SIZE); - *length = (size_t) (bio_vec->bv_len - cursor->vector_offset); + if (cursor->last_piece) /* pagelist offset is always 0 */ + *length = cursor->resid; + else + *length = (size_t) (bio_vec->bv_len - cursor->vector_offset); BUG_ON(*length > PAGE_SIZE); + BUG_ON(*length > cursor->resid); return bio_vec->bv_page; } @@ -805,26 +810,33 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data *data, size_t bytes) index = cursor->vector_index; BUG_ON(index >= (unsigned int) bio->bi_vcnt); bio_vec = &bio->bi_io_vec[index]; - BUG_ON(cursor->vector_offset + bytes > bio_vec->bv_len); /* Advance the cursor offset */ + BUG_ON(cursor->resid < bytes); + cursor->resid -= bytes; cursor->vector_offset += bytes; if (cursor->vector_offset < bio_vec->bv_len) return false; /* more bytes to process in this segment */ + BUG_ON(cursor->vector_offset != bio_vec->bv_len); /* Move on to the next segment, and possibly the next bio */ - if (++cursor->vector_index == (unsigned int) bio->bi_vcnt) { + if (++index == (unsigned int) bio->bi_vcnt) { bio = bio->bi_next; - cursor->bio = bio; - cursor->vector_index = 0; + index = 0; } + cursor->bio = bio; + cursor->vector_index = index; cursor->vector_offset = 0; - if (!cursor->last_piece && bio && !bio->bi_next) - if (cursor->vector_index 
== (unsigned int) bio->bi_vcnt - 1) + if (!cursor->last_piece) { + BUG_ON(!cursor->resid); + BUG_ON(!bio); + /* A short read is OK, so use <= rather than == */ + if (cursor->resid <= bio->bi_io_vec[index].bv_len) cursor->last_piece = true; + } return true; } @@ -834,7 +846,8 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data *data, size_t bytes) * For a page array, a piece comes from the first page in the array * that has not already been fully consumed. */ -static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data *data) +static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data *data, + size_t length) { struct ceph_msg_data_cursor *cursor = &data->cursor; int page_count; @@ -843,14 +856,15 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data *data) BUG_ON(!data->pages); BUG_ON(!data->length); + BUG_ON(length != data->length); + cursor->resid = length; page_count = calc_pages_for(data->alignment, (u64)data->length); - BUG_ON(page_count > (int) USHRT_MAX); - cursor->resid = data->length; cursor->page_offset = data->alignment & ~PAGE_MASK; cursor->page_index = 0; + BUG_ON(page_count > (int) USHRT_MAX); cursor->page_count = (unsigned short) page_count; - cursor->last_piece = cursor->page_count == 1; + cursor->last_piece = length <= PAGE_SIZE; } static struct page *ceph_msg_data_pages_next(struct ceph_msg_data *data, @@ -863,15 +877,12 @@ static struct page *ceph_msg_data_pages_next(struct ceph_msg_data *data, BUG_ON(cursor->page_index >= cursor->page_count); BUG_ON(cursor->page_offset >= PAGE_SIZE); - BUG_ON(!cursor->resid); *page_offset = cursor->page_offset; - if (cursor->last_piece) { - BUG_ON(*page_offset + cursor->resid > PAGE_SIZE); + if (cursor->last_piece) *length = cursor->resid; - } else { + else *length = PAGE_SIZE - *page_offset; - } return data->pages[cursor->page_index]; } @@ -884,7 +895,6 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data *data, BUG_ON(data->type != CEPH_MSG_DATA_PAGES); BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); - BUG_ON(bytes > cursor->resid); /* Advance the cursor page offset */ @@ -898,7 +908,7 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data *data, BUG_ON(cursor->page_index >= cursor->page_count); cursor->page_offset = 0; cursor->page_index++; - cursor->last_piece = cursor->page_index == cursor->page_count - 1; + cursor->last_piece = cursor->resid <= PAGE_SIZE; return true; } @@ -907,7 +917,8 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data *data, * For a pagelist, a piece is whatever remains to be consumed in the * first page in the list, or the front of the next page. 
*/ -static void ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data *data) +static void ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data *data, + size_t length) { struct ceph_msg_data_cursor *cursor = &data->cursor; struct ceph_pagelist *pagelist; @@ -917,15 +928,18 @@ static void ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data *data) pagelist = data->pagelist; BUG_ON(!pagelist); - if (!pagelist->length) + BUG_ON(length != pagelist->length); + + if (!length) return; /* pagelist can be assigned but empty */ BUG_ON(list_empty(&pagelist->head)); page = list_first_entry(&pagelist->head, struct page, lru); + cursor->resid = length; cursor->page = page; cursor->offset = 0; - cursor->last_piece = pagelist->length <= PAGE_SIZE; + cursor->last_piece = length <= PAGE_SIZE; } static struct page *ceph_msg_data_pagelist_next(struct ceph_msg_data *data, @@ -934,7 +948,6 @@ static struct page *ceph_msg_data_pagelist_next(struct ceph_msg_data *data, { struct ceph_msg_data_cursor *cursor = &data->cursor; struct ceph_pagelist *pagelist; - size_t piece_end; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); @@ -942,18 +955,13 @@ static struct page *ceph_msg_data_pagelist_next(struct ceph_msg_data *data, BUG_ON(!pagelist); BUG_ON(!cursor->page); - BUG_ON(cursor->offset >= pagelist->length); + BUG_ON(cursor->offset + cursor->resid != pagelist->length); - if (cursor->last_piece) { - /* pagelist offset is always 0 */ - piece_end = pagelist->length & ~PAGE_MASK; - if (!piece_end) - piece_end = PAGE_SIZE; - } else { - piece_end = PAGE_SIZE; - } *page_offset = cursor->offset & ~PAGE_MASK; - *length = piece_end - *page_offset; + if (cursor->last_piece) /* pagelist offset is always 0 */ + *length = cursor->resid; + else + *length = PAGE_SIZE - *page_offset; return data->cursor.page; } @@ -968,12 +976,13 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data *data, pagelist = data->pagelist; BUG_ON(!pagelist); - BUG_ON(!cursor->page); - BUG_ON(cursor->offset + bytes > pagelist->length); + + BUG_ON(cursor->offset + cursor->resid != pagelist->length); BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE); /* Advance the cursor offset */ + cursor->resid -= bytes; cursor->offset += bytes; /* pagelist offset is always 0 */ if (!bytes || cursor->offset & ~PAGE_MASK) @@ -983,10 +992,7 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data *data, BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); cursor->page = list_entry_next(cursor->page, lru); - - /* cursor offset is at page boundary; pagelist offset is always 0 */ - if (pagelist->length - cursor->offset <= PAGE_SIZE) - cursor->last_piece = true; + cursor->last_piece = cursor->resid <= PAGE_SIZE; return true; } @@ -999,18 +1005,19 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data *data, * be processed in that piece. It also tracks whether the current * piece is the last one in the data item. 
*/ -static void ceph_msg_data_cursor_init(struct ceph_msg_data *data) +static void ceph_msg_data_cursor_init(struct ceph_msg_data *data, + size_t length) { switch (data->type) { case CEPH_MSG_DATA_PAGELIST: - ceph_msg_data_pagelist_cursor_init(data); + ceph_msg_data_pagelist_cursor_init(data, length); break; case CEPH_MSG_DATA_PAGES: - ceph_msg_data_pages_cursor_init(data); + ceph_msg_data_pages_cursor_init(data, length); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: - ceph_msg_data_bio_cursor_init(data); + ceph_msg_data_bio_cursor_init(data, length); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_NONE: @@ -1064,8 +1071,10 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data *data, */ static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) { + struct ceph_msg_data_cursor *cursor = &data->cursor; bool new_piece; + BUG_ON(bytes > cursor->resid); switch (data->type) { case CEPH_MSG_DATA_PAGELIST: new_piece = ceph_msg_data_pagelist_advance(data, bytes); @@ -1090,8 +1099,12 @@ static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) static void prepare_message_data(struct ceph_msg *msg, struct ceph_msg_pos *msg_pos) { + size_t data_len; + BUG_ON(!msg); - BUG_ON(!msg->hdr.data_len); + + data_len = le32_to_cpu(msg->hdr.data_len); + BUG_ON(!data_len); /* initialize page iterator */ msg_pos->page = 0; @@ -1109,12 +1122,12 @@ static void prepare_message_data(struct ceph_msg *msg, #ifdef CONFIG_BLOCK if (ceph_msg_has_bio(msg)) - ceph_msg_data_cursor_init(&msg->b); + ceph_msg_data_cursor_init(&msg->b, data_len); #endif /* CONFIG_BLOCK */ if (ceph_msg_has_pages(msg)) - ceph_msg_data_cursor_init(&msg->p); + ceph_msg_data_cursor_init(&msg->p, data_len); if (ceph_msg_has_pagelist(msg)) - ceph_msg_data_cursor_init(&msg->l); + ceph_msg_data_cursor_init(&msg->l, data_len); msg_pos->did_page_crc = false; } -- cgit v1.2.3 From 463207aa40cf2cadcae84866b3f85ccaa7022ee8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:23 -0500 Subject: libceph: use cursor for bio reads Replace the use of the information in con->in_msg_pos for incoming bio data. The old in_msg_pos and the new cursor mechanism do basically the same thing, just slightly differently. The main functional difference is that in_msg_pos keeps track of the length of the complete bio list, and assumes it is fully consumed when that many bytes have been transferred. The cursor does not assume a length; it simply consumes all bytes in the bio list. Because the only user of bio data is the rbd client, and because the length of a bio list provided by the rbd client always matches the number of bytes in the list, both ways of tracking length are equivalent. In addition, for in_msg_pos the initial bio vector is selected using the initial value of bio->bi_idx, while the cursor assumes this is zero. Again, the rbd client always passes 0 as the initial index, so the effect is the same. Other than that, they basically match:

    in_msg_pos      cursor
    ----------      ------
    bio_iter        bio
    bio_seg         vec_index
    page_pos        page_offset

The in_msg_pos field is initialized by a call to init_bio_iter(). The bio cursor is initialized by ceph_msg_data_cursor_init(). Both now happen in the same spot, in prepare_message_data(). The in_msg_pos field is advanced by a call to in_msg_pos_next(), which updates page_pos and calls iter_bio_next() to move to the next bio vector, or to the next bio in the list. The cursor is advanced by ceph_msg_data_advance().
That isn't currently happening, so add a call to it in in_msg_pos_next(). Finally, the next piece of data to use for a read is determined by a bunch of lines in read_partial_message_bio(). Those can be replaced by an equivalent ceph_msg_data_bio_next() call. This partially resolves: http://tracker.ceph.com/issues/4428 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 0ac4f6cb7339..c795d46d7d4b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1468,6 +1468,10 @@ static void in_msg_pos_next(struct ceph_connection *con, size_t len, msg_pos->data_pos += received; msg_pos->page_pos += received; +#ifdef CONFIG_BLOCK + if (ceph_msg_has_bio(msg)) + (void) ceph_msg_data_advance(&msg->b, received); +#endif /* CONFIG_BLOCK */ if (received < len) return; @@ -2255,23 +2259,14 @@ static int read_partial_message_bio(struct ceph_connection *con, unsigned int data_len, bool do_datacrc) { struct ceph_msg *msg = con->in_msg; - struct ceph_msg_pos *msg_pos = &con->in_msg_pos; - struct bio_vec *bv; struct page *page; size_t page_offset; size_t length; - unsigned int left; int ret; BUG_ON(!msg); - BUG_ON(!msg->b.bio_iter); - bv = bio_iovec_idx(msg->b.bio_iter, msg->b.bio_seg); - page = bv->bv_page; - page_offset = bv->bv_offset + msg_pos->page_pos; - BUG_ON(msg_pos->data_pos >= data_len); - left = data_len - msg_pos->data_pos; - BUG_ON(msg_pos->page_pos >= bv->bv_len); - length = min_t(unsigned int, bv->bv_len - msg_pos->page_pos, left); + + page = ceph_msg_data_next(&msg->b, &page_offset, &length, NULL); ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); if (ret <= 0) -- cgit v1.2.3 From 6518be47f910f62a98cb6044dbb457af55241f95 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:23 -0500 Subject: libceph: kill ceph message bio_iter, bio_seg The bio_iter and bio_seg fields in a message are no longer used; we use the cursor instead. So get rid of them and the functions that operate on them. This is related to: http://tracker.ceph.com/issues/4428 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 6 +----- net/ceph/messenger.c | 31 ------------------------------- 2 files changed, 1 insertion(+), 36 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 459e55280bf8..252e01b7f7de 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -121,11 +121,7 @@ struct ceph_msg_data { enum ceph_msg_data_type type; union { #ifdef CONFIG_BLOCK - struct { - struct bio *bio_iter; /* iterator */ - struct bio *bio; - unsigned int bio_seg; /* current seg in bio */ - }; + struct bio *bio; #endif /* CONFIG_BLOCK */ struct { struct page **pages; /* NOT OWNER.
*/ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index c795d46d7d4b..b634d2098777 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -716,29 +716,6 @@ static void con_out_kvec_add(struct ceph_connection *con, } #ifdef CONFIG_BLOCK -static void init_bio_iter(struct bio *bio, struct bio **bio_iter, - unsigned int *bio_seg) -{ - if (!bio) { - *bio_iter = NULL; - *bio_seg = 0; - return; - } - *bio_iter = bio; - *bio_seg = (unsigned int) bio->bi_idx; -} - -static void iter_bio_next(struct bio **bio_iter, unsigned int *seg) -{ - if (*bio_iter == NULL) - return; - - BUG_ON(*seg >= (*bio_iter)->bi_vcnt); - - (*seg)++; - if (*seg == (*bio_iter)->bi_vcnt) - init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); -} /* * For a bio data item, a piece is whatever remains of the next @@ -1112,10 +1089,6 @@ static void prepare_message_data(struct ceph_msg *msg, msg_pos->page_pos = msg->p.alignment; else msg_pos->page_pos = 0; -#ifdef CONFIG_BLOCK - if (ceph_msg_has_bio(msg)) - init_bio_iter(msg->b.bio, &msg->b.bio_iter, &msg->b.bio_seg); -#endif msg_pos->data_pos = 0; /* Initialize data cursors */ @@ -1478,10 +1451,6 @@ static void in_msg_pos_next(struct ceph_connection *con, size_t len, BUG_ON(received != len); msg_pos->page_pos = 0; msg_pos->page++; -#ifdef CONFIG_BLOCK - if (msg->b.bio) - iter_bio_next(&msg->b.bio_iter, &msg->b.bio_seg); -#endif /* CONFIG_BLOCK */ } static u32 ceph_crc32c_page(u32 crc, struct page *page, -- cgit v1.2.3 From 878efabd3236abaedd0a4539bbb248ac69fed115 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:23 -0500 Subject: libceph: use cursor for inbound data pages The cursor code for a page array selects the right page, page offset, and length to use for a ceph_tcp_recvpage() call, so we can use it to replace a block in read_partial_message_pages(). 
This partially resolves: http://tracker.ceph.com/issues/4428 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b634d2098777..f81fbce136f8 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1441,8 +1441,10 @@ static void in_msg_pos_next(struct ceph_connection *con, size_t len, msg_pos->data_pos += received; msg_pos->page_pos += received; + if (ceph_msg_has_pages(msg)) + (void) ceph_msg_data_advance(&msg->p, received); #ifdef CONFIG_BLOCK - if (ceph_msg_has_bio(msg)) + else if (ceph_msg_has_bio(msg)) (void) ceph_msg_data_advance(&msg->b, received); #endif /* CONFIG_BLOCK */ if (received < len) @@ -2192,23 +2194,12 @@ static int read_partial_message_pages(struct ceph_connection *con, unsigned int data_len, bool do_datacrc) { struct ceph_msg *msg = con->in_msg; - struct ceph_msg_pos *msg_pos = &con->in_msg_pos; - struct page **pages; struct page *page; size_t page_offset; size_t length; - unsigned int left; int ret; - /* (page) data */ - pages = msg->p.pages; - BUG_ON(pages == NULL); - page = pages[msg_pos->page]; - page_offset = msg_pos->page_pos; - BUG_ON(msg_pos->data_pos >= data_len); - left = data_len - msg_pos->data_pos; - BUG_ON(page_offset >= PAGE_SIZE); - length = min_t(unsigned int, PAGE_SIZE - page_offset, left); + page = ceph_msg_data_next(&msg->p, &page_offset, &length, NULL); ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); if (ret <= 0) -- cgit v1.2.3 From 61fcdc97c06bce7b6d16dd2a6b478f24cd121d96 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:22 -0500 Subject: libceph: no outbound zero data There is handling in write_partial_message_data() for the case where only the length of--and no other information about--the data to be sent has been specified. It uses the zero page as the source of data to send in this case. This case doesn't occur. All message senders set up a page array, pagelist, or bio describing the data to be sent. So eliminate the block of code that handles this (but check and issue a warning for now, just in case it happens for some reason). This resolves: http://tracker.ceph.com/issues/4426 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f81fbce136f8..598d21830417 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1512,13 +1512,10 @@ static int write_partial_message_data(struct ceph_connection *con) &length, &last_piece); #endif } else { - size_t resid = data_len - msg_pos->data_pos; - - page = zero_page; - page_offset = msg_pos->page_pos; - length = PAGE_SIZE - page_offset; - length = min(resid, length); - last_piece = length == resid; + WARN(1, "con %p data_len %u but no outbound data\n", + con, data_len); + ret = -EINVAL; + goto out; } if (do_datacrc && !msg_pos->did_page_crc) { u32 crc = le32_to_cpu(msg->footer.data_crc); -- cgit v1.2.3 From 686be20875db63c6103573565c63db20153ee6e1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:23 -0500 Subject: libceph: get rid of read helpers Now that read_partial_message_pages() and read_partial_message_bio() are literally identical functions we can factor them out. They're pretty simple as well, so just move their relevant content into read_partial_msg_data(). 
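The merged loop has roughly the following shape. This is a user-space model only; next_piece() and recv_page() are hypothetical placeholders for ceph_msg_data_next() and ceph_tcp_recvpage(). One dispatch on the data type selects the next piece, after which the receive, CRC, and advance steps are shared.

  #include <stdio.h>
  #include <stddef.h>

  enum data_type { DATA_PAGES, DATA_BIO };

  /* hypothetical stand-in: pick the next piece for the given data type */
  static char *next_piece(enum data_type type, size_t *off, size_t *len)
  {
          static char piece[4096];

          (void)type; /* the real code dispatches on the message's type */
          *off = 0;
          *len = sizeof(piece);
          return piece;
  }

  /* hypothetical stand-in: pretend the whole piece arrived */
  static int recv_page(char *page, size_t off, size_t len)
  {
          (void)page; (void)off;
          return (int)len;
  }

  int main(void)
  {
          size_t data_len = 8192, pos = 0;

          while (pos < data_len) {
                  size_t off, len;
                  char *page = next_piece(DATA_PAGES, &off, &len);
                  int ret = recv_page(page, off, len);

                  if (ret <= 0)
                          return 1;
                  pos += (size_t)ret; /* CRC and cursor advance shared here */
          }
          printf("consumed %zu bytes\n", pos);
          return 0;
  }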
This and the previous patches together resolve: http://tracker.ceph.com/issues/4428 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 80 ++++++++++++---------------------------------- 1 file changed, 18 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 598d21830417..a19ba00ce777 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2185,66 +2185,15 @@ static int read_partial_message_section(struct ceph_connection *con, return 1; } -static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); - -static int read_partial_message_pages(struct ceph_connection *con, - unsigned int data_len, bool do_datacrc) -{ - struct ceph_msg *msg = con->in_msg; - struct page *page; - size_t page_offset; - size_t length; - int ret; - - page = ceph_msg_data_next(&msg->p, &page_offset, &length, NULL); - - ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); - if (ret <= 0) - return ret; - - if (do_datacrc) - con->in_data_crc = ceph_crc32c_page(con->in_data_crc, page, - page_offset, ret); - - in_msg_pos_next(con, length, ret); - - return ret; -} - -#ifdef CONFIG_BLOCK -static int read_partial_message_bio(struct ceph_connection *con, - unsigned int data_len, bool do_datacrc) -{ - struct ceph_msg *msg = con->in_msg; - struct page *page; - size_t page_offset; - size_t length; - int ret; - - BUG_ON(!msg); - - page = ceph_msg_data_next(&msg->b, &page_offset, &length, NULL); - - ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); - if (ret <= 0) - return ret; - - if (do_datacrc) - con->in_data_crc = ceph_crc32c_page(con->in_data_crc, page, - page_offset, ret); - - in_msg_pos_next(con, length, ret); - - return ret; -} -#endif - static int read_partial_msg_data(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; struct ceph_msg_pos *msg_pos = &con->in_msg_pos; const bool do_datacrc = !con->msgr->nocrc; unsigned int data_len; + struct page *page; + size_t page_offset; + size_t length; int ret; BUG_ON(!msg); @@ -2252,20 +2201,25 @@ static int read_partial_msg_data(struct ceph_connection *con) data_len = le32_to_cpu(con->in_hdr.data_len); while (msg_pos->data_pos < data_len) { if (ceph_msg_has_pages(msg)) { - ret = read_partial_message_pages(con, data_len, - do_datacrc); - if (ret <= 0) - return ret; + page = ceph_msg_data_next(&msg->p, &page_offset, + &length, NULL); #ifdef CONFIG_BLOCK } else if (ceph_msg_has_bio(msg)) { - ret = read_partial_message_bio(con, - data_len, do_datacrc); - if (ret <= 0) - return ret; + page = ceph_msg_data_next(&msg->b, &page_offset, + &length, NULL); #endif } else { BUG_ON(1); } + ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); + if (ret <= 0) + return ret; + + if (do_datacrc) + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + page, page_offset, ret); + + in_msg_pos_next(con, length, ret); } return 1; /* must return > 0 to indicate success */ } /* * read (part of) a message.
*/ +static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); + static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; -- cgit v1.2.3 From 4c59b4a278f9b7a418ad8af933fd7b341df64393 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:23 -0500 Subject: libceph: collapse all data items into one It turns out that only one of the data item types is ever used at any one time in a single message (currently). - A page array is used by the osd client (on behalf of the file system) and by rbd. Only one osd op (and therefore at most one data item) is ever used at a time by rbd. And the only time the file system sends two, the second op contains no data. - A bio is only used by the rbd client (and again, only one data item per message) - A page list is used by the file system and by rbd for outgoing data, but only one op (and one data item) at a time. We can therefore collapse all three of our data item fields into a single field "data", and depend on the messenger code to properly handle it based on its type. This allows us to eliminate quite a bit of duplicated code. This is related to: http://tracker.ceph.com/issues/4429 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 12 +--- net/ceph/messenger.c | 123 +++++++++++++---------------------------- 2 files changed, 40 insertions(+), 95 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 252e01b7f7de..af786b29f7a4 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -64,11 +64,7 @@ struct ceph_messenger { u32 required_features; }; -#define ceph_msg_has_pages(m) ((m)->p.type == CEPH_MSG_DATA_PAGES) -#define ceph_msg_has_pagelist(m) ((m)->l.type == CEPH_MSG_DATA_PAGELIST) -#ifdef CONFIG_BLOCK -#define ceph_msg_has_bio(m) ((m)->b.type == CEPH_MSG_DATA_BIO) -#endif /* CONFIG_BLOCK */ +#define ceph_msg_has_data(m) ((m)->data.type != CEPH_MSG_DATA_NONE) enum ceph_msg_data_type { CEPH_MSG_DATA_NONE, /* message contains no data payload */ @@ -145,11 +141,7 @@ struct ceph_msg { struct ceph_buffer *middle; /* data payload */ - struct ceph_msg_data p; /* pages */ - struct ceph_msg_data l; /* pagelist */ -#ifdef CONFIG_BLOCK - struct ceph_msg_data b; /* bio */ -#endif /* CONFIG_BLOCK */ + struct ceph_msg_data data; struct ceph_connection *con; struct list_head list_head; /* links for connection lists */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index a19ba00ce777..6b5b5c625547 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1085,22 +1085,15 @@ static void prepare_message_data(struct ceph_msg *msg, /* initialize page iterator */ msg_pos->page = 0; - if (ceph_msg_has_pages(msg)) - msg_pos->page_pos = msg->p.alignment; + if (ceph_msg_has_data(msg)) + msg_pos->page_pos = msg->data.alignment; else msg_pos->page_pos = 0; msg_pos->data_pos = 0; - /* Initialize data cursors */ + /* Initialize data cursor */ -#ifdef CONFIG_BLOCK - if (ceph_msg_has_bio(msg)) - ceph_msg_data_cursor_init(&msg->b, data_len); -#endif /* CONFIG_BLOCK */ - if (ceph_msg_has_pages(msg)) - ceph_msg_data_cursor_init(&msg->p, data_len); - if (ceph_msg_has_pagelist(msg)) - ceph_msg_data_cursor_init(&msg->l, data_len); + ceph_msg_data_cursor_init(&msg->data, data_len); msg_pos->did_page_crc = false; } @@ -1166,10 +1159,10 @@ static void prepare_write_message(struct ceph_connection *con) m->needs_out_seq = false; } - dout("prepare_write_message %p seq %lld type %d len 
%d+%d+%d (%zd)\n", + dout("prepare_write_message %p seq %lld type %d len %d+%d+%d\n", m, con->out_seq, le16_to_cpu(m->hdr.type), le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - le32_to_cpu(m->hdr.data_len), m->p.length); + le32_to_cpu(m->hdr.data_len)); BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); /* tag + hdr + front + middle */ @@ -1411,14 +1404,7 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, msg_pos->data_pos += sent; msg_pos->page_pos += sent; - if (ceph_msg_has_pages(msg)) - need_crc = ceph_msg_data_advance(&msg->p, sent); - else if (ceph_msg_has_pagelist(msg)) - need_crc = ceph_msg_data_advance(&msg->l, sent); -#ifdef CONFIG_BLOCK - else if (ceph_msg_has_bio(msg)) - need_crc = ceph_msg_data_advance(&msg->b, sent); -#endif /* CONFIG_BLOCK */ + need_crc = ceph_msg_data_advance(&msg->data, sent); BUG_ON(need_crc && sent != len); if (sent < len) @@ -1441,12 +1427,8 @@ static void in_msg_pos_next(struct ceph_connection *con, size_t len, msg_pos->data_pos += received; msg_pos->page_pos += received; - if (ceph_msg_has_pages(msg)) - (void) ceph_msg_data_advance(&msg->p, received); -#ifdef CONFIG_BLOCK - else if (ceph_msg_has_bio(msg)) - (void) ceph_msg_data_advance(&msg->b, received); -#endif /* CONFIG_BLOCK */ + (void) ceph_msg_data_advance(&msg->data, received); + if (received < len) return; @@ -1486,6 +1468,9 @@ static int write_partial_message_data(struct ceph_connection *con) dout("%s %p msg %p page %d offset %d\n", __func__, con, msg, msg_pos->page, msg_pos->page_pos); + if (WARN_ON(!ceph_msg_has_data(msg))) + return -EINVAL; + /* * Iterate through each page that contains data to be * written, and send as much as possible for each. @@ -1500,23 +1485,8 @@ static int write_partial_message_data(struct ceph_connection *con) size_t length; bool last_piece; - if (ceph_msg_has_pages(msg)) { - page = ceph_msg_data_next(&msg->p, &page_offset, - &length, &last_piece); - } else if (ceph_msg_has_pagelist(msg)) { - page = ceph_msg_data_next(&msg->l, &page_offset, - &length, &last_piece); -#ifdef CONFIG_BLOCK - } else if (ceph_msg_has_bio(msg)) { - page = ceph_msg_data_next(&msg->b, &page_offset, - &length, &last_piece); -#endif - } else { - WARN(1, "con %p data_len %u but no outbound data\n", - con, data_len); - ret = -EINVAL; - goto out; - } + page = ceph_msg_data_next(&msg->data, &page_offset, &length, + &last_piece); if (do_datacrc && !msg_pos->did_page_crc) { u32 crc = le32_to_cpu(msg->footer.data_crc); @@ -2197,20 +2167,13 @@ static int read_partial_msg_data(struct ceph_connection *con) int ret; BUG_ON(!msg); + if (WARN_ON(!ceph_msg_has_data(msg))) + return -EIO; data_len = le32_to_cpu(con->in_hdr.data_len); while (msg_pos->data_pos < data_len) { - if (ceph_msg_has_pages(msg)) { - page = ceph_msg_data_next(&msg->p, &page_offset, - &length, NULL); -#ifdef CONFIG_BLOCK - } else if (ceph_msg_has_bio(msg)) { - page = ceph_msg_data_next(&msg->b, &page_offset, - &length, NULL); -#endif - } else { - BUG_ON(1); - } + page = ceph_msg_data_next(&msg->data, &page_offset, &length, + NULL); ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); if (ret <= 0) return ret; @@ -2218,7 +2181,6 @@ static int read_partial_msg_data(struct ceph_connection *con) if (do_datacrc) con->in_data_crc = ceph_crc32c_page(con->in_data_crc, page, page_offset, ret); - in_msg_pos_next(con, length, ret); } @@ -3043,12 +3005,12 @@ void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, { BUG_ON(!pages); BUG_ON(!length); - BUG_ON(msg->p.type 
!= CEPH_MSG_DATA_NONE); + BUG_ON(msg->data.type != CEPH_MSG_DATA_NONE); - msg->p.type = CEPH_MSG_DATA_PAGES; - msg->p.pages = pages; - msg->p.length = length; - msg->p.alignment = alignment & ~PAGE_MASK; + msg->data.type = CEPH_MSG_DATA_PAGES; + msg->data.pages = pages; + msg->data.length = length; + msg->data.alignment = alignment & ~PAGE_MASK; } EXPORT_SYMBOL(ceph_msg_data_set_pages); @@ -3057,20 +3019,20 @@ void ceph_msg_data_set_pagelist(struct ceph_msg *msg, { BUG_ON(!pagelist); BUG_ON(!pagelist->length); - BUG_ON(msg->l.type != CEPH_MSG_DATA_NONE); + BUG_ON(msg->data.type != CEPH_MSG_DATA_NONE); - msg->l.type = CEPH_MSG_DATA_PAGELIST; - msg->l.pagelist = pagelist; + msg->data.type = CEPH_MSG_DATA_PAGELIST; + msg->data.pagelist = pagelist; } EXPORT_SYMBOL(ceph_msg_data_set_pagelist); void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio) { BUG_ON(!bio); - BUG_ON(msg->b.type != CEPH_MSG_DATA_NONE); + BUG_ON(msg->data.type != CEPH_MSG_DATA_NONE); - msg->b.type = CEPH_MSG_DATA_BIO; - msg->b.bio = bio; + msg->data.type = CEPH_MSG_DATA_BIO; + msg->data.bio = bio; } EXPORT_SYMBOL(ceph_msg_data_set_bio); @@ -3094,9 +3056,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, INIT_LIST_HEAD(&m->list_head); kref_init(&m->kref); - ceph_msg_data_init(&m->p); - ceph_msg_data_init(&m->l); - ceph_msg_data_init(&m->b); + ceph_msg_data_init(&m->data); /* front */ m->front_max = front_len; @@ -3251,20 +3211,13 @@ void ceph_msg_last_put(struct kref *kref) ceph_buffer_put(m->middle); m->middle = NULL; } - if (ceph_msg_has_pages(m)) { - m->p.length = 0; - m->p.pages = NULL; - m->p.type = CEPH_OSD_DATA_TYPE_NONE; - } - if (ceph_msg_has_pagelist(m)) { - ceph_pagelist_release(m->l.pagelist); - kfree(m->l.pagelist); - m->l.pagelist = NULL; - m->l.type = CEPH_OSD_DATA_TYPE_NONE; - } - if (ceph_msg_has_bio(m)) { - m->b.bio = NULL; - m->b.type = CEPH_OSD_DATA_TYPE_NONE; + if (ceph_msg_has_data(m)) { + if (m->data.type == CEPH_MSG_DATA_PAGELIST) { + ceph_pagelist_release(m->data.pagelist); + kfree(m->data.pagelist); + } + memset(&m->data, 0, sizeof m->data); + ceph_msg_data_init(&m->data); } if (m->pool) @@ -3277,7 +3230,7 @@ EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, - msg->front_max, msg->p.length); + msg->front_max, msg->data.length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); -- cgit v1.2.3 From 643c68a4a990612720479078f3450d5b766da9f2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:23 -0500 Subject: libceph: use cursor resid for loop condition Use the "resid" field of a cursor rather than finding when the message data position has moved up to meet the data length to determine when all data has been sent or received in write_partial_message_data() and read_partial_msg_data(). 
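As a standalone illustration (a userspace model, not the kernel code), a loop driven by a remaining-byte count needs no separate position/length bookkeeping:

    #include <stddef.h>
    #include <stdio.h>

    /* Toy cursor: "resid" counts bytes not yet consumed. */
    struct cursor {
            size_t resid;
    };

    /* Consume up to "want" bytes; return how many were consumed. */
    static size_t consume(struct cursor *c, size_t want)
    {
            size_t n = want < c->resid ? want : c->resid;

            c->resid -= n;
            return n;
    }

    int main(void)
    {
            struct cursor c = { .resid = 10000 };

            while (c.resid) {
                    size_t n = consume(&c, 4096);

                    printf("consumed %zu, %zu left\n", n, c.resid);
            }
            return 0;
    }

The kernel loops below have the same shape: the cursor already knows how much remains, so comparing a data position against the total length is redundant.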
This is cleanup of old code related to: http://tracker.ceph.com/issues/4428 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 6b5b5c625547..2fabf006e8f5 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1460,8 +1460,8 @@ static u32 ceph_crc32c_page(u32 crc, struct page *page, static int write_partial_message_data(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; + struct ceph_msg_data_cursor *cursor = &msg->data.cursor; struct ceph_msg_pos *msg_pos = &con->out_msg_pos; - unsigned int data_len = le32_to_cpu(msg->hdr.data_len); bool do_datacrc = !con->msgr->nocrc; int ret; @@ -1479,7 +1479,7 @@ static int write_partial_message_data(struct ceph_connection *con) * need to map the page. If we have no pages, they have * been revoked, so use the zero page. */ - while (data_len > msg_pos->data_pos) { + while (cursor->resid) { struct page *page; size_t page_offset; size_t length; @@ -1489,7 +1489,6 @@ static int write_partial_message_data(struct ceph_connection *con) &last_piece); if (do_datacrc && !msg_pos->did_page_crc) { u32 crc = le32_to_cpu(msg->footer.data_crc); - crc = ceph_crc32c_page(crc, page, page_offset, length); msg->footer.data_crc = cpu_to_le32(crc); msg_pos->did_page_crc = true; @@ -2158,7 +2157,7 @@ static int read_partial_message_section(struct ceph_connection *con, static int read_partial_msg_data(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; - struct ceph_msg_pos *msg_pos = &con->in_msg_pos; + struct ceph_msg_data_cursor *cursor = &msg->data.cursor; const bool do_datacrc = !con->msgr->nocrc; unsigned int data_len; struct page *page; @@ -2171,7 +2170,7 @@ static int read_partial_msg_data(struct ceph_connection *con) return -EIO; data_len = le32_to_cpu(con->in_hdr.data_len); - while (msg_pos->data_pos < data_len) { + while (cursor->resid) { page = ceph_msg_data_next(&msg->data, &page_offset, &length, NULL); ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); -- cgit v1.2.3 From 859a35d5523e8e6a5c3568c12febe2e1270bc3a1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:23 -0500 Subject: libceph: kill most of ceph_msg_pos All but one of the fields in the ceph_msg_pos structure are now never used (only assigned), so get rid of them. This allows several small blocks of code to go away. 
This is cleanup of old code related to: http://tracker.ceph.com/issues/4428 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 2 -- net/ceph/messenger.c | 22 +--------------------- 2 files changed, 1 insertion(+), 23 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index af786b29f7a4..c76b228cb524 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -157,8 +157,6 @@ struct ceph_msg { }; struct ceph_msg_pos { - int page, page_pos; /* which page; offset in page */ - int data_pos; /* offset in data payload */ bool did_page_crc; /* true if we've calculated crc for current page */ }; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 2fabf006e8f5..19f9fffc170c 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1083,14 +1083,6 @@ static void prepare_message_data(struct ceph_msg *msg, data_len = le32_to_cpu(msg->hdr.data_len); BUG_ON(!data_len); - /* initialize page iterator */ - msg_pos->page = 0; - if (ceph_msg_has_data(msg)) - msg_pos->page_pos = msg->data.alignment; - else - msg_pos->page_pos = 0; - msg_pos->data_pos = 0; - /* Initialize data cursor */ ceph_msg_data_cursor_init(&msg->data, data_len); @@ -1402,8 +1394,6 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, BUG_ON(!msg); BUG_ON(!sent); - msg_pos->data_pos += sent; - msg_pos->page_pos += sent; need_crc = ceph_msg_data_advance(&msg->data, sent); BUG_ON(need_crc && sent != len); @@ -1411,8 +1401,6 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, return; BUG_ON(sent != len); - msg_pos->page_pos = 0; - msg_pos->page++; msg_pos->did_page_crc = false; } @@ -1420,21 +1408,16 @@ static void in_msg_pos_next(struct ceph_connection *con, size_t len, size_t received) { struct ceph_msg *msg = con->in_msg; - struct ceph_msg_pos *msg_pos = &con->in_msg_pos; BUG_ON(!msg); BUG_ON(!received); - msg_pos->data_pos += received; - msg_pos->page_pos += received; (void) ceph_msg_data_advance(&msg->data, received); if (received < len) return; BUG_ON(received != len); - msg_pos->page_pos = 0; - msg_pos->page++; } static u32 ceph_crc32c_page(u32 crc, struct page *page, @@ -1465,8 +1448,7 @@ static int write_partial_message_data(struct ceph_connection *con) bool do_datacrc = !con->msgr->nocrc; int ret; - dout("%s %p msg %p page %d offset %d\n", __func__, - con, msg, msg_pos->page, msg_pos->page_pos); + dout("%s %p msg %p\n", __func__, con, msg); if (WARN_ON(!ceph_msg_has_data(msg))) return -EINVAL; @@ -2159,7 +2141,6 @@ static int read_partial_msg_data(struct ceph_connection *con) struct ceph_msg *msg = con->in_msg; struct ceph_msg_data_cursor *cursor = &msg->data.cursor; const bool do_datacrc = !con->msgr->nocrc; - unsigned int data_len; struct page *page; size_t page_offset; size_t length; @@ -2169,7 +2150,6 @@ static int read_partial_msg_data(struct ceph_connection *con) if (WARN_ON(!ceph_msg_has_data(msg))) return -EIO; - data_len = le32_to_cpu(con->in_hdr.data_len); while (cursor->resid) { page = ceph_msg_data_next(&msg->data, &page_offset, &length, NULL); -- cgit v1.2.3 From f5db90bcf2c69d099f9d828a8104796f41de6bc5 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:23 -0500 Subject: libceph: kill last of ceph_msg_pos The only remaining field in the ceph_msg_pos structure is did_page_crc. In the new cursor model of things that flag (or something like it) belongs in the cursor. 
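The intended behavior can be sketched as a self-contained toy model (userspace C, with a fake crc update standing in for crc32c; the real flag is introduced just below):

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int crc = 0;
            bool need_crc = true;
            int piece, pass;

            /* Three pieces, each taking two partial sends; the crc for
             * a piece is folded in only on the pass that starts it. */
            for (piece = 1; piece <= 3; piece++) {
                    for (pass = 0; pass < 2; pass++) {
                            if (need_crc) {
                                    crc += piece;   /* fake crc update */
                                    need_crc = false;
                            }
                            /* ... partial send of this piece ... */
                    }
                    need_crc = true;        /* advanced to a new piece */
            }
            printf("crc contributions: %u\n", crc); /* 6, not 12 */
            return 0;
    }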
Define a new field "need_crc" in the cursor (which applies to all types of data) and initialize it to true whenever a cursor is initialized. In write_partial_message_data(), the data CRC still will be computed as before, but it will check the cursor->need_crc field to determine whether it's needed. Any time the cursor is advanced to a new piece of a data item, need_crc will be set, and this will cause the crc for that entire piece to be accumulated into the data crc. In write_partial_message_data() the intermediate crc value is now held in a local variable so it doesn't have to be byte-swapped so many times. In read_partial_msg_data() we do something similar (but mainly for consistency there). With that, the ceph_msg_pos structure can go away, and it no longer needs to be passed as an argument to prepare_message_data(). This cleanup is related to: http://tracker.ceph.com/issues/4428 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 7 +----- net/ceph/messenger.c | 56 +++++++++++++++++++++++------------------- 2 files changed, 32 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index c76b228cb524..686df5bfa717 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -93,6 +93,7 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) struct ceph_msg_data_cursor { size_t resid; /* bytes not yet consumed */ bool last_piece; /* now at last piece of data item */ + bool need_crc; /* new piece; crc update needed */ union { #ifdef CONFIG_BLOCK struct { /* bio */ @@ -156,10 +157,6 @@ struct ceph_msg { struct ceph_msgpool *pool; }; -struct ceph_msg_pos { - bool did_page_crc; /* true if we've calculated crc for current page */ -}; - /* ceph connection fault delay defaults, for exponential backoff */ #define BASE_DELAY_INTERVAL (HZ/2) #define MAX_DELAY_INTERVAL (5 * 60 * HZ) @@ -217,7 +214,6 @@ struct ceph_connection { struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ bool out_msg_done; - struct ceph_msg_pos out_msg_pos; struct kvec out_kvec[8], /* sending header/footer data */ *out_kvec_cur; @@ -231,7 +227,6 @@ struct ceph_connection { /* message in temps */ struct ceph_msg_header in_hdr; struct ceph_msg *in_msg; - struct ceph_msg_pos in_msg_pos; u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ char in_tag; /* protocol control byte */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 19f9fffc170c..eee7a878dbfb 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1002,6 +1002,7 @@ static void ceph_msg_data_cursor_init(struct ceph_msg_data *data, /* BUG(); */ break; } + data->cursor.need_crc = true; } /* @@ -1069,12 +1070,12 @@ static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) BUG(); break; } + data->cursor.need_crc = new_piece; return new_piece; } -static void prepare_message_data(struct ceph_msg *msg, - struct ceph_msg_pos *msg_pos) +static void prepare_message_data(struct ceph_msg *msg) { size_t data_len; @@ -1086,8 +1087,6 @@ static void prepare_message_data(struct ceph_msg *msg, /* Initialize data cursor */ ceph_msg_data_cursor_init(&msg->data, data_len); - - msg_pos->did_page_crc = false; } /* @@ -1186,7 +1185,7 @@ static void prepare_write_message(struct ceph_connection *con) /* is there a data payload? 
*/ con->out_msg->footer.data_crc = 0; if (m->hdr.data_len) { - prepare_message_data(con->out_msg, &con->out_msg_pos); + prepare_message_data(con->out_msg); con->out_more = 1; /* data + footer will follow */ } else { /* no, queue up footer too and be done */ @@ -1388,8 +1387,7 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, size_t len, size_t sent) { struct ceph_msg *msg = con->out_msg; - struct ceph_msg_pos *msg_pos = &con->out_msg_pos; - bool need_crc = false; + bool need_crc; BUG_ON(!msg); BUG_ON(!sent); @@ -1401,7 +1399,6 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, return; BUG_ON(sent != len); - msg_pos->did_page_crc = false; } static void in_msg_pos_next(struct ceph_connection *con, size_t len, @@ -1444,9 +1441,8 @@ static int write_partial_message_data(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; struct ceph_msg_data_cursor *cursor = &msg->data.cursor; - struct ceph_msg_pos *msg_pos = &con->out_msg_pos; bool do_datacrc = !con->msgr->nocrc; - int ret; + u32 crc; dout("%s %p msg %p\n", __func__, con, msg); @@ -1461,38 +1457,40 @@ static int write_partial_message_data(struct ceph_connection *con) * need to map the page. If we have no pages, they have * been revoked, so use the zero page. */ + crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0; while (cursor->resid) { struct page *page; size_t page_offset; size_t length; bool last_piece; + int ret; page = ceph_msg_data_next(&msg->data, &page_offset, &length, &last_piece); - if (do_datacrc && !msg_pos->did_page_crc) { - u32 crc = le32_to_cpu(msg->footer.data_crc); + if (do_datacrc && cursor->need_crc) crc = ceph_crc32c_page(crc, page, page_offset, length); - msg->footer.data_crc = cpu_to_le32(crc); - msg_pos->did_page_crc = true; - } ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, last_piece); - if (ret <= 0) - goto out; + if (ret <= 0) { + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + return ret; + } out_msg_pos_next(con, page, length, (size_t) ret); } dout("%s %p msg %p done\n", __func__, con, msg); /* prepare and queue up footer, too */ - if (!do_datacrc) + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + else msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; con_out_kvec_reset(con); prepare_write_message_footer(con); - ret = 1; -out: - return ret; + + return 1; /* must return > 0 to indicate success */ } /* @@ -2144,24 +2142,32 @@ static int read_partial_msg_data(struct ceph_connection *con) struct page *page; size_t page_offset; size_t length; + u32 crc = 0; int ret; BUG_ON(!msg); if (WARN_ON(!ceph_msg_has_data(msg))) return -EIO; + if (do_datacrc) + crc = con->in_data_crc; while (cursor->resid) { page = ceph_msg_data_next(&msg->data, &page_offset, &length, NULL); ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); - if (ret <= 0) + if (ret <= 0) { + if (do_datacrc) + con->in_data_crc = crc; + return ret; + } if (do_datacrc) - con->in_data_crc = ceph_crc32c_page(con->in_data_crc, - page, page_offset, ret); + crc = ceph_crc32c_page(crc, page, page_offset, ret); in_msg_pos_next(con, length, ret); } + if (do_datacrc) + con->in_data_crc = crc; return 1; /* must return > 0 to indicate success */ } @@ -2257,7 +2263,7 @@ static int read_partial_message(struct ceph_connection *con) /* prepare for data payload, if any */ if (data_len) - prepare_message_data(con->in_msg, &con->in_msg_pos); + prepare_message_data(con->in_msg); } /* front */ -- cgit v1.2.3 From 143334ff446d634fcd3145919b5cddcc9148a74a Mon 
Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 29 Mar 2013 11:44:10 -0500 Subject: libceph: don't add to crc unless data sent In write_partial_message_data() we aggregate the crc for the data portion of the message as each new piece of the data item is encountered. Because it was computed *before* sending the data, if an attempt to send a new piece resulted in 0 bytes being sent, the crc across that piece would erroneously get computed again and added to the aggregate result. This would occasionally happen in the event of a connection failure. The crc value isn't really needed until the complete value is known after sending all data, so there's no need to compute it before sending. So don't calculate the crc for a piece until *after* we know at least one byte of it has been sent. That will avoid this problem. This resolves: http://tracker.ceph.com/issues/4450 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index eee7a878dbfb..cb8b571ce79a 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1467,8 +1467,6 @@ static int write_partial_message_data(struct ceph_connection *con) page = ceph_msg_data_next(&msg->data, &page_offset, &length, &last_piece); - if (do_datacrc && cursor->need_crc) - crc = ceph_crc32c_page(crc, page, page_offset, length); ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, last_piece); if (ret <= 0) { @@ -1477,6 +1475,8 @@ static int write_partial_message_data(struct ceph_connection *con) return ret; } + if (do_datacrc && cursor->need_crc) + crc = ceph_crc32c_page(crc, page, page_offset, length); out_msg_pos_next(con, page, length, (size_t) ret); } -- cgit v1.2.3 From 8ea299bcbc85aeaf5348d99614b35433287bec6b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:23 -0500 Subject: libceph: use only ceph_msg_data_advance() The *_msg_pos_next() functions do little more than call ceph_msg_data_advance(). Replace those wrapper functions with a simple call to ceph_msg_data_advance(). This cleanup is related to: http://tracker.ceph.com/issues/4428 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 39 +++------------------------------------ 1 file changed, 3 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index cb8b571ce79a..dd4b8226a48a 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1383,40 +1383,6 @@ out: return ret; /* done!
*/ } -static void out_msg_pos_next(struct ceph_connection *con, struct page *page, - size_t len, size_t sent) -{ - struct ceph_msg *msg = con->out_msg; - bool need_crc; - - BUG_ON(!msg); - BUG_ON(!sent); - - need_crc = ceph_msg_data_advance(&msg->data, sent); - BUG_ON(need_crc && sent != len); - - if (sent < len) - return; - - BUG_ON(sent != len); -} - -static void in_msg_pos_next(struct ceph_connection *con, size_t len, - size_t received) -{ - struct ceph_msg *msg = con->in_msg; - - BUG_ON(!msg); - BUG_ON(!received); - - (void) ceph_msg_data_advance(&msg->data, received); - - if (received < len) - return; - - BUG_ON(received != len); -} - static u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, unsigned int length) @@ -1463,6 +1429,7 @@ static int write_partial_message_data(struct ceph_connection *con) size_t page_offset; size_t length; bool last_piece; + bool need_crc; int ret; page = ceph_msg_data_next(&msg->data, &page_offset, &length, @@ -1477,7 +1444,7 @@ static int write_partial_message_data(struct ceph_connection *con) } if (do_datacrc && cursor->need_crc) crc = ceph_crc32c_page(crc, page, page_offset, length); - out_msg_pos_next(con, page, length, (size_t) ret); + need_crc = ceph_msg_data_advance(&msg->data, (size_t) ret); } dout("%s %p msg %p done\n", __func__, con, msg); @@ -2164,7 +2131,7 @@ static int read_partial_msg_data(struct ceph_connection *con) if (do_datacrc) crc = ceph_crc32c_page(crc, page, page_offset, ret); - in_msg_pos_next(con, length, ret); + (void) ceph_msg_data_advance(&msg->data, (size_t) ret); } if (do_datacrc) con->in_data_crc = crc; -- cgit v1.2.3 From 6644ed7b7e04f8e588aebdaa58cededb9416ab95 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:24 -0500 Subject: libceph: make message data be a pointer Begin the transition from a single message data item to a list of them by replacing the "data" structure in a message with a pointer to a ceph_msg_data structure. A null pointer will indicate the message has no data; replace the use of ceph_msg_has_data() with a simple check for a null pointer. Create functions ceph_msg_data_create() and ceph_msg_data_destroy() to dynamically allocate and free a data item structure of a given type. When a message has its data item "set," allocate one of these to hold the data description, and free it when the last reference to the message is dropped. 
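With the pointer in place, attaching data to a message follows a create/attach pattern, excerpted from this patch (pages case shown; pagelist and bio are analogous):

    data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES);
    BUG_ON(!data);
    data->pages = pages;
    data->length = length;
    data->alignment = alignment & ~PAGE_MASK;

    msg->data = data;

and the teardown on the last reference drop is simply:

    ceph_msg_data_destroy(m->data);
    m->data = NULL;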
This partially resolves: http://tracker.ceph.com/issues/4429 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 5 +-- net/ceph/messenger.c | 94 +++++++++++++++++++++++++++--------------- 2 files changed, 62 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 686df5bfa717..3181321bed6d 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -64,8 +64,6 @@ struct ceph_messenger { u32 required_features; }; -#define ceph_msg_has_data(m) ((m)->data.type != CEPH_MSG_DATA_NONE) - enum ceph_msg_data_type { CEPH_MSG_DATA_NONE, /* message contains no data payload */ CEPH_MSG_DATA_PAGES, /* data source/destination is a page array */ @@ -141,8 +139,7 @@ struct ceph_msg { struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; - /* data payload */ - struct ceph_msg_data data; + struct ceph_msg_data *data; /* data payload */ struct ceph_connection *con; struct list_head list_head; /* links for connection lists */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index dd4b8226a48a..d4e46d8a088c 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1086,7 +1086,7 @@ static void prepare_message_data(struct ceph_msg *msg) /* Initialize data cursor */ - ceph_msg_data_cursor_init(&msg->data, data_len); + ceph_msg_data_cursor_init(msg->data, data_len); } /* @@ -1406,13 +1406,13 @@ static u32 ceph_crc32c_page(u32 crc, struct page *page, static int write_partial_message_data(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; - struct ceph_msg_data_cursor *cursor = &msg->data.cursor; + struct ceph_msg_data_cursor *cursor = &msg->data->cursor; bool do_datacrc = !con->msgr->nocrc; u32 crc; dout("%s %p msg %p\n", __func__, con, msg); - if (WARN_ON(!ceph_msg_has_data(msg))) + if (WARN_ON(!msg->data)) return -EINVAL; /* @@ -1432,7 +1432,7 @@ static int write_partial_message_data(struct ceph_connection *con) bool need_crc; int ret; - page = ceph_msg_data_next(&msg->data, &page_offset, &length, + page = ceph_msg_data_next(msg->data, &page_offset, &length, &last_piece); ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, last_piece); @@ -1444,7 +1444,7 @@ static int write_partial_message_data(struct ceph_connection *con) } if (do_datacrc && cursor->need_crc) crc = ceph_crc32c_page(crc, page, page_offset, length); - need_crc = ceph_msg_data_advance(&msg->data, (size_t) ret); + need_crc = ceph_msg_data_advance(msg->data, (size_t)ret); } dout("%s %p msg %p done\n", __func__, con, msg); @@ -2104,7 +2104,7 @@ static int read_partial_message_section(struct ceph_connection *con, static int read_partial_msg_data(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; - struct ceph_msg_data_cursor *cursor = &msg->data.cursor; + struct ceph_msg_data_cursor *cursor = &msg->data->cursor; const bool do_datacrc = !con->msgr->nocrc; struct page *page; size_t page_offset; @@ -2113,13 +2113,13 @@ static int read_partial_msg_data(struct ceph_connection *con) int ret; BUG_ON(!msg); - if (WARN_ON(!ceph_msg_has_data(msg))) + if (!msg->data) return -EIO; if (do_datacrc) crc = con->in_data_crc; while (cursor->resid) { - page = ceph_msg_data_next(&msg->data, &page_offset, &length, + page = ceph_msg_data_next(msg->data, &page_offset, &length, NULL); ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); if (ret <= 0) { @@ -2131,7 +2131,7 @@ static int read_partial_msg_data(struct ceph_connection *con) if 
(do_datacrc) crc = ceph_crc32c_page(crc, page, page_offset, ret); - (void) ceph_msg_data_advance(&msg->data, (size_t) ret); + (void) ceph_msg_data_advance(msg->data, (size_t)ret); } if (do_datacrc) con->in_data_crc = crc; @@ -2947,44 +2947,80 @@ void ceph_con_keepalive(struct ceph_connection *con) } EXPORT_SYMBOL(ceph_con_keepalive); -static void ceph_msg_data_init(struct ceph_msg_data *data) +static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) { - data->type = CEPH_MSG_DATA_NONE; + struct ceph_msg_data *data; + + if (WARN_ON(!ceph_msg_data_type_valid(type))) + return NULL; + + data = kzalloc(sizeof (*data), GFP_NOFS); + if (data) + data->type = type; + + return data; +} + +static void ceph_msg_data_destroy(struct ceph_msg_data *data) +{ + if (!data) + return; + + if (data->type == CEPH_MSG_DATA_PAGELIST) { + ceph_pagelist_release(data->pagelist); + kfree(data->pagelist); + } + kfree(data); } void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment) { + struct ceph_msg_data *data; + BUG_ON(!pages); BUG_ON(!length); - BUG_ON(msg->data.type != CEPH_MSG_DATA_NONE); + BUG_ON(msg->data != NULL); + + data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); + BUG_ON(!data); + data->pages = pages; + data->length = length; + data->alignment = alignment & ~PAGE_MASK; - msg->data.type = CEPH_MSG_DATA_PAGES; - msg->data.pages = pages; - msg->data.length = length; - msg->data.alignment = alignment & ~PAGE_MASK; + msg->data = data; } EXPORT_SYMBOL(ceph_msg_data_set_pages); void ceph_msg_data_set_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist) { + struct ceph_msg_data *data; + BUG_ON(!pagelist); BUG_ON(!pagelist->length); - BUG_ON(msg->data.type != CEPH_MSG_DATA_NONE); + BUG_ON(msg->data != NULL); - msg->data.type = CEPH_MSG_DATA_PAGELIST; - msg->data.pagelist = pagelist; + data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); + BUG_ON(!data); + data->pagelist = pagelist; + + msg->data = data; } EXPORT_SYMBOL(ceph_msg_data_set_pagelist); void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio) { + struct ceph_msg_data *data; + BUG_ON(!bio); - BUG_ON(msg->data.type != CEPH_MSG_DATA_NONE); + BUG_ON(msg->data != NULL); - msg->data.type = CEPH_MSG_DATA_BIO; - msg->data.bio = bio; + data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); + BUG_ON(!data); + data->bio = bio; + + msg->data = data; } EXPORT_SYMBOL(ceph_msg_data_set_bio); @@ -3008,8 +3044,6 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, INIT_LIST_HEAD(&m->list_head); kref_init(&m->kref); - ceph_msg_data_init(&m->data); - /* front */ m->front_max = front_len; if (front_len) { @@ -3163,14 +3197,8 @@ void ceph_msg_last_put(struct kref *kref) ceph_buffer_put(m->middle); m->middle = NULL; } - if (ceph_msg_has_data(m)) { - if (m->data.type == CEPH_MSG_DATA_PAGELIST) { - ceph_pagelist_release(m->data.pagelist); - kfree(m->data.pagelist); - } - memset(&m->data, 0, sizeof m->data); - ceph_msg_data_init(&m->data); - } + ceph_msg_data_destroy(m->data); + m->data = NULL; if (m->pool) ceph_msgpool_put(m->pool, m); @@ -3182,7 +3210,7 @@ EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, - msg->front_max, msg->data.length); + msg->front_max, msg->data->length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); -- cgit v1.2.3 From 1190bf06a6b033384a65b5acdb1193d41cd257a6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 30 
Mar 2013 13:31:02 -0500 Subject: libceph: fix broken data length assertions It's OK for the result of a read to come back with fewer bytes than were requested. So don't trigger a BUG() in that case when initializing the data cursor. This resolves the first problem described in: http://tracker.ceph.com/issues/4598 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d4e46d8a088c..24f3aba34800 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -833,7 +833,7 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data *data, BUG_ON(!data->pages); BUG_ON(!data->length); - BUG_ON(length != data->length); + BUG_ON(length > data->length); /* short reads are OK */ cursor->resid = length; page_count = calc_pages_for(data->alignment, (u64)data->length); @@ -905,7 +905,7 @@ static void ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data *data, pagelist = data->pagelist; BUG_ON(!pagelist); - BUG_ON(length != pagelist->length); + BUG_ON(length > pagelist->length); /* short reads are OK */ if (!length) return; /* pagelist can be assigned but empty */ -- cgit v1.2.3 From 5df521b1eecf276c4bae8ffb7945acef45530449 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 30 Mar 2013 15:09:59 -0500 Subject: libceph: page offset must be less than page size Currently ceph_msg_data_pages_advance() allows the page offset value to be PAGE_SIZE, apparently assuming ceph_msg_data_pages_next() will treat it as 0. But that doesn't happen, and the result led to a helpful assertion failure. Change ceph_msg_data_pages_advance() to truncate the offset to 0 before returning if it reaches PAGE_SIZE. Make a few other minor adjustments in this area (comments and a better assertion) while modifying it. 
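The masked advance can be demonstrated in isolation (a userspace sketch assuming 4096-byte pages, mirroring the expression used in the fix below):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    int main(void)
    {
            unsigned long page_offset = 4000;
            unsigned long bytes = 96;

            /* (4000 + 96) & ~PAGE_MASK wraps 4096 back to 0, so the
             * offset can never escape the page. */
            page_offset = (page_offset + bytes) & ~PAGE_MASK;
            printf("offset after advance: %lu\n", page_offset);
            return 0;
    }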
This resolves a second issue described in: http://tracker.ceph.com/issues/4598 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 24f3aba34800..198b9026288e 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -766,8 +766,8 @@ static struct page *ceph_msg_data_bio_next(struct ceph_msg_data *data, *length = cursor->resid; else *length = (size_t) (bio_vec->bv_len - cursor->vector_offset); - BUG_ON(*length > PAGE_SIZE); BUG_ON(*length > cursor->resid); + BUG_ON(*page_offset + *length > PAGE_SIZE); return bio_vec->bv_page; } @@ -876,14 +876,13 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data *data, /* Advance the cursor page offset */ cursor->resid -= bytes; - cursor->page_offset += bytes; - if (!bytes || cursor->page_offset & ~PAGE_MASK) + cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK; + if (!bytes || cursor->page_offset) return false; /* more bytes to process in the current page */ - /* Move on to the next page */ + /* Move on to the next page; offset is already at 0 */ BUG_ON(cursor->page_index >= cursor->page_count); - cursor->page_offset = 0; cursor->page_index++; cursor->last_piece = cursor->resid <= PAGE_SIZE; @@ -934,8 +933,9 @@ static struct page *ceph_msg_data_pagelist_next(struct ceph_msg_data *data, BUG_ON(!cursor->page); BUG_ON(cursor->offset + cursor->resid != pagelist->length); + /* offset of first page in pagelist is always 0 */ *page_offset = cursor->offset & ~PAGE_MASK; - if (cursor->last_piece) /* pagelist offset is always 0 */ + if (cursor->last_piece) *length = cursor->resid; else *length = PAGE_SIZE - *page_offset; @@ -961,7 +961,7 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data *data, cursor->resid -= bytes; cursor->offset += bytes; - /* pagelist offset is always 0 */ + /* offset of first page in pagelist is always 0 */ if (!bytes || cursor->offset & ~PAGE_MASK) return false; /* more bytes to process in the current page */ -- cgit v1.2.3 From 56fc5659162965ce3018a34c6bb8a022f3a3b33c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 30 Mar 2013 23:46:55 -0500 Subject: libceph: account for alignment in pages cursor When a cursor for a page array data message is initialized it needs to determine the initial value for cursor->last_piece. Currently it just checks if length is less than a page, but that's not correct. The data in the first page in the array will be offset by a page offset based on the alignment recorded for the data. (All pages thereafter will be aligned at the base of the page, so there's no need to account for this except for the first page.) Because this was wrong, there was a case where the length of a piece would be calculated as all of the residual bytes in the message and that plus the page offset could exceed the length of a page. So fix this case. Make sure the sum won't wrap. 
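A worked example with made-up numbers shows why the page offset matters (userspace sketch, 4096-byte pages):

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
            unsigned long page_offset = 3072;  /* first-page alignment */
            unsigned long length = 2048;       /* total data length */

            bool old_last_piece = length <= PAGE_SIZE;
            bool new_last_piece = page_offset + length <= PAGE_SIZE;

            /* old: 1 (wrong); new: 0 -- the data spills into a second
             * page even though its length fits within one page. */
            printf("old=%d new=%d\n", old_last_piece, new_last_piece);
            return 0;
    }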
This resolves a third issue described in: http://tracker.ceph.com/issues/4598 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 198b9026288e..ee160864e8ea 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -839,9 +839,10 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data *data, page_count = calc_pages_for(data->alignment, (u64)data->length); cursor->page_offset = data->alignment & ~PAGE_MASK; cursor->page_index = 0; - BUG_ON(page_count > (int) USHRT_MAX); - cursor->page_count = (unsigned short) page_count; - cursor->last_piece = length <= PAGE_SIZE; + BUG_ON(page_count > (int)USHRT_MAX); + cursor->page_count = (unsigned short)page_count; + BUG_ON(length > SIZE_MAX - cursor->page_offset); + cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE; } static struct page *ceph_msg_data_pages_next(struct ceph_msg_data *data, -- cgit v1.2.3 From 0baa1bd9b6da7161dc1773b1dfce3adfd37d675f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 29 Mar 2013 14:28:03 -0500 Subject: libceph: be explicit in masking bottom 16 bits In ceph_osdc_build_request() there is a call to cpu_to_le16() which provides a 64-bit value as its argument. Because of the implied byte swapping going on it looked pretty suspect to me. At the moment it turns out the behavior is well defined, but masking off those bottom bits explicitly eliminates this distraction, and is in fact more directly related to the purpose of the message header's data_off field. This resolves: http://tracker.ceph.com/issues/4125 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3b6657fe99b1..015bf9f64da7 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -419,8 +419,18 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, p += 4; /* data */ - if (flags & CEPH_OSD_FLAG_WRITE) - req->r_request->hdr.data_off = cpu_to_le16(off); + if (flags & CEPH_OSD_FLAG_WRITE) { + u16 data_off; + + /* + * The header "data_off" is a hint to the receiver + * allowing it to align received data into its + * buffers such that there's no need to re-copy + * it before writing it to disk (direct I/O). + */ + data_off = (u16) (off & 0xffff); + req->r_request->hdr.data_off = cpu_to_le16(data_off); + } req->r_request->hdr.data_len = cpu_to_le32(data_len); BUG_ON(p > msg->front.iov_base + msg->front.iov_len); -- cgit v1.2.3 From a8dd0a37bc016cfb3ac75cf8484428573bb8d862 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 13 Mar 2013 20:50:00 -0500 Subject: libceph: define osd_req_opcode_valid() Define a separate function to determine the validity of an opcode, and use it inside osd_req_encode_op() in order to unclutter that function. Don't update the destination op at all--and return zero--if an unsupported or unrecognized opcode is seen in osd_req_encode_op(). 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 126 ++++++++++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 015bf9f64da7..4e5c0438ea35 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -220,70 +220,24 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_alloc_request); -static u64 osd_req_encode_op(struct ceph_osd_request *req, - struct ceph_osd_op *dst, - struct ceph_osd_req_op *src) +static bool osd_req_opcode_valid(u16 opcode) { - u64 out_data_len = 0; - struct ceph_pagelist *pagelist; - - dst->op = cpu_to_le16(src->op); - - switch (src->op) { - case CEPH_OSD_OP_STAT: - break; + switch (opcode) { case CEPH_OSD_OP_READ: - case CEPH_OSD_OP_WRITE: - if (src->op == CEPH_OSD_OP_WRITE) - out_data_len = src->extent.length; - dst->extent.offset = cpu_to_le64(src->extent.offset); - dst->extent.length = cpu_to_le64(src->extent.length); - dst->extent.truncate_size = - cpu_to_le64(src->extent.truncate_size); - dst->extent.truncate_seq = - cpu_to_le32(src->extent.truncate_seq); - break; - case CEPH_OSD_OP_CALL: - pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); - BUG_ON(!pagelist); - ceph_pagelist_init(pagelist); - - dst->cls.class_len = src->cls.class_len; - dst->cls.method_len = src->cls.method_len; - dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); - ceph_pagelist_append(pagelist, src->cls.class_name, - src->cls.class_len); - ceph_pagelist_append(pagelist, src->cls.method_name, - src->cls.method_len); - ceph_pagelist_append(pagelist, src->cls.indata, - src->cls.indata_len); - - req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGELIST; - req->r_data_out.pagelist = pagelist; - out_data_len = pagelist->length; - break; - case CEPH_OSD_OP_STARTSYNC: - break; - case CEPH_OSD_OP_NOTIFY_ACK: - case CEPH_OSD_OP_WATCH: - dst->watch.cookie = cpu_to_le64(src->watch.cookie); - dst->watch.ver = cpu_to_le64(src->watch.ver); - dst->watch.flag = src->watch.flag; - break; - default: - pr_err("unrecognized osd opcode %d\n", src->op); - WARN_ON(1); - break; + case CEPH_OSD_OP_STAT: case CEPH_OSD_OP_MAPEXT: case CEPH_OSD_OP_MASKTRUNC: case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_NOTIFY: + case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_ASSERT_VER: + case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_TRUNCATE: case CEPH_OSD_OP_ZERO: case CEPH_OSD_OP_DELETE: case CEPH_OSD_OP_APPEND: + case CEPH_OSD_OP_STARTSYNC: case CEPH_OSD_OP_SETTRUNC: case CEPH_OSD_OP_TRIMTRUNC: case CEPH_OSD_OP_TMAPUP: @@ -291,11 +245,11 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, case CEPH_OSD_OP_TMAPGET: case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_ROLLBACK: + case CEPH_OSD_OP_WATCH: case CEPH_OSD_OP_OMAPGETKEYS: case CEPH_OSD_OP_OMAPGETVALS: case CEPH_OSD_OP_OMAPGETHEADER: case CEPH_OSD_OP_OMAPGETVALSBYKEYS: - case CEPH_OSD_OP_MODE_RD: case CEPH_OSD_OP_OMAPSETVALS: case CEPH_OSD_OP_OMAPSETHEADER: case CEPH_OSD_OP_OMAPCLEAR: @@ -326,13 +280,77 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, case CEPH_OSD_OP_RDUNLOCK: case CEPH_OSD_OP_UPLOCK: case CEPH_OSD_OP_DNLOCK: + case CEPH_OSD_OP_CALL: case CEPH_OSD_OP_PGLS: case CEPH_OSD_OP_PGLS_FILTER: + return true; + default: + return false; + } +} + +static u64 osd_req_encode_op(struct ceph_osd_request *req, + struct ceph_osd_op *dst, + struct ceph_osd_req_op *src) +{ + u64 out_data_len = 0; + struct ceph_pagelist *pagelist; + 
+ if (WARN_ON(!osd_req_opcode_valid(src->op))) { + pr_err("unrecognized osd opcode %d\n", src->op); + + return 0; + } + + switch (src->op) { + case CEPH_OSD_OP_STAT: + break; + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + if (src->op == CEPH_OSD_OP_WRITE) + out_data_len = src->extent.length; + dst->extent.offset = cpu_to_le64(src->extent.offset); + dst->extent.length = cpu_to_le64(src->extent.length); + dst->extent.truncate_size = + cpu_to_le64(src->extent.truncate_size); + dst->extent.truncate_seq = + cpu_to_le32(src->extent.truncate_seq); + break; + case CEPH_OSD_OP_CALL: + pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); + BUG_ON(!pagelist); + ceph_pagelist_init(pagelist); + + dst->cls.class_len = src->cls.class_len; + dst->cls.method_len = src->cls.method_len; + dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); + ceph_pagelist_append(pagelist, src->cls.class_name, + src->cls.class_len); + ceph_pagelist_append(pagelist, src->cls.method_name, + src->cls.method_len); + ceph_pagelist_append(pagelist, src->cls.indata, + src->cls.indata_len); + + req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGELIST; + req->r_data_out.pagelist = pagelist; + out_data_len = pagelist->length; + break; + case CEPH_OSD_OP_STARTSYNC: + break; + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + dst->watch.cookie = cpu_to_le64(src->watch.cookie); + dst->watch.ver = cpu_to_le64(src->watch.ver); + dst->watch.flag = src->watch.flag; + break; + default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); WARN_ON(1); - break; + + return 0; } + dst->op = cpu_to_le16(src->op); dst->payload_len = cpu_to_le32(src->payload_len); return out_data_len; -- cgit v1.2.3 From 33803f3300265661b5c5d20a9811c6a2a157d545 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 13 Mar 2013 20:50:00 -0500 Subject: libceph: define source request op functions The rbd code has a function that allocates and populates a ceph_osd_req_op structure (the in-core version of an osd request operation). When reviewed, Josh suggested two things: that the big varargs function might be better split into type-specific functions; and that this functionality really belongs in the osd client rather than rbd. This patch implements both of Josh's suggestions. It breaks up the rbd function into separate functions and defines them in the osd client module as exported interfaces. Unlike the rbd version, however, the functions don't allocate an osd_req_op structure; they are provided the address of one and that is initialized instead. The rbd function has been eliminated and calls to it have been replaced by calls to the new routines. The rbd code now uses a stack (struct) variable to hold the op rather than allocating and freeing it each time. For now only the capabilities used by rbd are implemented. Implementing all the other osd op types, and making the rest of the code use them will be done separately, in the next few patches. Note that only the extent, cls, and watch portions of the ceph_osd_req_op structure are currently used. Delete the others (xattr, pgls, and snap) from its definition so nobody thinks they're actually implemented or needed. We can add them back again later if needed, when we know they've been tested.
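A representative caller after conversion, taken from the rbd changes in this patch:

    struct ceph_osd_req_op op;

    osd_req_op_extent_init(&op, CEPH_OSD_OP_READ, offset, length, 0, 0);
    obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
                                              obj_request, &op);

The op lives on the caller's stack only for the duration of the call; rbd_osd_req_create() copies its contents, so nothing needs to be allocated or freed.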
This (and a few follow-on patches) resolves: http://tracker.ceph.com/issues/3861 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 117 ++++++---------------------------------- include/linux/ceph/osd_client.h | 26 ++++----- net/ceph/osd_client.c | 84 +++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 116 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6ed508bd363a..f04d45b6b563 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1134,76 +1134,6 @@ static bool obj_request_type_valid(enum obj_request_type type) } } -static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) -{ - struct ceph_osd_req_op *op; - va_list args; - size_t size; - - op = kzalloc(sizeof (*op), GFP_NOIO); - if (!op) - return NULL; - op->op = opcode; - va_start(args, opcode); - switch (opcode) { - case CEPH_OSD_OP_READ: - case CEPH_OSD_OP_WRITE: - /* rbd_osd_req_op_create(READ, offset, length) */ - /* rbd_osd_req_op_create(WRITE, offset, length) */ - op->extent.offset = va_arg(args, u64); - op->extent.length = va_arg(args, u64); - if (opcode == CEPH_OSD_OP_WRITE) - op->payload_len = op->extent.length; - break; - case CEPH_OSD_OP_STAT: - break; - case CEPH_OSD_OP_CALL: - /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */ - op->cls.class_name = va_arg(args, char *); - size = strlen(op->cls.class_name); - rbd_assert(size <= (size_t) U8_MAX); - op->cls.class_len = size; - op->payload_len = size; - - op->cls.method_name = va_arg(args, char *); - size = strlen(op->cls.method_name); - rbd_assert(size <= (size_t) U8_MAX); - op->cls.method_len = size; - op->payload_len += size; - - op->cls.argc = 0; - op->cls.indata = va_arg(args, void *); - size = va_arg(args, size_t); - rbd_assert(size <= (size_t) U32_MAX); - op->cls.indata_len = (u32) size; - op->payload_len += size; - break; - case CEPH_OSD_OP_NOTIFY_ACK: - case CEPH_OSD_OP_WATCH: - /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */ - /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */ - op->watch.cookie = va_arg(args, u64); - op->watch.ver = va_arg(args, u64); - op->watch.ver = cpu_to_le64(op->watch.ver); - if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int)) - op->watch.flag = (u8) 1; - break; - default: - rbd_warn(NULL, "unsupported opcode %hu\n", opcode); - kfree(op); - op = NULL; - break; - } - va_end(args); - - return op; -} - -static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) -{ - kfree(op); -} - static int rbd_obj_request_submit(struct ceph_osd_client *osdc, struct rbd_obj_request *obj_request) { @@ -1628,7 +1558,7 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, while (resid) { const char *object_name; unsigned int clone_size; - struct ceph_osd_req_op *op; + struct ceph_osd_req_op op; u64 offset; u64 length; @@ -1657,13 +1587,10 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, * request. Note that the contents of the op are * copied by rbd_osd_req_create(). 
*/ - op = rbd_osd_req_op_create(opcode, offset, length); - if (!op) - goto out_partial; + osd_req_op_extent_init(&op, opcode, offset, length, 0, 0); obj_request->osd_req = rbd_osd_req_create(rbd_dev, img_request->write_request, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request, &op); if (!obj_request->osd_req) goto out_partial; /* status and version are initially zero-filled */ @@ -1766,7 +1693,7 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 ver, u64 notify_id) { struct rbd_obj_request *obj_request; - struct ceph_osd_req_op *op; + struct ceph_osd_req_op op; struct ceph_osd_client *osdc; int ret; @@ -1776,12 +1703,9 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, return -ENOMEM; ret = -ENOMEM; - op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); - if (!op) - goto out; + osd_req_op_watch_init(&op, CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver, 0); obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request, &op); if (!obj_request->osd_req) goto out; @@ -1823,7 +1747,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; - struct ceph_osd_req_op *op; + struct ceph_osd_req_op op; int ret; rbd_assert(start ^ !!rbd_dev->watch_event); @@ -1843,14 +1767,11 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) if (!obj_request) goto out_cancel; - op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, + osd_req_op_watch_init(&op, CEPH_OSD_OP_WATCH, rbd_dev->watch_event->cookie, rbd_dev->header.obj_version, start); - if (!op) - goto out_cancel; obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request, &op); if (!obj_request->osd_req) goto out_cancel; @@ -1912,7 +1833,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, { struct rbd_obj_request *obj_request; struct ceph_osd_client *osdc; - struct ceph_osd_req_op *op; + struct ceph_osd_req_op op; struct page **pages; u32 page_count; int ret; @@ -1939,13 +1860,10 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, - method_name, outbound, outbound_size); - if (!op) - goto out; + osd_req_op_cls_init(&op, CEPH_OSD_OP_CALL, class_name, method_name, + outbound, outbound_size); obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request, &op); if (!obj_request->osd_req) goto out; @@ -2125,7 +2043,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, char *buf, u64 *version) { - struct ceph_osd_req_op *op; + struct ceph_osd_req_op op; struct rbd_obj_request *obj_request; struct ceph_osd_client *osdc; struct page **pages = NULL; @@ -2147,12 +2065,9 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length); - if (!op) - goto out; + osd_req_op_extent_init(&op, CEPH_OSD_OP_READ, offset, length, 0, 0); obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request, &op); if (!obj_request->osd_req) goto out; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 1dab291b2dc6..5fd2cbfcfd91 100644 --- 
a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -201,14 +201,6 @@ struct ceph_osd_req_op { u64 truncate_size; u32 truncate_seq; } extent; - struct { - const char *name; - const void *val; - u32 name_len; - u32 value_len; - __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ - __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ - } xattr; struct { const char *class_name; const char *method_name; @@ -218,13 +210,6 @@ struct ceph_osd_req_op { __u8 method_len; __u8 argc; } cls; - struct { - u64 cookie; - u64 count; - } pgls; - struct { - u64 snapid; - } snap; struct { u64 cookie; u64 ver; @@ -244,6 +229,17 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +extern void osd_req_op_init(struct ceph_osd_req_op *op, u16 opcode); +extern void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, + u64 offset, u64 length, + u64 truncate_size, u32 truncate_seq); +extern void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, + const char *class, const char *method, + const void *request_data, + size_t request_data_size); +extern void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, + u64 cookie, u64 version, int flag); + extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, unsigned int num_op, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 4e5c0438ea35..02ed72820479 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -289,6 +289,90 @@ static bool osd_req_opcode_valid(u16 opcode) } } +/* + * This is an osd op init function for opcodes that have no data or + * other information associated with them. It also serves as a + * common init routine for all the other init functions, below. 
+ */ +void osd_req_op_init(struct ceph_osd_req_op *op, u16 opcode) +{ + BUG_ON(!osd_req_opcode_valid(opcode)); + + memset(op, 0, sizeof (*op)); + + op->op = opcode; +} + +void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, + u64 offset, u64 length, + u64 truncate_size, u32 truncate_seq) +{ + size_t payload_len = 0; + + BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); + + osd_req_op_init(op, opcode); + + op->extent.offset = offset; + op->extent.length = length; + op->extent.truncate_size = truncate_size; + op->extent.truncate_seq = truncate_seq; + if (opcode == CEPH_OSD_OP_WRITE) + payload_len += length; + + op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_extent_init); + +void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, + const char *class, const char *method, + const void *request_data, size_t request_data_size) +{ + size_t payload_len = 0; + size_t size; + + BUG_ON(opcode != CEPH_OSD_OP_CALL); + + osd_req_op_init(op, opcode); + + op->cls.class_name = class; + size = strlen(class); + BUG_ON(size > (size_t) U8_MAX); + op->cls.class_len = size; + payload_len += size; + + op->cls.method_name = method; + size = strlen(method); + BUG_ON(size > (size_t) U8_MAX); + op->cls.method_len = size; + payload_len += size; + + op->cls.indata = request_data; + BUG_ON(request_data_size > (size_t) U32_MAX); + op->cls.indata_len = (u32) request_data_size; + payload_len += request_data_size; + + op->cls.argc = 0; /* currently unused */ + + op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_cls_init); + +void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, + u64 cookie, u64 version, int flag) +{ + BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); + + osd_req_op_init(op, opcode); + + op->watch.cookie = cookie; + /* op->watch.ver = version; */ /* XXX 3847 */ + op->watch.ver = cpu_to_le64(version); + if (opcode == CEPH_OSD_OP_WATCH && flag) + op->watch.flag = (u8) 1; +} +EXPORT_SYMBOL(osd_req_op_watch_init); + static u64 osd_req_encode_op(struct ceph_osd_request *req, struct ceph_osd_op *dst, struct ceph_osd_req_op *src) -- cgit v1.2.3 From 75d1c941e57d4247de4c0ed4064a65cf1a4d3ed8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 13 Mar 2013 20:50:00 -0500 Subject: libceph: pass offset and length out of calc_layout() The purpose of calc_layout() is to determine, given a file offset and length and a layout describing the placement of file data across objects, where in "object space" that data resides. Specifically, it determines which object should hold the first part of the specified range of file data, and the offset and length of data within that object. The length will not exceed the bounds of the object, and the caller is informed of that maximum length. Add two parameters to calc_layout() to allow the object-relative offset and length to be passed back to the caller. This is the first step toward having ceph_osdc_new_request() build its osd op structure using osd_req_op_extent_init(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 02ed72820479..f782aca54daa 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -64,32 +64,31 @@ static int op_has_extent(int op) * fill osd op in request message.
*/ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, - struct ceph_osd_req_op *op, u64 *bno) + struct ceph_osd_req_op *op, u64 *objnum, + u64 *objoff, u64 *objlen) { u64 orig_len = *plen; - u64 objoff = 0; - u64 objlen = 0; int r; /* object extent? */ - r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, - &objoff, &objlen); + r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum, + objoff, objlen); if (r < 0) return r; - if (objlen < orig_len) { - *plen = objlen; + if (*objlen < orig_len) { + *plen = *objlen; dout(" skipping last %llu, final file extent %llu~%llu\n", orig_len - *plen, off, *plen); } if (op_has_extent(op->op)) { u32 osize = le32_to_cpu(layout->fl_object_size); - op->extent.offset = objoff; - op->extent.length = objlen; - if (op->extent.truncate_size <= off - objoff) { + op->extent.offset = *objoff; + op->extent.length = *objlen; + if (op->extent.truncate_size <= off - *objoff) { op->extent.truncate_size = 0; } else { - op->extent.truncate_size -= off - objoff; + op->extent.truncate_size -= off - *objoff; if (op->extent.truncate_size > osize) op->extent.truncate_size = osize; } @@ -97,7 +96,7 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, if (op->op == CEPH_OSD_OP_WRITE) op->payload_len = *plen; - dout("calc_layout bno=%llx %llu~%llu\n", *bno, objoff, objlen); + dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen); return 0; } @@ -572,7 +571,9 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_osd_req_op ops[2]; struct ceph_osd_request *req; unsigned int num_op = 1; - u64 bno = 0; + u64 objnum = 0; + u64 objoff = 0; + u64 objlen = 0; int r; memset(&ops, 0, sizeof ops); @@ -593,14 +594,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_flags = flags; /* calculate max write size */ - r = calc_layout(layout, off, plen, ops, &bno); + r = calc_layout(layout, off, plen, ops, &objnum, &objoff, &objlen); if (r < 0) { ceph_osdc_put_request(req); return ERR_PTR(r); } req->r_file_layout = *layout; /* keep a copy */ - snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); + snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", + vino.ino, objnum); req->r_oid_len = strlen(req->r_oid); ceph_osdc_build_request(req, off, num_op, ops, -- cgit v1.2.3 From a19dadfba91c73a12a666e6fdb9e242f325df825 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 13 Mar 2013 20:50:01 -0500 Subject: libceph: don't update op in calc_layout() In ceph_osdc_new_request(), an array of osd operations is built up and filled in partially within that function and partially in the called function calc_layout(). Move the latter part back out to ceph_osdc_new_request() so it's all done in one place. This makes it unnecessary to pass the op pointer to calc_layout(), so get rid of that parameter. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f782aca54daa..0eb417b44195 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -64,8 +64,7 @@ static int op_has_extent(int op) * fill osd op in request message.
*/ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, - struct ceph_osd_req_op *op, u64 *objnum, - u64 *objoff, u64 *objlen) + u64 *objnum, u64 *objoff, u64 *objlen) { u64 orig_len = *plen; int r; @@ -81,21 +80,6 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, orig_len - *plen, off, *plen); } - if (op_has_extent(op->op)) { - u32 osize = le32_to_cpu(layout->fl_object_size); - op->extent.offset = *objoff; - op->extent.length = *objlen; - if (op->extent.truncate_size <= off - *objoff) { - op->extent.truncate_size = 0; - } else { - op->extent.truncate_size -= off - *objoff; - if (op->extent.truncate_size > osize) - op->extent.truncate_size = osize; - } - } - if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = *plen; - dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen); return 0; @@ -594,11 +578,27 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_flags = flags; /* calculate max write size */ - r = calc_layout(layout, off, plen, ops, &objnum, &objoff, &objlen); + r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); if (r < 0) { ceph_osdc_put_request(req); return ERR_PTR(r); } + + if (op_has_extent(ops[0].op)) { + u32 osize = le32_to_cpu(layout->fl_object_size); + ops[0].extent.offset = objoff; + ops[0].extent.length = objlen; + if (ops[0].extent.truncate_size <= off - objoff) { + ops[0].extent.truncate_size = 0; + } else { + ops[0].extent.truncate_size -= off - objoff; + if (ops[0].extent.truncate_size > osize) + ops[0].extent.truncate_size = osize; + } + } + if (ops[0].op == CEPH_OSD_OP_WRITE) + ops[0].payload_len = *plen; + req->r_file_layout = *layout; /* keep a copy */ snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", -- cgit v1.2.3 From d18d1e2807f38a94839be1f83682e17011f53322 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 13 Mar 2013 20:50:01 -0500 Subject: libceph: clean up ceph_osdc_new_request() All callers of ceph_osdc_new_request() pass either CEPH_OSD_OP_READ or CEPH_OSD_OP_WRITE as the opcode value. The function assumes it by filling in the extent fields in the ops array it builds. So just assert that is the case, and don't bother calling op_has_extent() before filling in the first osd operation in the array. Define some local variables to gather the information to fill into the first op, and then fill in the op array all in one place. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 50 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0eb417b44195..7136060a0501 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -32,12 +32,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, static void __send_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); -static int op_has_extent(int op) -{ - return (op == CEPH_OSD_OP_READ || - op == CEPH_OSD_OP_WRITE); -} - /* * Implement client access to distributed object storage cluster. * @@ -554,22 +548,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, { struct ceph_osd_req_op ops[2]; struct ceph_osd_request *req; - unsigned int num_op = 1; + unsigned int num_op = do_sync ?
2 : 1; u64 objnum = 0; u64 objoff = 0; u64 objlen = 0; + u32 object_size; + u64 object_base; int r; - memset(&ops, 0, sizeof ops); - - ops[0].op = opcode; - ops[0].extent.truncate_seq = truncate_seq; - ops[0].extent.truncate_size = truncate_size; - - if (do_sync) { - ops[1].op = CEPH_OSD_OP_STARTSYNC; - num_op++; - } + BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool, GFP_NOFS); @@ -584,21 +571,28 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, return ERR_PTR(r); } - if (op_has_extent(ops[0].op)) { - u32 osize = le32_to_cpu(layout->fl_object_size); - ops[0].extent.offset = objoff; - ops[0].extent.length = objlen; - if (ops[0].extent.truncate_size <= off - objoff) { - ops[0].extent.truncate_size = 0; - } else { - ops[0].extent.truncate_size -= off - objoff; - if (ops[0].extent.truncate_size > osize) - ops[0].extent.truncate_size = osize; - } + object_size = le32_to_cpu(layout->fl_object_size); + object_base = off - objoff; + if (truncate_size <= object_base) { + truncate_size = 0; + } else { + truncate_size -= object_base; + if (truncate_size > object_size) + truncate_size = object_size; } + + memset(&ops, 0, sizeof ops); + ops[0].op = opcode; + ops[0].extent.offset = objoff; + ops[0].extent.length = objlen; + ops[0].extent.truncate_size = truncate_size; + ops[0].extent.truncate_seq = truncate_seq; if (ops[0].op == CEPH_OSD_OP_WRITE) ops[0].payload_len = *plen; + if (do_sync) + ops[1].op = CEPH_OSD_OP_STARTSYNC; + req->r_file_layout = *layout; /* keep a copy */ snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", -- cgit v1.2.3 From b0270324c5a9a5157f565c2de34fb1071cfdce7c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 13 Mar 2013 20:50:01 -0500 Subject: libceph: use osd_req_op_extent_init() Use osd_req_op_extent_init() in ceph_osdc_new_request() to initialize the one or two ops built in that function. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7136060a0501..55f7c9a57a43 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -581,17 +581,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size = object_size; } - memset(&ops, 0, sizeof ops); - ops[0].op = opcode; - ops[0].extent.offset = objoff; - ops[0].extent.length = objlen; - ops[0].extent.truncate_size = truncate_size; - ops[0].extent.truncate_seq = truncate_seq; - if (ops[0].op == CEPH_OSD_OP_WRITE) - ops[0].payload_len = *plen; - + osd_req_op_extent_init(&ops[0], opcode, objoff, objlen, + truncate_size, truncate_seq); if (do_sync) - ops[1].op = CEPH_OSD_OP_STARTSYNC; + osd_req_op_init(&ops[1], CEPH_OSD_OP_STARTSYNC); req->r_file_layout = *layout; /* keep a copy */ -- cgit v1.2.3 From 8058fd45039724695d5b67a574544452635d64a9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 1 Apr 2013 18:58:26 -0500 Subject: libceph: drop mutex on error in handle_reply() The osd client mutex is acquired just before getting a reference to a request in handle_reply(). However the error paths after that don't drop the mutex before returning as they should. Drop the mutex after dropping the request reference. Also add a bad_mutex label at that point and use it so the failed request lookup case can be handled with the rest. 
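The shape of this fix is the stacked-label unlock idiom; here is a minimal sketch, with a hypothetical error condition standing in for the real decode logic:

	mutex_lock(&osdc->request_mutex);
	req = __lookup_request(osdc, tid);
	if (req == NULL)
		goto bad_mutex;		/* lookup failed, but we must still unlock */
	ceph_osdc_get_request(req);

	if (decode_failed)		/* hypothetical error condition */
		goto bad_put;		/* drop the reference, then unlock */

	mutex_unlock(&osdc->request_mutex);
	/* ... normal completion ... */
	return;

bad_put:
	ceph_osdc_put_request(req);
bad_mutex:
	mutex_unlock(&osdc->request_mutex);

Stacking the labels in reverse order of acquisition lets each error path release exactly what it had taken.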
This resolves: http://tracker.ceph.com/issues/4615 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 55f7c9a57a43..69ef6539ca14 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1337,8 +1337,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, req = __lookup_request(osdc, tid); if (req == NULL) { dout("handle_reply tid %llu dne\n", tid); - mutex_unlock(&osdc->request_mutex); - return; + goto bad_mutex; } ceph_osdc_get_request(req); @@ -1437,6 +1436,8 @@ done: bad_put: ceph_osdc_put_request(req); +bad_mutex: + mutex_unlock(&osdc->request_mutex); bad: pr_err("corrupt osd_op_reply got %d %d\n", (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); -- cgit v1.2.3 From ef4859d6479d19bcc65c3156cf3b7dd747355c29 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 1 Apr 2013 18:58:26 -0500 Subject: libceph: define ceph_decode_pgid() only once There are two basically identical definitions of __decode_pgid() in libceph, one in "net/ceph/osdmap.c" and the other in "net/ceph/osd_client.c". Get rid of both, and instead define a single inline version in "include/linux/ceph/osdmap.h". Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osdmap.h | 24 ++++++++++++++++++++++++ net/ceph/osd_client.c | 22 +--------------------- net/ceph/osdmap.c | 22 ++-------------------- 3 files changed, 27 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 167daf60c4e8..d05cc4451af6 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -119,6 +120,29 @@ static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, return &map->osd_addr[osd]; } +static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) +{ + __u8 version; + + if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) { + pr_warning("incomplete pg encoding"); + + return -EINVAL; + } + version = ceph_decode_8(p); + if (version > 1) { + pr_warning("do not understand pg encoding %d > 1", + (int)version); + return -EINVAL; + } + + pgid->pool = ceph_decode_64(p); + pgid->seed = ceph_decode_32(p); + *p += 4; /* skip deprecated preferred value */ + + return 0; +} + extern struct ceph_osdmap *osdmap_decode(void **p, void *end); extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 69ef6539ca14..ca79cad50840 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1268,26 +1268,6 @@ static void complete_request(struct ceph_osd_request *req) complete_all(&req->r_safe_completion); /* fsync waiter */ } -static int __decode_pgid(void **p, void *end, struct ceph_pg *pgid) -{ - __u8 v; - - ceph_decode_need(p, end, 1 + 8 + 4 + 4, bad); - v = ceph_decode_8(p); - if (v > 1) { - pr_warning("do not understand pg encoding %d > 1", v); - return -EINVAL; - } - pgid->pool = ceph_decode_64(p); - pgid->seed = ceph_decode_32(p); - *p += 4; - return 0; - -bad: - pr_warning("incomplete pg encoding"); - return -EINVAL; -} - /* * handle osd op reply. either call the callback if it is specified, * or do the completion to wake up the waiting thread. 
@@ -1321,7 +1301,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, ceph_decode_need(&p, end, object_len, bad); p += object_len; - err = __decode_pgid(&p, end, &pg); + err = ceph_decode_pgid(&p, end, &pg); if (err) goto bad; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 09898711f2fd..603ddd92db19 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -654,24 +654,6 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) return 0; } -static int __decode_pgid(void **p, void *end, struct ceph_pg *pg) -{ - u8 v; - - ceph_decode_need(p, end, 1+8+4+4, bad); - v = ceph_decode_8(p); - if (v != 1) - goto bad; - pg->pool = ceph_decode_64(p); - pg->seed = ceph_decode_32(p); - *p += 4; /* skip preferred */ - return 0; - -bad: - dout("error decoding pgid\n"); - return -EINVAL; -} - /* * decode a full map. */ @@ -765,7 +747,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) struct ceph_pg pgid; struct ceph_pg_mapping *pg; - err = __decode_pgid(p, end, &pgid); + err = ceph_decode_pgid(p, end, &pgid); if (err) goto bad; ceph_decode_need(p, end, sizeof(u32), bad); @@ -983,7 +965,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_pg pgid; u32 pglen; - err = __decode_pgid(p, end, &pgid); + err = ceph_decode_pgid(p, end, &pgid); if (err) goto bad; ceph_decode_need(p, end, sizeof(u32), bad); -- cgit v1.2.3 From ace6d3a96f00c271b3f337adcde8e8cbe39c3820 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 1 Apr 2013 16:12:14 -0500 Subject: libceph: drop ceph_osd_request->r_con_filling_msg A field in an osd request keeps track of whether a connection is currently filling the request's reply message. This patch gets rid of that field. An osd request includes two messages--a request and a reply--and they're both associated with the connection that existed to its target osd at the time the request was created. An osd request can be dropped early, even when it's in flight. And at that time both messages are released. It's possible the reply message has been supplied to its connection to receive an incoming response message at the time the osd request gets dropped. So ceph_osdc_release_request() revokes that message from the connection before releasing it so things get cleaned up properly. Previously this may have caused a problem, because the connection that a message was associated with might have gone away before the revoke request. And to avoid any problems using that connection, the osd client held a reference to it when it supplied its response message. However since this commit: 38941f80 libceph: have messages point to their connection all messages hold a reference to the connection they are associated with whenever the connection is actively operating on the message (i.e. while the message is queued to send or sending, and when data is being received into it). And if a message has no connection associated with it, ceph_msg_revoke_incoming() won't do anything when asked to revoke it. As a result, there is no need to keep an additional reference to the connection associated with a message when we hand the message to the messenger as it calls our alloc_msg() method to receive something. If the connection *were* operating on it, it would have its own reference, and if not, there's no work to be done when we need to revoke it. So get rid of the osd request's r_con_filling_msg field.
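A sketch may make that argument concrete. The guard is what the description above implies; the body shown is illustrative, not the actual messenger code:

	void ceph_msg_revoke_incoming(struct ceph_msg *msg)
	{
		struct ceph_connection *con = msg->con;

		if (!con)
			return;	/* no connection: nothing is filling this message */

		/*
		 * con is actively operating on msg, so con already holds
		 * its own reference for the duration; the osd client does
		 * not need to pin the connection separately.
		 */
		/* ... revoke msg from con ... */
	}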
This resolves: http://tracker.ceph.com/issues/4647 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 2 -- net/ceph/osd_client.c | 29 +++++------------------------ 2 files changed, 5 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 5fd2cbfcfd91..3b5ba31c2cbd 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -89,8 +89,6 @@ struct ceph_osd_request { int r_pg_osds[CEPH_PG_MAX_SIZE]; int r_num_pg_osds; - struct ceph_connection *r_con_filling_msg; - struct ceph_msg *r_request, *r_reply; int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ca79cad50840..e0887923e5ab 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -91,15 +91,10 @@ void ceph_osdc_release_request(struct kref *kref) if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_con_filling_msg) { - dout("%s revoking msg %p from con %p\n", __func__, - req->r_reply, req->r_con_filling_msg); + if (req->r_reply) { ceph_msg_revoke_incoming(req->r_reply); - req->r_con_filling_msg->ops->put(req->r_con_filling_msg); - req->r_con_filling_msg = NULL; - } - if (req->r_reply) ceph_msg_put(req->r_reply); + } if (req->r_data_in.type == CEPH_OSD_DATA_TYPE_PAGES && req->r_data_in.own_pages) { @@ -1353,16 +1348,6 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, for (i = 0; i < numops; i++) req->r_reply_op_result[i] = ceph_decode_32(&p); - /* - * if this connection filled our message, drop our reference now, to - * avoid a (safe but slower) revoke later. - */ - if (req->r_con_filling_msg == con && req->r_reply == msg) { - dout(" dropping con_filling_msg ref %p\n", con); - req->r_con_filling_msg = NULL; - con->ops->put(con); - } - if (!req->r_got_reply) { unsigned int bytes; @@ -2199,13 +2184,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, goto out; } - if (req->r_con_filling_msg) { + if (req->r_reply->con) dout("%s revoking msg %p from old con %p\n", __func__, - req->r_reply, req->r_con_filling_msg); - ceph_msg_revoke_incoming(req->r_reply); - req->r_con_filling_msg->ops->put(req->r_con_filling_msg); - req->r_con_filling_msg = NULL; - } + req->r_reply, req->r_reply->con); + ceph_msg_revoke_incoming(req->r_reply); if (front > req->r_reply->front.iov_len) { pr_warning("get_reply front %d > preallocated %d\n", @@ -2236,7 +2218,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, } } *skip = 0; - req->r_con_filling_msg = con->ops->get(con); dout("get_reply tid %lld %p\n", tid, m); out: -- cgit v1.2.3 From a19308048182d5f9e16b03b1d1c038d9346c7589 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:06 -0500 Subject: libceph: record message data length Keep track of the length of the data portion for a message in a separate field in the ceph_msg structure. This information has been maintained in wire byte order in the message header, but that's going to change soon. 
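In sketch form, with only the relevant fields shown (the full structure is in the diff below):

	struct ceph_msg {
		struct ceph_msg_header hdr;	/* hdr.data_len is __le32, wire order */
		/* ... */
		size_t data_length;		/* same quantity, host byte order */
	};

Setting data_length once, at the point where data is attached to the message, lets the rest of the messenger test and log the length without repeated le32_to_cpu() conversions.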
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 4 +++- net/ceph/messenger.c | 10 +++++++++- net/ceph/osd_client.c | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 3181321bed6d..b832c0ce899a 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -139,6 +139,7 @@ struct ceph_msg { struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; + size_t data_length; struct ceph_msg_data *data; /* data payload */ struct ceph_connection *con; @@ -270,7 +271,8 @@ extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment); extern void ceph_msg_data_set_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist); -extern void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio); +extern void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, + size_t length); extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index ee160864e8ea..fa9b4d0243a0 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2981,6 +2981,7 @@ void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, BUG_ON(!pages); BUG_ON(!length); + BUG_ON(msg->data_length); BUG_ON(msg->data != NULL); data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); @@ -2990,6 +2991,7 @@ void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, data->alignment = alignment & ~PAGE_MASK; msg->data = data; + msg->data_length = length; } EXPORT_SYMBOL(ceph_msg_data_set_pages); @@ -3000,6 +3002,7 @@ void ceph_msg_data_set_pagelist(struct ceph_msg *msg, BUG_ON(!pagelist); BUG_ON(!pagelist->length); + BUG_ON(msg->data_length); BUG_ON(msg->data != NULL); data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); @@ -3007,14 +3010,17 @@ void ceph_msg_data_set_pagelist(struct ceph_msg *msg, data->pagelist = pagelist; msg->data = data; + msg->data_length = pagelist->length; } EXPORT_SYMBOL(ceph_msg_data_set_pagelist); -void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio) +void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, + size_t length) { struct ceph_msg_data *data; BUG_ON(!bio); + BUG_ON(msg->data_length); BUG_ON(msg->data != NULL); data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); @@ -3022,6 +3028,7 @@ void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio) data->bio = bio; msg->data = data; + msg->data_length = length; } EXPORT_SYMBOL(ceph_msg_data_set_bio); @@ -3200,6 +3207,7 @@ void ceph_msg_last_put(struct kref *kref) } ceph_msg_data_destroy(m->data); m->data = NULL; + m->data_length = 0; if (m->pool) ceph_msgpool_put(m->pool, m); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index e0887923e5ab..0b4951e27532 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1848,7 +1848,7 @@ static void ceph_osdc_msg_data_set(struct ceph_msg *msg, ceph_msg_data_set_pagelist(msg, osd_data->pagelist); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { - ceph_msg_data_set_bio(msg, osd_data->bio); + ceph_msg_data_set_bio(msg, osd_data->bio, osd_data->bio_length); #endif } else { BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); -- cgit v1.2.3 From acead002b200569273bed331c93c4a91d25e10b8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:05 -0500 Subject: libceph: don't build 
request in ceph_osdc_new_request() This patch moves the call to ceph_osdc_build_request() out of ceph_osdc_new_request() and into its caller. This is in order to defer formatting osd operation information into the request message until just before the request is started. The only unusual (ab)user of ceph_osdc_build_request() is ceph_writepages_start(), where the final length of the write request may change (downward) based on the current inode size or the oldest snapshot context with dirty data for the inode. The remaining callers don't change anything in the request after it has been built. This means the ops array is now supplied by the caller. It also means there is no need to pass the mtime to ceph_osdc_new_request() (it gets provided to ceph_osdc_build_request()). And rather than passing a do_sync flag, have the number of ops in the ops array supplied imply adding a second STARTSYNC operation after the READ or WRITE requested. This and some of the patches that follow are related to having the messenger (only) be responsible for filling the content of the message header, as described here: http://tracker.ceph.com/issues/4589 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- fs/ceph/addr.c | 36 +++++++++++++++++++++++------------- fs/ceph/file.c | 20 +++++++++++++------- include/linux/ceph/osd_client.h | 12 ++++++------ net/ceph/osd_client.c | 40 +++++++++++++++++++++------------------- 4 files changed, 63 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ae438d02a422..681463d5459b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -284,7 +284,9 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) &ceph_inode_to_client(inode)->client->osdc; struct ceph_inode_info *ci = ceph_inode(inode); struct page *page = list_entry(page_list->prev, struct page, lru); + struct ceph_vino vino; struct ceph_osd_request *req; + struct ceph_osd_req_op op; u64 off; u64 len; int i; @@ -308,16 +310,17 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) len = nr_pages << PAGE_CACHE_SHIFT; dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, off, len); - - req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), - off, &len, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, 0, + vino = ceph_vino(inode); + req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, + 1, &op, CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, - NULL, false); + false); if (IS_ERR(req)) return PTR_ERR(req); + ceph_osdc_build_request(req, off, 1, &op, NULL, vino.snap, NULL); + /* build page vector */ nr_pages = calc_pages_for(0, len); pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); @@ -736,6 +739,7 @@ retry: last_snapc = snapc; while (!done && index <= end) { + struct ceph_osd_req_op ops[2]; unsigned i; int first; pgoff_t next; int pvec_pages, locked_pages; struct page *page; int want; u64 offset, len; @@ -825,20 +829,22 @@ get_more_pages: /* ok */ if (locked_pages == 0) { + struct ceph_vino vino; + int num_ops = do_sync ?
2 : 1; + /* prepare async write request */ offset = (u64) page_offset(page); len = wsize; + vino = ceph_vino(inode); + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, - ceph_vino(inode), - offset, &len, + &ci->i_layout, vino, offset, &len, + num_ops, ops, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - snapc, do_sync, - ci->i_truncate_seq, - ci->i_truncate_size, - &inode->i_mtime, true); + snapc, ci->i_truncate_seq, + ci->i_truncate_size, true); if (IS_ERR(req)) { rc = PTR_ERR(req); @@ -846,6 +852,10 @@ get_more_pages: break; } + ceph_osdc_build_request(req, offset, + num_ops, ops, snapc, vino.snap, + &inode->i_mtime); + req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data_out.length = len; req->r_data_out.alignment = 0; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index aeafa67bfe99..3d6dcf23b4ad 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -475,14 +475,17 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; struct ceph_osd_request *req; + struct ceph_osd_req_op ops[2]; + int num_ops = 1; struct page **pages; int num_pages; long long unsigned pos; u64 len; int written = 0; int flags; - int do_sync = 0; int check_caps = 0; int page_align, io_align; unsigned long buf_align; @@ -516,7 +519,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) flags |= CEPH_OSD_FLAG_ACK; else - do_sync = 1; + num_ops++; /* Also include a 'startsync' command. */ /* * we may need to do multiple writes here if we span an object @@ -527,16 +530,19 @@ more: buf_align = (unsigned long)data & ~PAGE_MASK; len = left; + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - ceph_vino(inode), pos, &len, - CEPH_OSD_OP_WRITE, flags, - ci->i_snap_realm->cached_context, - do_sync, + vino, pos, &len, num_ops, ops, + CEPH_OSD_OP_WRITE, flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, - &mtime, false); + false); if (IS_ERR(req)) return PTR_ERR(req); + ceph_osdc_build_request(req, pos, num_ops, ops, + snapc, vino.snap, &mtime); + /* write from beginning of first page, regardless of io alignment */ page_align = file->f_flags & O_DIRECT ? 
buf_align : io_align; num_pages = calc_pages_for(page_align, len); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index fdda93ebbb4c..ffaf9076fdc4 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -243,12 +243,12 @@ extern void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, - unsigned int num_op, + unsigned int num_ops, bool use_mempool, gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - unsigned int num_op, + unsigned int num_ops, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, @@ -257,11 +257,11 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, - u64 offset, u64 *len, int op, int flags, + u64 offset, u64 *len, + int num_ops, struct ceph_osd_req_op *ops, + int opcode, int flags, struct ceph_snap_context *snapc, - int do_sync, u32 truncate_seq, - u64 truncate_size, - struct timespec *mtime, + u32 truncate_seq, u64 truncate_size, bool use_mempool); extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0b4951e27532..115790aac30a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -512,9 +512,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, msg->front.iov_len = msg_size; msg->hdr.front_len = cpu_to_le32(msg_size); - dout("build_request msg_size was %d num_ops %d\n", (int)msg_size, - num_ops); - return; + dout("build_request msg_size was %d\n", (int)msg_size); } EXPORT_SYMBOL(ceph_osdc_build_request); @@ -532,18 +530,15 @@ EXPORT_SYMBOL(ceph_osdc_build_request); struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, struct ceph_vino vino, - u64 off, u64 *plen, + u64 off, u64 *plen, int num_ops, + struct ceph_osd_req_op *ops, int opcode, int flags, struct ceph_snap_context *snapc, - int do_sync, u32 truncate_seq, u64 truncate_size, - struct timespec *mtime, bool use_mempool) { - struct ceph_osd_req_op ops[2]; struct ceph_osd_request *req; - unsigned int num_op = do_sync ? 2 : 1; u64 objnum = 0; u64 objoff = 0; u64 objlen = 0; @@ -553,7 +548,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); - req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool, + req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); @@ -578,7 +573,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, osd_req_op_extent_init(&ops[0], opcode, objoff, objlen, truncate_size, truncate_seq); - if (do_sync) + /* + * A second op in the ops array means the caller wants to + * also include a 'startsync' command so that the + * osd will flush data quickly.
+ */ + if (num_ops > 1) osd_req_op_init(&ops[1], CEPH_OSD_OP_STARTSYNC); req->r_file_layout = *layout; /* keep a copy */ @@ -587,9 +587,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, vino.ino, objnum); req->r_oid_len = strlen(req->r_oid); - ceph_osdc_build_request(req, off, num_op, ops, - snapc, vino.snap, mtime); - return req; } EXPORT_SYMBOL(ceph_osdc_new_request); @@ -2047,17 +2044,20 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, { struct ceph_osd_request *req; struct ceph_osd_data *osd_data; + struct ceph_osd_req_op op; int rc = 0; dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, vino.snap, off, *plen); - req = ceph_osdc_new_request(osdc, layout, vino, off, plen, + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1, &op, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, 0, truncate_seq, truncate_size, NULL, + NULL, truncate_seq, truncate_size, false); if (IS_ERR(req)) return PTR_ERR(req); + ceph_osdc_build_request(req, off, 1, &op, NULL, vino.snap, NULL); + /* it may be a short read due to an object boundary */ osd_data = &req->r_data_in; @@ -2092,19 +2092,21 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, { struct ceph_osd_request *req; struct ceph_osd_data *osd_data; + struct ceph_osd_req_op op; int rc = 0; int page_align = off & ~PAGE_MASK; - BUG_ON(vino.snap != CEPH_NOSNAP); - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, + BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ + req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1, &op, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, - snapc, 0, - truncate_seq, truncate_size, mtime, + snapc, truncate_seq, truncate_size, true); if (IS_ERR(req)) return PTR_ERR(req); + ceph_osdc_build_request(req, off, 1, &op, snapc, CEPH_NOSNAP, mtime); + /* it may be a short write due to an object boundary */ osd_data = &req->r_data_out; osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; -- cgit v1.2.3 From 02ee07d3002e6c0b0c4ea1982cd7e6aeca203ed6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:06 -0500 Subject: libceph: hold off building osd request Defer building the osd request until just before submitting it in all callers except ceph_writepages_start(). (That caller will be handled in the next patch.)
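The caller pattern that results, assembled from the signatures in these diffs (error handling abbreviated):

	struct ceph_osd_req_op op;
	struct ceph_osd_request *req;

	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1, &op,
				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				    NULL, truncate_seq, truncate_size, false);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* ... attach data buffers, callbacks, r_inode, and so on ... */

	ceph_osdc_build_request(req, off, 1, &op, NULL, vino.snap, NULL);
	ret = ceph_osdc_start_request(osdc, req, false);

Because formatting now happens after the request is fully described, nothing written into the request message can go stale before submission.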
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- fs/ceph/addr.c | 4 ++-- fs/ceph/file.c | 7 ++++--- net/ceph/osd_client.c | 8 ++++---- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 7b6d9b22e254..0a3d2ce89660 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -319,8 +319,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) if (IS_ERR(req)) return PTR_ERR(req); - ceph_osdc_build_request(req, off, 1, &op, NULL, vino.snap, NULL); - /* build page vector */ nr_pages = calc_pages_for(0, len); pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); @@ -351,6 +349,8 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) req->r_callback = finish_read; req->r_inode = inode; + ceph_osdc_build_request(req, off, 1, &op, NULL, vino.snap, NULL); + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3d6dcf23b4ad..47826c2ef511 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -540,9 +540,6 @@ more: if (IS_ERR(req)) return PTR_ERR(req); - ceph_osdc_build_request(req, pos, num_ops, ops, - snapc, vino.snap, &mtime); - /* write from beginning of first page, regardless of io alignment */ page_align = file->f_flags & O_DIRECT ? buf_align : io_align; num_pages = calc_pages_for(page_align, len); @@ -583,6 +580,10 @@ more: req->r_data_out.alignment = page_align; req->r_inode = inode; + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, num_ops, ops, + snapc, vino.snap, &mtime); + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { if (req->r_safe_callback) { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 115790aac30a..9ca693d0df19 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2056,8 +2056,6 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, if (IS_ERR(req)) return PTR_ERR(req); - ceph_osdc_build_request(req, off, 1, &op, NULL, vino.snap, NULL); - /* it may be a short read due to an object boundary */ osd_data = &req->r_data_in; @@ -2069,6 +2067,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", off, *plen, osd_data->length, page_align); + ceph_osdc_build_request(req, off, 1, &op, NULL, vino.snap, NULL); + rc = ceph_osdc_start_request(osdc, req, false); if (!rc) rc = ceph_osdc_wait_request(osdc, req); @@ -2105,8 +2105,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, if (IS_ERR(req)) return PTR_ERR(req); - ceph_osdc_build_request(req, off, 1, &op, snapc, CEPH_NOSNAP, mtime); - /* it may be a short write due to an object boundary */ osd_data = &req->r_data_out; osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; @@ -2115,6 +2113,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, osd_data->alignment = page_align; dout("writepages %llu~%llu (%llu bytes)\n", off, len, osd_data->length); + ceph_osdc_build_request(req, off, 1, &op, snapc, CEPH_NOSNAP, mtime); + rc = ceph_osdc_start_request(osdc, req, true); if (!rc) rc = ceph_osdc_wait_request(osdc, req); -- cgit v1.2.3 From e5975c7c8eb6aeab8d2f76a98c368081082795e0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:05 -0500 Subject: ceph: build osd request message later for writepages Hold off building the osd request message in ceph_writepages_start() until just 
before it will be submitted to the osd client for execution. We'll still create the request and allocate the page pointer array after we learn we have at least one page to write. A local variable will be used to keep track of the allocated array of pages. Wait until just before submitting the request for assigning that page array pointer to the request message. Create and use a new function osd_req_op_extent_update() whose purpose is to serve this one spot where the length value supplied when an osd request's op was initially formatted might need to get changed (reduced, never increased) before submitting the request. Previously, ceph_writepages_start() assigned the message header's data length because of this update. That's no longer necessary, because ceph_osdc_build_request() will recalculate the right value to use based on the content of the ops in the request. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- fs/ceph/addr.c | 59 +++++++++++++++++++++++------------------ include/linux/ceph/osd_client.h | 1 + net/ceph/osd_client.c | 13 +++++++++ 3 files changed, 47 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 0a3d2ce89660..5d8ce79385ed 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -737,10 +737,14 @@ retry: while (!done && index <= end) { struct ceph_osd_req_op ops[2]; + int num_ops = do_sync ? 2 : 1; + struct ceph_vino vino; unsigned i; int first; pgoff_t next; int pvec_pages, locked_pages; + struct page **pages = NULL; + mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; int want; u64 offset, len; @@ -824,16 +828,19 @@ get_more_pages: break; } - /* ok */ + /* + * We have something to write. If this is + * the first locked page this time through, + * allocate an osd request and a page array + * that it will use. + */ if (locked_pages == 0) { - struct ceph_vino vino; - int num_ops = do_sync ? 2 : 1; size_t size; - struct page **pages; - mempool_t *pool = NULL; + + BUG_ON(pages); /* prepare async write request */ - offset = (u64) page_offset(page); + offset = (u64)page_offset(page); len = wsize; req = ceph_writepages_osd_request(inode, offset, &len, snapc, @@ -845,11 +852,6 @@ get_more_pages: break; } - req->r_callback = writepages_finish; req->r_inode = inode; @@ -858,16 +860,9 @@ get_more_pages: pages = kmalloc(size, GFP_NOFS); if (!pages) { pool = fsc->wb_pagevec_pool; - pages = mempool_alloc(pool, GFP_NOFS); - WARN_ON(!pages); + BUG_ON(!pages); } - - req->r_data_out.pages = pages; - req->r_data_out.pages_from_pool = !!pool; - req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data_out.length = len; - req->r_data_out.alignment = 0; } /* note position of first page in pvec */ @@ -885,7 +880,7 @@ get_more_pages: } set_page_writeback(page); - req->r_data_out.pages[locked_pages] = page; + pages[locked_pages] = page; locked_pages++; next = page->index + 1; } @@ -914,18 +909,30 @@ get_more_pages: pvec.nr -= i-first; } - /* submit the write */ - offset = page_offset(req->r_data_out.pages[0]); + /* Format the osd request message and submit the write */ + + offset = page_offset(pages[0]); len = min((snap_size ?
snap_size : i_size_read(inode)) - offset, (u64)locked_pages << PAGE_CACHE_SHIFT); dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - /* revise final length, page count */ + req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; + req->r_data_out.pages = pages; req->r_data_out.length = len; - req->r_request_ops[0].extent.length = cpu_to_le64(len); - req->r_request_ops[0].payload_len = cpu_to_le32(len); - req->r_request->hdr.data_len = cpu_to_le32(len); + req->r_data_out.alignment = 0; + req->r_data_out.pages_from_pool = !!pool; + + pages = NULL; /* request message now owns the pages array */ + pool = NULL; + + /* Update the write op length in case we changed it */ + + osd_req_op_extent_update(&ops[0], len); + + vino = ceph_vino(inode); + ceph_osdc_build_request(req, offset, num_ops, ops, + snapc, vino.snap, &inode->i_mtime); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ffaf9076fdc4..5ee1a3776b4b 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -234,6 +234,7 @@ extern void osd_req_op_init(struct ceph_osd_req_op *op, u16 opcode); extern void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq); +extern void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length); extern void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, const char *class, const char *method, const void *request_data, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9ca693d0df19..426ca1f2a721 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -296,6 +296,19 @@ void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, } EXPORT_SYMBOL(osd_req_op_extent_init); +void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length) +{ + u64 previous = op->extent.length; + + if (length == previous) + return; /* Nothing to do */ + BUG_ON(length > previous); + + op->extent.length = length; + op->payload_len -= previous - length; +} +EXPORT_SYMBOL(osd_req_op_extent_update); + void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size) -- cgit v1.2.3 From 98fa5dd883aadbb0020b68d0f9367ba152dfe511 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 2 Apr 2013 12:09:50 -0500 Subject: libceph: provide data length when preparing message In prepare_message_data(), the length used to initialize the cursor is taken from the header of the message provided. I'm working toward not using the header data length field to determine length in outbound messages, and this is a step in that direction. For inbound messages this will be set to be the actual number of bytes that are arriving (which may be less than the total size of the data buffer available). 
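Condensed into a sketch, the two call sites now supply the length from their own context (variable names here are approximate):

	/* outbound: the length tracked in the message itself */
	if (m->data_length)
		prepare_message_data(con->out_msg, m->data_length);

	/* inbound: the number of bytes actually arriving on the wire */
	data_len = le32_to_cpu(con->in_hdr.data_len);
	if (data_len)
		prepare_message_data(con->in_msg, data_len);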
This resolves: http://tracker.ceph.com/issues/4589 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index fa9b4d0243a0..a6fda9532102 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1076,18 +1076,14 @@ static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) return new_piece; } -static void prepare_message_data(struct ceph_msg *msg) +static void prepare_message_data(struct ceph_msg *msg, u32 data_len) { - size_t data_len; - BUG_ON(!msg); - - data_len = le32_to_cpu(msg->hdr.data_len); BUG_ON(!data_len); /* Initialize data cursor */ - ceph_msg_data_cursor_init(msg->data, data_len); + ceph_msg_data_cursor_init(msg->data, (size_t)data_len); } /* @@ -1150,11 +1146,12 @@ static void prepare_write_message(struct ceph_connection *con) m->hdr.seq = cpu_to_le64(++con->out_seq); m->needs_out_seq = false; } + WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); - dout("prepare_write_message %p seq %lld type %d len %d+%d+%d\n", + dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", m, con->out_seq, le16_to_cpu(m->hdr.type), le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - le32_to_cpu(m->hdr.data_len)); + m->data_length); BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); /* tag + hdr + front + middle */ @@ -1185,8 +1182,8 @@ static void prepare_write_message(struct ceph_connection *con) /* is there a data payload? */ con->out_msg->footer.data_crc = 0; - if (m->hdr.data_len) { - prepare_message_data(con->out_msg); + if (m->data_length) { + prepare_message_data(con->out_msg, m->data_length); con->out_more = 1; /* data + footer will follow */ } else { /* no, queue up footer too and be done */ @@ -2231,7 +2228,7 @@ static int read_partial_message(struct ceph_connection *con) /* prepare for data payload, if any */ if (data_len) - prepare_message_data(con->in_msg); + prepare_message_data(con->in_msg, data_len); } /* front */ -- cgit v1.2.3 From 9fc6e0647180f72392f03a29863b6602e22aa024 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 01:28:57 -0500 Subject: libceph: compute incoming bytes once This is a simple change, extracting the number of incoming data bytes just once in handle_reply(). 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 426ca1f2a721..1379b3313348 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1293,6 +1293,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, u64 reassert_version; u32 osdmap_epoch; int already_completed; + u32 bytes; int i; tid = le64_to_cpu(msg->hdr.tid); @@ -1347,9 +1348,10 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, payload_len += len; p += sizeof(*op); } - if (payload_len != le32_to_cpu(msg->hdr.data_len)) { + bytes = le32_to_cpu(msg->hdr.data_len); + if (payload_len != bytes) { pr_warning("sum of op payload lens %d != data_len %d", - payload_len, le32_to_cpu(msg->hdr.data_len)); + payload_len, bytes); goto bad_put; } @@ -1359,10 +1361,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, req->r_reply_op_result[i] = ceph_decode_32(&p); if (!req->r_got_reply) { - unsigned int bytes; req->r_result = result; - bytes = le32_to_cpu(msg->hdr.data_len); dout("handle_reply result %d bytes %d\n", req->r_result, bytes); if (req->r_result == 0) -- cgit v1.2.3 From 43bfe5de9fa78e07248b70992ce50321efec622c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 01:28:57 -0500 Subject: libceph: define osd data initialization helpers Define and use functions that encapsulate the initialization of a ceph_osd_data structure. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 14 ++++------- fs/ceph/addr.c | 13 +++------- fs/ceph/file.c | 10 +++----- include/linux/ceph/osd_client.h | 11 +++++++++ net/ceph/osd_client.c | 55 +++++++++++++++++++++++++++++------------ 5 files changed, 63 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index afbc9f6f8ff1..ab21b5218ae3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1350,17 +1350,13 @@ static struct ceph_osd_request *rbd_osd_req_create( break; /* Nothing to do */ case OBJ_REQUEST_BIO: rbd_assert(obj_request->bio_list != NULL); - osd_data->type = CEPH_OSD_DATA_TYPE_BIO; - osd_data->bio = obj_request->bio_list; - osd_data->bio_length = obj_request->length; + ceph_osd_data_bio_init(osd_data, obj_request->bio_list, + obj_request->length); break; case OBJ_REQUEST_PAGES: - osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; - osd_data->pages = obj_request->pages; - osd_data->length = obj_request->length; - osd_data->alignment = offset & ~PAGE_MASK; - osd_data->pages_from_pool = false; - osd_data->own_pages = false; + ceph_osd_data_pages_init(osd_data, obj_request->pages, + obj_request->length, offset & ~PAGE_MASK, + false, false); break; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5d8ce79385ed..cf9032abc8f5 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -342,10 +342,8 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - req->r_data_in.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data_in.pages = pages; - req->r_data_in.length = len; - req->r_data_in.alignment = 0; + ceph_osd_data_pages_init(&req->r_data_in, pages, len, 0, + false, false); req->r_callback = finish_read; req->r_inode = inode; @@ -917,11 +915,8 @@ get_more_pages: dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data_out.pages = pages;
- req->r_data_out.length = len; - req->r_data_out.alignment = 0; - req->r_data_out.pages_from_pool = !!pool; + ceph_osd_data_pages_init(&req->r_data_out, pages, len, 0, + !!pool, false); pages = NULL; /* request message now owns the pages array */ pool = NULL; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 47826c2ef511..da642af14a28 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -491,6 +491,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, unsigned long buf_align; int ret; struct timespec mtime = CURRENT_TIME; + bool own_pages = false; if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; @@ -571,14 +572,11 @@ more: if ((file->f_flags & O_SYNC) == 0) { /* get a second commit callback */ req->r_safe_callback = sync_write_commit; - req->r_data_out.own_pages = 1; + own_pages = true; } } - req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data_out.pages = pages; - req->r_data_out.length = len; - req->r_data_out.alignment = page_align; - req->r_inode = inode; + ceph_osd_data_pages_init(&req->r_data_out, pages, len, page_align, + false, own_pages); /* BUG_ON(vino.snap != CEPH_NOSNAP); */ ceph_osdc_build_request(req, pos, num_ops, ops, diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 5ee1a3776b4b..af60dac1f9c0 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -280,6 +280,17 @@ static inline void ceph_osdc_put_request(struct ceph_osd_request *req) kref_put(&req->r_kref, ceph_osdc_release_request); } +extern void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); +extern void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, + struct ceph_pagelist *pagelist); +#ifdef CONFIG_BLOCK +extern void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, + struct bio *bio, size_t bio_length); +#endif /* CONFIG_BLOCK */ + extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, bool nofail); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 1379b3313348..f8f8561b602e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -79,6 +79,38 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, return 0; } +void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, + struct page **pages, u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; + osd_data->pages = pages; + osd_data->length = length; + osd_data->alignment = alignment; + osd_data->pages_from_pool = pages_from_pool; + osd_data->own_pages = own_pages; +} +EXPORT_SYMBOL(ceph_osd_data_pages_init); + +void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, + struct ceph_pagelist *pagelist) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST; + osd_data->pagelist = pagelist; +} +EXPORT_SYMBOL(ceph_osd_data_pagelist_init); + +#ifdef CONFIG_BLOCK +void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, + struct bio *bio, size_t bio_length) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_BIO; + osd_data->bio = bio; + osd_data->bio_length = bio_length; +} +EXPORT_SYMBOL(ceph_osd_data_bio_init); +#endif /* CONFIG_BLOCK */ + /* * requests */ @@ -400,8 +432,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, ceph_pagelist_append(pagelist, src->cls.indata, src->cls.indata_len); - req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGELIST; - req->r_data_out.pagelist = pagelist; 
+ ceph_osd_data_pagelist_init(&req->r_data_out, pagelist); out_data_len = pagelist->length; break; case CEPH_OSD_OP_STARTSYNC: @@ -2056,7 +2087,6 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct page **pages, int num_pages, int page_align) { struct ceph_osd_request *req; - struct ceph_osd_data *osd_data; struct ceph_osd_req_op op; int rc = 0; @@ -2071,14 +2101,11 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, /* it may be a short read due to an object boundary */ - osd_data = &req->r_data_in; - osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; - osd_data->pages = pages; - osd_data->length = *plen; - osd_data->alignment = page_align; + ceph_osd_data_pages_init(&req->r_data_in, pages, *plen, page_align, + false, false); dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", - off, *plen, osd_data->length, page_align); + off, *plen, *plen, page_align); ceph_osdc_build_request(req, off, 1, &op, NULL, vino.snap, NULL); @@ -2104,7 +2131,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, struct page **pages, int num_pages) { struct ceph_osd_request *req; - struct ceph_osd_data *osd_data; struct ceph_osd_req_op op; int rc = 0; int page_align = off & ~PAGE_MASK; @@ -2119,12 +2145,9 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, return PTR_ERR(req); /* it may be a short write due to an object boundary */ - osd_data = &req->r_data_out; - osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; - osd_data->pages = pages; - osd_data->length = len; - osd_data->alignment = page_align; - dout("writepages %llu~%llu (%llu bytes)\n", off, len, osd_data->length); + ceph_osd_data_pages_init(&req->r_data_out, pages, len, page_align, + false, false); + dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); ceph_osdc_build_request(req, off, 1, &op, snapc, CEPH_NOSNAP, mtime); -- cgit v1.2.3 From c54d47bfadce7059af0774d80b2b3faaea4afd28 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 01:28:57 -0500 Subject: libceph: define a few more helpers Define ceph_osd_data_init() and ceph_osd_data_release() to clean up a little code. 
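For orientation, a minimal sketch of how the two new helpers are meant to bracket a request's data items; the wrapper function below is hypothetical, while the helpers are the static ones this patch adds to net/ceph/osd_client.c:

/* Hypothetical caller: both data items start out zeroed with type
 * CEPH_OSD_DATA_TYPE_NONE, and release drops any pages the request
 * ended up owning.
 */
static void example_osd_data_lifetime(struct ceph_osd_request *req)
{
	ceph_osd_data_init(&req->r_data_in);
	ceph_osd_data_init(&req->r_data_out);

	/* ... request is filled in, submitted, and completed ... */

	ceph_osd_data_release(&req->r_data_in);
	ceph_osd_data_release(&req->r_data_out);
}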
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f8f8561b602e..b399e8a18f2b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -79,6 +79,12 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, return 0; } +static void ceph_osd_data_init(struct ceph_osd_data *osd_data) +{ + memset(osd_data, 0, sizeof (*osd_data)); + osd_data->type = CEPH_OSD_DATA_TYPE_NONE; +} + void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) @@ -111,16 +117,28 @@ void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, EXPORT_SYMBOL(ceph_osd_data_bio_init); #endif /* CONFIG_BLOCK */ +static void ceph_osd_data_release(struct ceph_osd_data *osd_data) +{ + if (osd_data->type != CEPH_OSD_DATA_TYPE_PAGES) + return; + + if (osd_data->own_pages) { + int num_pages; + + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + ceph_release_page_vector(osd_data->pages, num_pages); + } +} + /* * requests */ void ceph_osdc_release_request(struct kref *kref) { - int num_pages; - struct ceph_osd_request *req = container_of(kref, - struct ceph_osd_request, - r_kref); + struct ceph_osd_request *req; + req = container_of(kref, struct ceph_osd_request, r_kref); if (req->r_request) ceph_msg_put(req->r_request); if (req->r_reply) { @@ -128,18 +146,8 @@ void ceph_osdc_release_request(struct kref *kref) ceph_msg_put(req->r_reply); } - if (req->r_data_in.type == CEPH_OSD_DATA_TYPE_PAGES && - req->r_data_in.own_pages) { - num_pages = calc_pages_for((u64)req->r_data_in.alignment, - (u64)req->r_data_in.length); - ceph_release_page_vector(req->r_data_in.pages, num_pages); - } - if (req->r_data_out.type == CEPH_OSD_DATA_TYPE_PAGES && - req->r_data_out.own_pages) { - num_pages = calc_pages_for((u64)req->r_data_out.alignment, - (u64)req->r_data_out.length); - ceph_release_page_vector(req->r_data_out.pages, num_pages); - } + ceph_osd_data_release(&req->r_data_in); + ceph_osd_data_release(&req->r_data_out); ceph_put_snap_context(req->r_snapc); if (req->r_mempool) @@ -203,8 +211,8 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } req->r_reply = msg; - req->r_data_in.type = CEPH_OSD_DATA_TYPE_NONE; - req->r_data_out.type = CEPH_OSD_DATA_TYPE_NONE; + ceph_osd_data_init(&req->r_data_in); + ceph_osd_data_init(&req->r_data_out); /* create request message; allow space for oid */ if (use_mempool) -- cgit v1.2.3 From 23c08a9cb2d832cd1d2b7ccdb54d0ab7b8518933 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 01:28:58 -0500 Subject: libceph: define ceph_osd_data_length() One more osd data helper, which returns the length of the data item, regardless of its type. 
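For illustration, a hypothetical caller can now query a data item's size without switching on the type itself (a sketch, not part of the patch):

/* Hypothetical: PAGES, PAGELIST, and BIO items all answer through
 * the same helper; a TYPE_NONE item reports zero bytes.
 */
static void example_report_length(struct ceph_osd_data *osd_data)
{
	u64 length = ceph_osd_data_length(osd_data);

	pr_info("osd data item carries %llu bytes\n", length);
}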
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b399e8a18f2b..e197c5c0b3a2 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -117,6 +117,25 @@ void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, EXPORT_SYMBOL(ceph_osd_data_bio_init); #endif /* CONFIG_BLOCK */ +static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) +{ + switch (osd_data->type) { + case CEPH_OSD_DATA_TYPE_NONE: + return 0; + case CEPH_OSD_DATA_TYPE_PAGES: + return osd_data->length; + case CEPH_OSD_DATA_TYPE_PAGELIST: + return (u64)osd_data->pagelist->length; +#ifdef CONFIG_BLOCK + case CEPH_OSD_DATA_TYPE_BIO: + return (u64)osd_data->bio_length; +#endif /* CONFIG_BLOCK */ + default: + WARN(true, "unrecognized data type %d\n", (int)osd_data->type); + return 0; + } +} + static void ceph_osd_data_release(struct ceph_osd_data *osd_data) { if (osd_data->type != CEPH_OSD_DATA_TYPE_PAGES) @@ -1887,17 +1906,19 @@ bad: static void ceph_osdc_msg_data_set(struct ceph_msg *msg, struct ceph_osd_data *osd_data) { + u64 length = ceph_osd_data_length(osd_data); + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { - BUG_ON(osd_data->length > (u64) SIZE_MAX); - if (osd_data->length) + BUG_ON(length > (u64) SIZE_MAX); + if (length) ceph_msg_data_set_pages(msg, osd_data->pages, - osd_data->length, osd_data->alignment); + length, osd_data->alignment); } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { - BUG_ON(!osd_data->pagelist->length); + BUG_ON(!length); ceph_msg_data_set_pagelist(msg, osd_data->pagelist); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { - ceph_msg_data_set_bio(msg, osd_data->bio, osd_data->bio_length); + ceph_msg_data_set_bio(msg, osd_data->bio, length); #endif } else { BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); -- cgit v1.2.3 From 79528734f3ae4699a2886f62f55e18fb34fb3651 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 21:32:51 -0500 Subject: libceph: keep source rather than message osd op array An osd request keeps a pointer to the osd operations (ops) array that it builds in its request message. In order to allow each op in the array to have its own distinct data, we will need to keep track of each op's data, and that information does not go over the wire. As long as we're tracking the data, we might as well just track the entire (source) op definition for each of the ops. And if we're doing that, we'll have no more need to keep a pointer to the wire-encoded version. This patch keeps the array of source ops in the osd request structure, and uses that instead of the version encoded in the message in places where that was previously used. The array will be embedded in the request structure, and the maximum number of ops we ever actually use is currently 2. So reduce CEPH_OSD_MAX_OP to 2 to reduce the size of the structure. This change ripples back up the call chain, and as a result various function parameters and local variables become unnecessary. Make r_num_ops be unsigned, and move the definition of struct ceph_osd_req_op earlier to ensure it's defined where needed. It does not yet add per-op data; that's coming soon.
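To make the new shape concrete, here is a hedged sketch of how a caller now reaches the ops; the wrapper function is hypothetical, while the r_ops array and the init helpers are as defined at this point in the series:

/* Hypothetical caller: ops live in the request itself after this
 * patch, so they are initialized in place rather than in a separate
 * caller-owned array.
 */
static void example_fill_ops(struct ceph_osd_request *req,
			     u64 objoff, u64 objlen)
{
	osd_req_op_extent_init(&req->r_ops[0], CEPH_OSD_OP_WRITE,
			       objoff, objlen, 0, 0);
	if (req->r_num_ops > 1)
		osd_req_op_init(&req->r_ops[1], CEPH_OSD_OP_STARTSYNC);
}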
This resolves: http://tracker.ceph.com/issues/4656 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 42 ++++++++++++++----------- fs/ceph/addr.c | 21 ++++++------- fs/ceph/file.c | 6 ++-- include/linux/ceph/osd_client.h | 70 ++++++++++++++++++++--------------------- net/ceph/debugfs.c | 4 +-- net/ceph/osd_client.c | 53 ++++++++++++++++--------------- 6 files changed, 97 insertions(+), 99 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4a4be14a9189..c12b55559f16 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1285,7 +1285,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, */ obj_request->xferred = osd_req->r_reply_op_len[0]; rbd_assert(obj_request->xferred < (u64) UINT_MAX); - opcode = osd_req->r_request_ops[0].op; + opcode = osd_req->r_ops[0].op; switch (opcode) { case CEPH_OSD_OP_READ: rbd_osd_read_callback(obj_request); @@ -1312,8 +1312,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, } static void rbd_osd_req_format_op(struct rbd_obj_request *obj_request, - bool write_request, - struct ceph_osd_req_op *op) + bool write_request) { struct rbd_img_request *img_request = obj_request->img_request; struct ceph_snap_context *snapc = NULL; @@ -1333,7 +1332,7 @@ static void rbd_osd_req_format_op(struct rbd_obj_request *obj_request, } ceph_osdc_build_request(obj_request->osd_req, obj_request->offset, - 1, op, snapc, snap_id, mtime); + snapc, snap_id, mtime); } static struct ceph_osd_request *rbd_osd_req_create( @@ -1562,7 +1561,7 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, while (resid) { const char *object_name; unsigned int clone_size; - struct ceph_osd_req_op op; + struct ceph_osd_req_op *op; u64 offset; u64 length; @@ -1591,8 +1590,9 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, if (!obj_request->osd_req) goto out_partial; - osd_req_op_extent_init(&op, opcode, offset, length, 0, 0); - rbd_osd_req_format_op(obj_request, write_request, &op); + op = &obj_request->osd_req->r_ops[0]; + osd_req_op_extent_init(op, opcode, offset, length, 0, 0); + rbd_osd_req_format_op(obj_request, write_request); /* status and version are initially zero-filled */ @@ -1694,7 +1694,7 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 ver, u64 notify_id) { struct rbd_obj_request *obj_request; - struct ceph_osd_req_op op; + struct ceph_osd_req_op *op; struct ceph_osd_client *osdc; int ret; @@ -1708,8 +1708,9 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - osd_req_op_watch_init(&op, CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver, 0); - rbd_osd_req_format_op(obj_request, false, &op); + op = &obj_request->osd_req->r_ops[0]; + osd_req_op_watch_init(op, CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver, 0); + rbd_osd_req_format_op(obj_request, false); osdc = &rbd_dev->rbd_client->client->osdc; obj_request->callback = rbd_obj_request_put; @@ -1749,7 +1750,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; - struct ceph_osd_req_op op; + struct ceph_osd_req_op *op; int ret; rbd_assert(start ^ !!rbd_dev->watch_event); @@ -1773,10 +1774,11 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) if (!obj_request->osd_req) goto out_cancel; - osd_req_op_watch_init(&op, CEPH_OSD_OP_WATCH, + op = &obj_request->osd_req->r_ops[0]; + 
osd_req_op_watch_init(op, CEPH_OSD_OP_WATCH, rbd_dev->watch_event->cookie, rbd_dev->header.obj_version, start); - rbd_osd_req_format_op(obj_request, true, &op); + rbd_osd_req_format_op(obj_request, true); if (start) ceph_osdc_set_request_linger(osdc, obj_request->osd_req); @@ -1836,7 +1838,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, { struct rbd_obj_request *obj_request; struct ceph_osd_client *osdc; - struct ceph_osd_req_op op; + struct ceph_osd_req_op *op; struct page **pages; u32 page_count; int ret; @@ -1866,9 +1868,10 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - osd_req_op_cls_init(&op, CEPH_OSD_OP_CALL, class_name, method_name, + op = &obj_request->osd_req->r_ops[0]; + osd_req_op_cls_init(op, CEPH_OSD_OP_CALL, class_name, method_name, outbound, outbound_size); - rbd_osd_req_format_op(obj_request, false, &op); + rbd_osd_req_format_op(obj_request, false); osdc = &rbd_dev->rbd_client->client->osdc; ret = rbd_obj_request_submit(osdc, obj_request); @@ -2046,8 +2049,8 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, char *buf, u64 *version) { - struct ceph_osd_req_op op; struct rbd_obj_request *obj_request; + struct ceph_osd_req_op *op; struct ceph_osd_client *osdc; struct page **pages = NULL; u32 page_count; @@ -2072,8 +2075,9 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - osd_req_op_extent_init(&op, CEPH_OSD_OP_READ, offset, length, 0, 0); - rbd_osd_req_format_op(obj_request, false, &op); + op = &obj_request->osd_req->r_ops[0]; + osd_req_op_extent_init(op, CEPH_OSD_OP_READ, offset, length, 0, 0); + rbd_osd_req_format_op(obj_request, false); osdc = &rbd_dev->rbd_client->client->osdc; ret = rbd_obj_request_submit(osdc, obj_request); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 127be29a6c22..c9da074f0fe6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -288,7 +288,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) struct page *page = list_entry(page_list->prev, struct page, lru); struct ceph_vino vino; struct ceph_osd_request *req; - struct ceph_osd_req_op op; u64 off; u64 len; int i; @@ -314,7 +313,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) off, len); vino = ceph_vino(inode); req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, - 1, &op, CEPH_OSD_OP_READ, + 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); @@ -349,7 +348,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) req->r_callback = finish_read; req->r_inode = inode; - ceph_osdc_build_request(req, off, 1, &op, NULL, vino.snap, NULL); + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); @@ -567,7 +566,7 @@ static void writepages_finish(struct ceph_osd_request *req, struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; int rc = req->r_result; - u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length); + u64 bytes = req->r_ops[0].extent.length; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); @@ -635,8 +634,7 @@ static void writepages_finish(struct ceph_osd_request *req, static struct ceph_osd_request * ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len, - struct 
ceph_snap_context *snapc, - int num_ops, struct ceph_osd_req_op *ops) + struct ceph_snap_context *snapc, int num_ops) { struct ceph_fs_client *fsc; struct ceph_inode_info *ci; @@ -648,7 +646,7 @@ ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len, /* BUG_ON(vino.snap != CEPH_NOSNAP); */ return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, offset, len, num_ops, ops, CEPH_OSD_OP_WRITE, + vino, offset, len, num_ops, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK, snapc, ci->i_truncate_seq, ci->i_truncate_size, true); } @@ -738,7 +736,6 @@ retry: last_snapc = snapc; while (!done && index <= end) { - struct ceph_osd_req_op ops[2]; int num_ops = do_sync ? 2 : 1; struct ceph_vino vino; unsigned i; @@ -846,7 +843,7 @@ get_more_pages: len = wsize; req = ceph_writepages_osd_request(inode, offset, &len, snapc, - num_ops, ops); + num_ops); if (IS_ERR(req)) { rc = PTR_ERR(req); @@ -927,11 +924,11 @@ get_more_pages: /* Update the write op length in case we changed it */ - osd_req_op_extent_update(&ops[0], len); + osd_req_op_extent_update(&req->r_ops[0], len); vino = ceph_vino(inode); - ceph_osdc_build_request(req, offset, num_ops, ops, - snapc, vino.snap, &inode->i_mtime); + ceph_osdc_build_request(req, offset, snapc, vino.snap, + &inode->i_mtime); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index da642af14a28..a12f47642c40 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -478,7 +478,6 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; - struct ceph_osd_req_op ops[2]; int num_ops = 1; struct page **pages; int num_pages; @@ -534,7 +533,7 @@ more: snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, num_ops, ops, + vino, pos, &len, num_ops, CEPH_OSD_OP_WRITE, flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, false); @@ -579,8 +578,7 @@ more: false, own_pages); /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, num_ops, ops, - snapc, vino.snap, &mtime); + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index af60dac1f9c0..f4c1a2a22a14 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -48,7 +48,7 @@ struct ceph_osd { }; -#define CEPH_OSD_MAX_OP 10 +#define CEPH_OSD_MAX_OP 2 enum ceph_osd_data_type { CEPH_OSD_DATA_TYPE_NONE, @@ -79,6 +79,34 @@ struct ceph_osd_data { }; }; +struct ceph_osd_req_op { + u16 op; /* CEPH_OSD_OP_* */ + u32 payload_len; + union { + struct { + u64 offset, length; + u64 truncate_size; + u32 truncate_seq; + } extent; + struct { + const char *class_name; + const char *method_name; + const void *indata; + u32 indata_len; + __u8 class_len; + __u8 method_len; + __u8 argc; + } cls; + struct { + u64 cookie; + u64 ver; + u32 prot_ver; + u32 timeout; + __u8 flag; + } watch; + }; +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ @@ -95,10 +123,11 @@ struct ceph_osd_request { struct ceph_msg *r_request, *r_reply; int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ - int r_num_ops; - /* encoded message content */ - struct ceph_osd_op 
*r_request_ops; + /* request osd ops array */ + unsigned int r_num_ops; + struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP]; + /* these are updated on each send */ __le32 *r_request_osdmap_epoch; __le32 *r_request_flags; @@ -193,34 +222,6 @@ struct ceph_osd_client { struct workqueue_struct *notify_wq; }; -struct ceph_osd_req_op { - u16 op; /* CEPH_OSD_OP_* */ - u32 payload_len; - union { - struct { - u64 offset, length; - u64 truncate_size; - u32 truncate_seq; - } extent; - struct { - const char *class_name; - const char *method_name; - const void *indata; - u32 indata_len; - __u8 class_len; - __u8 method_len; - __u8 argc; - } cls; - struct { - u64 cookie; - u64 ver; - u32 prot_ver; - u32 timeout; - __u8 flag; - } watch; - }; -}; - extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); extern void ceph_osdc_stop(struct ceph_osd_client *osdc); @@ -249,8 +250,6 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - unsigned int num_ops, - struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, struct timespec *mtime); @@ -259,8 +258,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, u64 offset, u64 *len, - int num_ops, struct ceph_osd_req_op *ops, - int opcode, int flags, + int num_ops, int opcode, int flags, struct ceph_snap_context *snapc, u32 truncate_seq, u64 truncate_size, bool use_mempool); diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 00d051f4894e..83661cdc0766 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -123,8 +123,8 @@ static int osdc_show(struct seq_file *s, void *pp) mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { struct ceph_osd_request *req; + unsigned int i; int opcode; - int i; req = rb_entry(p, struct ceph_osd_request, r_node); @@ -142,7 +142,7 @@ static int osdc_show(struct seq_file *s, void *pp) seq_printf(s, "\t"); for (i = 0; i < req->r_num_ops; i++) { - opcode = le16_to_cpu(req->r_request_ops[i].op); + opcode = req->r_ops[i].op; seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index e197c5c0b3a2..a498d2de17a4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -186,6 +186,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_msg *msg; size_t msg_size; + BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); + BUG_ON(num_ops > CEPH_OSD_MAX_OP); + msg_size = 4 + 4 + 8 + 8 + 4+8; msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pg_t */ @@ -207,6 +210,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, req->r_osdc = osdc; req->r_mempool = use_mempool; + req->r_num_ops = num_ops; kref_init(&req->r_kref); init_completion(&req->r_completion); @@ -418,12 +422,14 @@ void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, EXPORT_SYMBOL(osd_req_op_watch_init); static u64 osd_req_encode_op(struct ceph_osd_request *req, - struct ceph_osd_op *dst, - struct ceph_osd_req_op *src) + struct ceph_osd_op *dst, unsigned int which) { + struct ceph_osd_req_op *src; u64 out_data_len = 0; struct ceph_pagelist *pagelist; + BUG_ON(which >= req->r_num_ops); + src = &req->r_ops[which]; if (WARN_ON(!osd_req_opcode_valid(src->op))) { pr_err("unrecognized osd opcode %d\n", src->op); @@ -487,21 +493,17 @@ 
static u64 osd_req_encode_op(struct ceph_osd_request *req, * build new request AND message * */ -void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, unsigned int num_ops, - struct ceph_osd_req_op *src_ops, - struct ceph_snap_context *snapc, u64 snap_id, - struct timespec *mtime) +void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, + struct ceph_snap_context *snapc, u64 snap_id, + struct timespec *mtime) { struct ceph_msg *msg = req->r_request; - struct ceph_osd_req_op *src_op; void *p; size_t msg_size; int flags = req->r_flags; u64 data_len; - int i; + unsigned int i; - req->r_num_ops = num_ops; req->r_snapid = snap_id; req->r_snapc = ceph_get_snap_context(snapc); @@ -541,12 +543,10 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, p += req->r_oid_len; /* ops--can imply data */ - ceph_encode_16(&p, num_ops); - src_op = src_ops; - req->r_request_ops = p; + ceph_encode_16(&p, (u16)req->r_num_ops); data_len = 0; - for (i = 0; i < num_ops; i++, src_op++) { - data_len += osd_req_encode_op(req, p, src_op); + for (i = 0; i < req->r_num_ops; i++) { + data_len += osd_req_encode_op(req, p, i); p += sizeof(struct ceph_osd_op); } @@ -602,7 +602,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, struct ceph_vino vino, u64 off, u64 *plen, int num_ops, - struct ceph_osd_req_op *ops, int opcode, int flags, struct ceph_snap_context *snapc, u32 truncate_seq, @@ -610,6 +609,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, bool use_mempool) { struct ceph_osd_request *req; + struct ceph_osd_req_op *op; u64 objnum = 0; u64 objoff = 0; u64 objlen = 0; @@ -623,6 +623,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); + req->r_flags = flags; /* calculate max write size */ @@ -642,7 +643,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size = object_size; } - osd_req_op_extent_init(&ops[0], opcode, objoff, objlen, + op = &req->r_ops[0]; + osd_req_op_extent_init(op, opcode, objoff, objlen, truncate_size, truncate_seq); /* * A second op in the ops array means the caller wants to @@ -650,7 +652,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, * osd will flush data quickly. 
*/ if (num_ops > 1) - osd_req_op_init(&ops[1], CEPH_OSD_OP_STARTSYNC); + osd_req_op_init(++op, CEPH_OSD_OP_STARTSYNC); req->r_file_layout = *layout; /* keep a copy */ @@ -1342,7 +1344,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, struct ceph_osd_request *req; u64 tid; int object_len; - int numops, payload_len, flags; + unsigned int numops; + int payload_len, flags; s32 result; s32 retry_attempt; struct ceph_pg pg; @@ -1352,7 +1355,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, u32 osdmap_epoch; int already_completed; u32 bytes; - int i; + unsigned int i; tid = le64_to_cpu(msg->hdr.tid); dout("handle_reply %p tid %llu\n", msg, tid); @@ -2116,12 +2119,11 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct page **pages, int num_pages, int page_align) { struct ceph_osd_request *req; - struct ceph_osd_req_op op; int rc = 0; dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, vino.snap, off, *plen); - req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1, &op, + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, truncate_seq, truncate_size, false); @@ -2136,7 +2138,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", off, *plen, *plen, page_align); - ceph_osdc_build_request(req, off, 1, &op, NULL, vino.snap, NULL); + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); rc = ceph_osdc_start_request(osdc, req, false); if (!rc) @@ -2160,12 +2162,11 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, struct page **pages, int num_pages) { struct ceph_osd_request *req; - struct ceph_osd_req_op op; int rc = 0; int page_align = off & ~PAGE_MASK; BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1, &op, + req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, snapc, truncate_seq, truncate_size, @@ -2178,7 +2179,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, false, false); dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); - ceph_osdc_build_request(req, off, 1, &op, snapc, CEPH_NOSNAP, mtime); + ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); rc = ceph_osdc_start_request(osdc, req, true); if (!rc) -- cgit v1.2.3 From 54d5064912649e296552f298e6472ffd37cd8f90 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 01:28:58 -0500 Subject: libceph: rename data out field in osd request op There are fields "indata" and "indata_len" defined in the ceph osd request op structure. The "in" part is from the point of view of the osd server, but is a little confusing here on the client side. Change their names to use "request" instead of "in" to indicate that they define data provided with the request (as opposed to the data returned in the response). Rename the local variable in osd_req_encode_op() to match.
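As a quick orientation, a hedged sketch of the renamed fields in use; the accessor below is hypothetical, only the field names come from this patch:

/* Hypothetical: class-method input is now reached through the
 * "request"-named fields (formerly cls.indata / cls.indata_len).
 */
static u32 example_cls_request_len(struct ceph_osd_req_op *op)
{
	BUG_ON(op->op != CEPH_OSD_OP_CALL);
	return op->cls.request_data_len;
}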
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 4 ++-- net/ceph/osd_client.c | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index f4c1a2a22a14..a9c4089894c8 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -91,8 +91,8 @@ struct ceph_osd_req_op { struct { const char *class_name; const char *method_name; - const void *indata; - u32 indata_len; + const void *request_data; + u32 request_data_len; __u8 class_len; __u8 method_len; __u8 argc; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a498d2de17a4..87fcf0b795c0 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -395,9 +395,9 @@ void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, op->cls.method_len = size; payload_len += size; - op->cls.indata = request_data; + op->cls.request_data = request_data; BUG_ON(request_data_size > (size_t) U32_MAX); - op->cls.indata_len = (u32) request_data_size; + op->cls.request_data_len = (u32) request_data_size; payload_len += request_data_size; op->cls.argc = 0; /* currently unused */ @@ -425,7 +425,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, struct ceph_osd_op *dst, unsigned int which) { struct ceph_osd_req_op *src; - u64 out_data_len = 0; + u64 request_data_len = 0; struct ceph_pagelist *pagelist; BUG_ON(which >= req->r_num_ops); @@ -442,7 +442,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: if (src->op == CEPH_OSD_OP_WRITE) - out_data_len = src->extent.length; + request_data_len = src->extent.length; dst->extent.offset = cpu_to_le64(src->extent.offset); dst->extent.length = cpu_to_le64(src->extent.length); dst->extent.truncate_size = @@ -457,16 +457,16 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; - dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); + dst->cls.indata_len = cpu_to_le32(src->cls.request_data_len); ceph_pagelist_append(pagelist, src->cls.class_name, src->cls.class_len); ceph_pagelist_append(pagelist, src->cls.method_name, src->cls.method_len); - ceph_pagelist_append(pagelist, src->cls.indata, - src->cls.indata_len); + ceph_pagelist_append(pagelist, src->cls.request_data, + src->cls.request_data_len); ceph_osd_data_pagelist_init(&req->r_data_out, pagelist); - out_data_len = pagelist->length; + request_data_len = pagelist->length; break; case CEPH_OSD_OP_STARTSYNC: break; @@ -486,7 +486,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->op = cpu_to_le16(src->op); dst->payload_len = cpu_to_le32(src->payload_len); - return out_data_len; + return request_data_len; } /* -- cgit v1.2.3 From 8c042b0df99cd06ef8473ef6e204b87b3dc80158 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 01:28:58 -0500 Subject: libceph: add data pointers in osd op structures An extent type osd operation currently implies that there will be corresponding data supplied in the data portion of the request (for write) or response (for read) message. Similarly, an osd class method operation implies a data item will be supplied to receive the response data from the operation. Add a ceph_osd_data pointer to each of those structures, and assign it to point to either the incoming or the outgoing data structure in the osd message.
The data is not always available when an op is initially set up, so add two new functions to allow setting them after the op has been initialized. Begin to make use of the data item pointer available in the osd operation rather than the request data in or out structure in places where it's convenient. Add some assertions to verify pointers are always set the way they're expected to be. This is a sort of stepping stone toward really moving the data into the osd request ops, to allow for some validation before making that jump. This is the first in a series of patches that resolve: http://tracker.ceph.com/issues/4657 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 24 ++++++++++++++++++++---- fs/ceph/addr.c | 8 +++++--- fs/ceph/file.c | 5 +++-- include/linux/ceph/osd_client.h | 6 ++++++ net/ceph/osd_client.c | 26 +++++++++++++++++++++++++- 5 files changed, 59 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c12b55559f16..eb64ed0f228f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1315,23 +1315,39 @@ static void rbd_osd_req_format_op(struct rbd_obj_request *obj_request, bool write_request) { struct rbd_img_request *img_request = obj_request->img_request; + struct ceph_osd_request *osd_req = obj_request->osd_req; + struct ceph_osd_data *osd_data = NULL; struct ceph_snap_context *snapc = NULL; u64 snap_id = CEPH_NOSNAP; struct timespec *mtime = NULL; struct timespec now; - rbd_assert(obj_request->osd_req != NULL); + rbd_assert(osd_req != NULL); if (write_request) { + osd_data = &osd_req->r_data_out; now = CURRENT_TIME; mtime = &now; if (img_request) snapc = img_request->snapc; - } else if (img_request) { - snap_id = img_request->snap_id; + } else { + osd_data = &osd_req->r_data_in; + if (img_request) + snap_id = img_request->snap_id; } + if (obj_request->type != OBJ_REQUEST_NODATA) { + struct ceph_osd_req_op *op = &obj_request->osd_req->r_ops[0]; - ceph_osdc_build_request(obj_request->osd_req, obj_request->offset, + /* + * If it has data, it's either a object class method + * call (cls) or it's an extent operation. 
+ */ + if (op->op == CEPH_OSD_OP_CALL) + osd_req_op_cls_response_data(op, osd_data); + else + osd_req_op_extent_osd_data(op, osd_data); + } + ceph_osdc_build_request(osd_req, obj_request->offset, snapc, snap_id, mtime); } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c9da074f0fe6..0ac3a37753cb 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -343,7 +343,8 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - ceph_osd_data_pages_init(&req->r_data_in, pages, len, 0, + BUG_ON(req->r_ops[0].extent.osd_data != &req->r_data_in); + ceph_osd_data_pages_init(req->r_ops[0].extent.osd_data, pages, len, 0, false, false); req->r_callback = finish_read; req->r_inode = inode; @@ -916,8 +917,9 @@ get_more_pages: dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - ceph_osd_data_pages_init(&req->r_data_out, pages, len, 0, - !!pool, false); + BUG_ON(req->r_ops[0].extent.osd_data != &req->r_data_out); + ceph_osd_data_pages_init(req->r_ops[0].extent.osd_data, pages, + len, 0, !!pool, false); pages = NULL; /* request message now owns the pages array */ pool = NULL; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a12f47642c40..cddc10fd7cf9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -574,8 +574,9 @@ more: own_pages = true; } } - ceph_osd_data_pages_init(&req->r_data_out, pages, len, page_align, - false, own_pages); + BUG_ON(req->r_ops[0].extent.osd_data != &req->r_data_out); + ceph_osd_data_pages_init(req->r_ops[0].extent.osd_data, pages, len, + page_align, false, own_pages); /* BUG_ON(vino.snap != CEPH_NOSNAP); */ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index a9c4089894c8..ae5193550fbf 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -87,12 +87,14 @@ struct ceph_osd_req_op { u64 offset, length; u64 truncate_size; u32 truncate_seq; + struct ceph_osd_data *osd_data; } extent; struct { const char *class_name; const char *method_name; const void *request_data; u32 request_data_len; + struct ceph_osd_data *response_data; __u8 class_len; __u8 method_len; __u8 argc; @@ -236,10 +238,14 @@ extern void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq); extern void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length); +extern void osd_req_op_extent_osd_data(struct ceph_osd_req_op *op, + struct ceph_osd_data *osd_data); extern void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size); +extern void osd_req_op_cls_response_data(struct ceph_osd_req_op *op, + struct ceph_osd_data *response_data); extern void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, u64 cookie, u64 version, int flag); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 87fcf0b795c0..23491e92b229 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -372,6 +372,13 @@ void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length) } EXPORT_SYMBOL(osd_req_op_extent_update); +void osd_req_op_extent_osd_data(struct ceph_osd_req_op *op, + struct ceph_osd_data *osd_data) +{ + op->extent.osd_data = osd_data; +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data); + void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, const char *class, const char *method, const void *request_data, size_t 
request_data_size) @@ -406,6 +413,13 @@ void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, } EXPORT_SYMBOL(osd_req_op_cls_init); +void osd_req_op_cls_response_data(struct ceph_osd_req_op *op, + struct ceph_osd_data *response_data) +{ + op->cls.response_data = response_data; +} +EXPORT_SYMBOL(osd_req_op_cls_response_data); + void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, u64 cookie, u64 version, int flag) { @@ -449,6 +463,10 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); + if (src->op == CEPH_OSD_OP_WRITE) + WARN_ON(src->extent.osd_data != &req->r_data_out); + else + WARN_ON(src->extent.osd_data != &req->r_data_in); break; case CEPH_OSD_OP_CALL: pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); @@ -464,8 +482,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, src->cls.method_len); ceph_pagelist_append(pagelist, src->cls.request_data, src->cls.request_data_len); - ceph_osd_data_pagelist_init(&req->r_data_out, pagelist); + + WARN_ON(src->cls.response_data != &req->r_data_in); request_data_len = pagelist->length; break; case CEPH_OSD_OP_STARTSYNC: @@ -609,6 +628,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, bool use_mempool) { struct ceph_osd_request *req; + struct ceph_osd_data *osd_data; struct ceph_osd_req_op *op; u64 objnum = 0; u64 objoff = 0; @@ -623,6 +643,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); + osd_data = opcode == CEPH_OSD_OP_WRITE ? &req->r_data_out + : &req->r_data_in; req->r_flags = flags; @@ -646,6 +668,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, op = &req->r_ops[0]; osd_req_op_extent_init(op, opcode, objoff, objlen, truncate_size, truncate_seq); + osd_req_op_extent_osd_data(op, osd_data); + /* * A second op in the ops array means the caller wants to * also issue a include a 'startsync' command so that the -- cgit v1.2.3 From c99d2d4abb6c405ef52e9bc1da87b382b8f41739 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:11 -0500 Subject: libceph: specify osd op by index in request An osd request now holds all of its source op structures, and every place that initializes one of these is in fact initializing one of the entries in the osd request's array. So rather than supplying the address of the op to initialize, have the caller specify the osd request and an indication of which op it would like to initialize. This better hides the details of the op structure (and facilitates moving the data pointers they use). Since osd_req_op_init() is a common routine, and it's not used outside the osd client code, give it static scope. Also make it return the address of the specified op (so all the other init routines don't have to repeat that code).
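A hedged sketch of the resulting calling convention; the wrapper below is hypothetical, while the by-index initializers are the ones this patch introduces:

/* Hypothetical caller: ops are addressed by (request, index), so the
 * op structure itself can stay private to the osd client code.
 */
static void example_setup_read(struct ceph_osd_request *osd_req,
			       u64 offset, u64 length)
{
	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
			       offset, length, 0, 0);
	/* Shrinking the extent afterward also goes through the index. */
	osd_req_op_extent_update(osd_req, 0, length / 2);
}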
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 35 ++++++++++------------ fs/ceph/addr.c | 2 +- include/linux/ceph/osd_client.h | 19 +++++++----- net/ceph/osd_client.c | 64 +++++++++++++++++++++++++---------------- 4 files changed, 67 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index eb64ed0f228f..80ac772587c8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1336,16 +1336,17 @@ static void rbd_osd_req_format_op(struct rbd_obj_request *obj_request, snap_id = img_request->snap_id; } if (obj_request->type != OBJ_REQUEST_NODATA) { - struct ceph_osd_req_op *op = &obj_request->osd_req->r_ops[0]; - /* * If it has data, it's either a object class method * call (cls) or it's an extent operation. */ - if (op->op == CEPH_OSD_OP_CALL) - osd_req_op_cls_response_data(op, osd_data); + /* XXX This use of the ops array goes away in the next patch */ + if (obj_request->osd_req->r_ops[0].op == CEPH_OSD_OP_CALL) + osd_req_op_cls_response_data(obj_request->osd_req, 0, + osd_data); else - osd_req_op_extent_osd_data(op, osd_data); + osd_req_op_extent_osd_data(obj_request->osd_req, 0, + osd_data); } ceph_osdc_build_request(osd_req, obj_request->offset, snapc, snap_id, mtime); @@ -1577,7 +1578,6 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, while (resid) { const char *object_name; unsigned int clone_size; - struct ceph_osd_req_op *op; u64 offset; u64 length; @@ -1606,8 +1606,8 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, if (!obj_request->osd_req) goto out_partial; - op = &obj_request->osd_req->r_ops[0]; - osd_req_op_extent_init(op, opcode, offset, length, 0, 0); + osd_req_op_extent_init(obj_request->osd_req, 0, + opcode, offset, length, 0, 0); rbd_osd_req_format_op(obj_request, write_request); /* status and version are initially zero-filled */ @@ -1710,7 +1710,6 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 ver, u64 notify_id) { struct rbd_obj_request *obj_request; - struct ceph_osd_req_op *op; struct ceph_osd_client *osdc; int ret; @@ -1724,8 +1723,8 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - op = &obj_request->osd_req->r_ops[0]; - osd_req_op_watch_init(op, CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver, 0); + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, + notify_id, ver, 0); rbd_osd_req_format_op(obj_request, false); osdc = &rbd_dev->rbd_client->client->osdc; @@ -1766,7 +1765,6 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; - struct ceph_osd_req_op *op; int ret; rbd_assert(start ^ !!rbd_dev->watch_event); @@ -1790,8 +1788,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) if (!obj_request->osd_req) goto out_cancel; - op = &obj_request->osd_req->r_ops[0]; - osd_req_op_watch_init(op, CEPH_OSD_OP_WATCH, + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, rbd_dev->watch_event->cookie, rbd_dev->header.obj_version, start); rbd_osd_req_format_op(obj_request, true); @@ -1854,7 +1851,6 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, { struct rbd_obj_request *obj_request; struct ceph_osd_client *osdc; - struct ceph_osd_req_op *op; struct page **pages; u32 page_count; int ret; @@ -1884,8 +1880,8 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, if 
(!obj_request->osd_req) goto out; - op = &obj_request->osd_req->r_ops[0]; - osd_req_op_cls_init(op, CEPH_OSD_OP_CALL, class_name, method_name, + osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, + class_name, method_name, outbound, outbound_size); rbd_osd_req_format_op(obj_request, false); @@ -2066,7 +2062,6 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, { struct rbd_obj_request *obj_request; - struct ceph_osd_req_op *op; struct ceph_osd_client *osdc; struct page **pages = NULL; u32 page_count; @@ -2091,8 +2086,8 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - op = &obj_request->osd_req->r_ops[0]; - osd_req_op_extent_init(op, CEPH_OSD_OP_READ, offset, length, 0, 0); + osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, + offset, length, 0, 0); rbd_osd_req_format_op(obj_request, false); osdc = &rbd_dev->rbd_client->client->osdc; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 0ac3a37753cb..cc57104a7266 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -926,7 +926,7 @@ get_more_pages: /* Update the write op length in case we changed it */ - osd_req_op_extent_update(&req->r_ops[0], len); + osd_req_op_extent_update(req, 0, len); vino = ceph_vino(inode); ceph_osdc_build_request(req, offset, snapc, vino.snap, diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ae5193550fbf..144d57cbef9e 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -233,20 +233,25 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); -extern void osd_req_op_init(struct ceph_osd_req_op *op, u16 opcode); -extern void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, +extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq); -extern void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length); -extern void osd_req_op_extent_osd_data(struct ceph_osd_req_op *op, +extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, + unsigned int which, u64 length); +extern void osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_osd_data *osd_data); -extern void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, +extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size); -extern void osd_req_op_cls_response_data(struct ceph_osd_req_op *op, +extern void osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_osd_data *response_data); -extern void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, +extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, u64 cookie, u64 version, int flag); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 23491e92b229..ad24f210bf0c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -329,25 +329,32 @@ static bool osd_req_opcode_valid(u16 opcode) * other information associated with them. It also serves as a * common init routine for all the other init functions, below. 
*/ -void osd_req_op_init(struct ceph_osd_req_op *op, u16 opcode) +static struct ceph_osd_req_op * +osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode) { + struct ceph_osd_req_op *op; + + BUG_ON(which >= osd_req->r_num_ops); BUG_ON(!osd_req_opcode_valid(opcode)); + op = &osd_req->r_ops[which]; memset(op, 0, sizeof (*op)); - op->op = opcode; + + return op; } -void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, +void osd_req_op_extent_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq) { + struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode); size_t payload_len = 0; BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); - osd_req_op_init(op, opcode); - op->extent.offset = offset; op->extent.length = length; op->extent.truncate_size = truncate_size; @@ -359,9 +366,15 @@ void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, } EXPORT_SYMBOL(osd_req_op_extent_init); -void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length) +void osd_req_op_extent_update(struct ceph_osd_request *osd_req, + unsigned int which, u64 length) { - u64 previous = op->extent.length; + struct ceph_osd_req_op *op; + u64 previous; + + BUG_ON(which >= osd_req->r_num_ops); + op = &osd_req->r_ops[which]; + previous = op->extent.length; if (length == previous) return; /* Nothing to do */ @@ -372,24 +385,25 @@ void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length) } EXPORT_SYMBOL(osd_req_op_extent_update); -void osd_req_op_extent_osd_data(struct ceph_osd_req_op *op, +void osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_osd_data *osd_data) { - op->extent.osd_data = osd_data; + BUG_ON(which >= osd_req->r_num_ops); + osd_req->r_ops[which].extent.osd_data = osd_data; } EXPORT_SYMBOL(osd_req_op_extent_osd_data); -void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, - const char *class, const char *method, +void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size) { + struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode); size_t payload_len = 0; size_t size; BUG_ON(opcode != CEPH_OSD_OP_CALL); - osd_req_op_init(op, opcode); - op->cls.class_name = class; size = strlen(class); BUG_ON(size > (size_t) U8_MAX); @@ -412,26 +426,28 @@ void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, op->payload_len = payload_len; } EXPORT_SYMBOL(osd_req_op_cls_init); - -void osd_req_op_cls_response_data(struct ceph_osd_req_op *op, +void osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_osd_data *response_data) { - op->cls.response_data = response_data; + BUG_ON(which >= osd_req->r_num_ops); + osd_req->r_ops[which].cls.response_data = response_data; } EXPORT_SYMBOL(osd_req_op_cls_response_data); -void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, +void osd_req_op_watch_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, u64 cookie, u64 version, int flag) { - BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); + struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode); - osd_req_op_init(op, opcode); + BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); op->watch.cookie = cookie; /* op->watch.ver = version; */ /* XXX 
3847 */ op->watch.ver = cpu_to_le64(version); if (opcode == CEPH_OSD_OP_WATCH && flag) - op->watch.flag = (u8) 1; + op->watch.flag = (u8)1; } EXPORT_SYMBOL(osd_req_op_watch_init); @@ -629,7 +645,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, { struct ceph_osd_request *req; struct ceph_osd_data *osd_data; - struct ceph_osd_req_op *op; u64 objnum = 0; u64 objoff = 0; u64 objlen = 0; @@ -665,10 +680,9 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size = object_size; } - op = &req->r_ops[0]; - osd_req_op_extent_init(op, opcode, objoff, objlen, + osd_req_op_extent_init(req, 0, opcode, objoff, objlen, truncate_size, truncate_seq); - osd_req_op_extent_osd_data(op, osd_data); + osd_req_op_extent_osd_data(req, 0, osd_data); /* * A second op in the ops array means the caller wants to @@ -676,7 +690,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, * osd will flush data quickly. */ if (num_ops > 1) - osd_req_op_init(++op, CEPH_OSD_OP_STARTSYNC); + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); req->r_file_layout = *layout; /* keep a copy */ -- cgit v1.2.3 From 5f562df5f59340eae4272501b974903f48d2ad92 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: libceph: format class info at init time An object class method is formatted using a pagelist which contains the class name, the method name, and the data concatenated into an osd request's outbound data. Currently when a class op is initialized in osd_req_op_cls_init(), the lengths of and pointers to these three items are recorded. Later, when the op is getting formatted into the request message, a new pagelist is created and that is when these items get copied into the pagelist. This patch makes it so the pagelist to hold these items is created when the op is initialized instead. 
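For context, a hedged sketch of a class-method op set up under the new scheme; the caller and the "rbd"/"get_id" class and method names are hypothetical, the init routine is the one this patch changes:

/* Hypothetical caller: the class name, method name, and input bytes
 * are now copied into a pagelist here, at init time, instead of
 * later when the request message is formatted.
 */
static void example_call_op(struct ceph_osd_request *osd_req,
			    const void *inbound, size_t inbound_size)
{
	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL,
			    "rbd", "get_id", inbound, inbound_size);
}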
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 3 ++- net/ceph/osd_client.c | 29 +++++++++++++++-------------- 2 files changed, 17 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 144d57cbef9e..71c41575646d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -93,8 +93,9 @@ struct ceph_osd_req_op { const char *class_name; const char *method_name; const void *request_data; - u32 request_data_len; + struct ceph_osd_data *request_info; struct ceph_osd_data *response_data; + u32 request_data_len; __u8 class_len; __u8 method_len; __u8 argc; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ad24f210bf0c..db2624860384 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -399,28 +399,39 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, const void *request_data, size_t request_data_size) { struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode); + struct ceph_pagelist *pagelist; size_t payload_len = 0; size_t size; BUG_ON(opcode != CEPH_OSD_OP_CALL); + pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); + BUG_ON(!pagelist); + ceph_pagelist_init(pagelist); + op->cls.class_name = class; size = strlen(class); BUG_ON(size > (size_t) U8_MAX); op->cls.class_len = size; + ceph_pagelist_append(pagelist, class, size); payload_len += size; op->cls.method_name = method; size = strlen(method); BUG_ON(size > (size_t) U8_MAX); op->cls.method_len = size; + ceph_pagelist_append(pagelist, method, size); payload_len += size; op->cls.request_data = request_data; BUG_ON(request_data_size > (size_t) U32_MAX); op->cls.request_data_len = (u32) request_data_size; + ceph_pagelist_append(pagelist, request_data, request_data_size); payload_len += request_data_size; + op->cls.request_info = &osd_req->r_data_out; + ceph_osd_data_pagelist_init(op->cls.request_info, pagelist); + op->cls.argc = 0; /* currently unused */ op->payload_len = payload_len; @@ -456,7 +467,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, { struct ceph_osd_req_op *src; u64 request_data_len = 0; - struct ceph_pagelist *pagelist; BUG_ON(which >= req->r_num_ops); src = &req->r_ops[which]; @@ -485,23 +495,14 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, WARN_ON(src->extent.osd_data != &req->r_data_in); break; case CEPH_OSD_OP_CALL: - pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); - BUG_ON(!pagelist); - ceph_pagelist_init(pagelist); - dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; dst->cls.indata_len = cpu_to_le32(src->cls.request_data_len); - ceph_pagelist_append(pagelist, src->cls.class_name, - src->cls.class_len); - ceph_pagelist_append(pagelist, src->cls.method_name, - src->cls.method_len); - ceph_pagelist_append(pagelist, src->cls.request_data, - src->cls.request_data_len); - ceph_osd_data_pagelist_init(&req->r_data_out, pagelist); - WARN_ON(src->cls.response_data != &req->r_data_in); - request_data_len = pagelist->length; + WARN_ON(src->cls.request_info != &req->r_data_out); + BUG_ON(src->cls.request_info->type != + CEPH_OSD_DATA_TYPE_PAGELIST); + request_data_len = src->cls.request_info->pagelist->length; break; case CEPH_OSD_OP_STARTSYNC: break; -- cgit v1.2.3 From e65550fd94c5c01b438e24fbf4a29ba65709ec97 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: libceph: move ceph_osdc_build_request() This simply moves 
ceph_osdc_build_request() later in its source file without any change. Done as a separate patch to facilitate review of the change in the next patch. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 196 +++++++++++++++++++++++++------------------------- 1 file changed, 98 insertions(+), 98 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index db2624860384..3fe8a7909ed9 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -525,104 +525,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, return request_data_len; } -/* - * build new request AND message - * - */ -void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - struct ceph_snap_context *snapc, u64 snap_id, - struct timespec *mtime) -{ - struct ceph_msg *msg = req->r_request; - void *p; - size_t msg_size; - int flags = req->r_flags; - u64 data_len; - unsigned int i; - - req->r_snapid = snap_id; - req->r_snapc = ceph_get_snap_context(snapc); - - /* encode request */ - msg->hdr.version = cpu_to_le16(4); - - p = msg->front.iov_base; - ceph_encode_32(&p, 1); /* client_inc is always 1 */ - req->r_request_osdmap_epoch = p; - p += 4; - req->r_request_flags = p; - p += 4; - if (req->r_flags & CEPH_OSD_FLAG_WRITE) - ceph_encode_timespec(p, mtime); - p += sizeof(struct ceph_timespec); - req->r_request_reassert_version = p; - p += sizeof(struct ceph_eversion); /* will get filled in */ - - /* oloc */ - ceph_encode_8(&p, 4); - ceph_encode_8(&p, 4); - ceph_encode_32(&p, 8 + 4 + 4); - req->r_request_pool = p; - p += 8; - ceph_encode_32(&p, -1); /* preferred */ - ceph_encode_32(&p, 0); /* key len */ - - ceph_encode_8(&p, 1); - req->r_request_pgid = p; - p += 8 + 4; - ceph_encode_32(&p, -1); /* preferred */ - - /* oid */ - ceph_encode_32(&p, req->r_oid_len); - memcpy(p, req->r_oid, req->r_oid_len); - dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); - p += req->r_oid_len; - - /* ops--can imply data */ - ceph_encode_16(&p, (u16)req->r_num_ops); - data_len = 0; - for (i = 0; i < req->r_num_ops; i++) { - data_len += osd_req_encode_op(req, p, i); - p += sizeof(struct ceph_osd_op); - } - - /* snaps */ - ceph_encode_64(&p, req->r_snapid); - ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); - ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); - if (req->r_snapc) { - for (i = 0; i < snapc->num_snaps; i++) { - ceph_encode_64(&p, req->r_snapc->snaps[i]); - } - } - - req->r_request_attempts = p; - p += 4; - - /* data */ - if (flags & CEPH_OSD_FLAG_WRITE) { - u16 data_off; - - /* - * The header "data_off" is a hint to the receiver - * allowing it to align received data into its - * buffers such that there's no need to re-copy - * it before writing it to disk (direct I/O). - */ - data_off = (u16) (off & 0xffff); - req->r_request->hdr.data_off = cpu_to_le16(data_off); - } - req->r_request->hdr.data_len = cpu_to_le32(data_len); - - BUG_ON(p > msg->front.iov_base + msg->front.iov_len); - msg_size = p - msg->front.iov_base; - msg->front.iov_len = msg_size; - msg->hdr.front_len = cpu_to_le32(msg_size); - - dout("build_request msg_size was %d\n", (int)msg_size); -} -EXPORT_SYMBOL(ceph_osdc_build_request); - /* * build new request AND message, calculate layout, and adjust file * extent as needed. 
@@ -1967,6 +1869,104 @@ static void ceph_osdc_msg_data_set(struct ceph_msg *msg, } } +/* + * build new request AND message + * + */ +void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, + struct ceph_snap_context *snapc, u64 snap_id, + struct timespec *mtime) +{ + struct ceph_msg *msg = req->r_request; + void *p; + size_t msg_size; + int flags = req->r_flags; + u64 data_len; + unsigned int i; + + req->r_snapid = snap_id; + req->r_snapc = ceph_get_snap_context(snapc); + + /* encode request */ + msg->hdr.version = cpu_to_le16(4); + + p = msg->front.iov_base; + ceph_encode_32(&p, 1); /* client_inc is always 1 */ + req->r_request_osdmap_epoch = p; + p += 4; + req->r_request_flags = p; + p += 4; + if (req->r_flags & CEPH_OSD_FLAG_WRITE) + ceph_encode_timespec(p, mtime); + p += sizeof(struct ceph_timespec); + req->r_request_reassert_version = p; + p += sizeof(struct ceph_eversion); /* will get filled in */ + + /* oloc */ + ceph_encode_8(&p, 4); + ceph_encode_8(&p, 4); + ceph_encode_32(&p, 8 + 4 + 4); + req->r_request_pool = p; + p += 8; + ceph_encode_32(&p, -1); /* preferred */ + ceph_encode_32(&p, 0); /* key len */ + + ceph_encode_8(&p, 1); + req->r_request_pgid = p; + p += 8 + 4; + ceph_encode_32(&p, -1); /* preferred */ + + /* oid */ + ceph_encode_32(&p, req->r_oid_len); + memcpy(p, req->r_oid, req->r_oid_len); + dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); + p += req->r_oid_len; + + /* ops--can imply data */ + ceph_encode_16(&p, (u16)req->r_num_ops); + data_len = 0; + for (i = 0; i < req->r_num_ops; i++) { + data_len += osd_req_encode_op(req, p, i); + p += sizeof(struct ceph_osd_op); + } + + /* snaps */ + ceph_encode_64(&p, req->r_snapid); + ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); + ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); + if (req->r_snapc) { + for (i = 0; i < snapc->num_snaps; i++) { + ceph_encode_64(&p, req->r_snapc->snaps[i]); + } + } + + req->r_request_attempts = p; + p += 4; + + /* data */ + if (flags & CEPH_OSD_FLAG_WRITE) { + u16 data_off; + + /* + * The header "data_off" is a hint to the receiver + * allowing it to align received data into its + * buffers such that there's no need to re-copy + * it before writing it to disk (direct I/O). + */ + data_off = (u16) (off & 0xffff); + req->r_request->hdr.data_off = cpu_to_le16(data_off); + } + req->r_request->hdr.data_len = cpu_to_le32(data_len); + + BUG_ON(p > msg->front.iov_base + msg->front.iov_len); + msg_size = p - msg->front.iov_base; + msg->front.iov_len = msg_size; + msg->hdr.front_len = cpu_to_le32(msg_size); + + dout("build_request msg_size was %d\n", (int)msg_size); +} +EXPORT_SYMBOL(ceph_osdc_build_request); + /* * Register request, send initial attempt. */ -- cgit v1.2.3 From 39b44cbe86db42e70693787b2ede81c309925d0b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: libceph: set message data when building osd request All calls of ceph_osdc_start_request() are preceded immediately (or, in the case of rbd, almost immediately) by a call to ceph_osdc_build_request(). Move the message data setup calls at the top of ceph_osdc_start_request() out of there and into ceph_osdc_build_request(). Nothing prevents moving these calls to the top of ceph_osdc_build_request(), either (and we're going to want them there in the next patch), so put them at the top.
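The reordering is easy to see in miniature. Below is a minimal standalone C sketch of the idea (hypothetical stand-in types and names, not the kernel structures): the data fields are populated in the build step, which every caller already performs first, so the start step is left with nothing but registration and sending.

#include <stdbool.h>
#include <stdio.h>

struct msg { bool data_set; };
struct req { struct msg request; struct msg reply; bool built; };

static void msg_data_set(struct msg *m)
{
	m->data_set = true;		/* stands in for ceph_osdc_msg_data_set() */
}

/* After the patch: attaching data is part of building the request... */
static void build_request(struct req *r)
{
	msg_data_set(&r->reply);	/* response data is incoming */
	msg_data_set(&r->request);	/* request data is outgoing */
	r->built = true;
}

/* ...so starting a request no longer touches the data fields at all. */
static int start_request(struct req *r)
{
	return r->built ? 0 : -1;	/* register and send would go here */
}

int main(void)
{
	struct req r = { { false }, { false }, false };

	build_request(&r);
	printf("start_request() -> %d\n", start_request(&r));
	return 0;
}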
This and the next patch are related to: http://tracker.ceph.com/issues/4657 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3fe8a7909ed9..932b8af8b8ee 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1884,6 +1884,11 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 data_len; unsigned int i; + /* Set up response incoming data and request outgoing data fields */ + + ceph_osdc_msg_data_set(req->r_reply, &req->r_data_in); + ceph_osdc_msg_data_set(req->r_request, &req->r_data_out); + req->r_snapid = snap_id; req->r_snapc = ceph_get_snap_context(snapc); @@ -1976,11 +1981,6 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, { int rc = 0; - /* Set up response incoming data and request outgoing data fields */ - - ceph_osdc_msg_data_set(req->r_reply, &req->r_data_in); - ceph_osdc_msg_data_set(req->r_request, &req->r_data_out); - down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); __register_request(osdc, req); -- cgit v1.2.3 From a4ce40a9a7c1053ac2a41cf64255e44e356e5522 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: libceph: combine initializing and setting osd data This ends up being a rather large patch but what it's doing is somewhat straightforward. Basically, this is replacing two calls with one. The first of the two calls is initializing a struct ceph_osd_data with data (either a page array, a page list, or a bio list); the second is setting an osd request op so it associates that data with one of the op's parameters. In place of those two will be a single function that initializes the op directly. That means we sort of fan out a set of the needed functions: - extent ops with pages data - extent ops with pagelist data - extent ops with bio list data and - class ops with page data for receiving a response We also have to define another one, but it's only used internally: - class ops with pagelist data for request parameters Note that we *still* haven't gotten rid of the osd request's r_data_in and r_data_out fields. All the osd ops refer to them for their data. For now, these data fields are pointers assigned to the appropriate r_data_* field when these new functions are called. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 20 ++---- fs/ceph/addr.c | 12 ++-- fs/ceph/file.c | 3 +- include/linux/ceph/osd_client.h | 43 ++++++----- net/ceph/osd_client.c | 155 +++++++++++++++++++++++++++++++--------- 5 files changed, 161 insertions(+), 72 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index db29783436c8..6f7a52cf75c7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1592,7 +1592,6 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, rbd_assert(resid > 0); while (resid) { struct ceph_osd_request *osd_req; - struct ceph_osd_data *osd_data; const char *object_name; unsigned int clone_size; u64 offset; @@ -1625,13 +1624,10 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, obj_request->osd_req = osd_req; obj_request->callback = rbd_img_obj_callback; - osd_data = write_request ?
&osd_req->r_data_out - : &osd_req->r_data_in; osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 0, 0); - ceph_osd_data_bio_init(osd_data, obj_request->bio_list, - obj_request->length); - osd_req_op_extent_osd_data(osd_req, 0, osd_data); + osd_req_op_extent_osd_data_bio(osd_req, 0, write_request, + obj_request->bio_list, obj_request->length); rbd_osd_req_format(obj_request, write_request); rbd_img_obj_request_add(img_request, obj_request); @@ -1821,7 +1817,6 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; - struct ceph_osd_data *osd_data; struct page **pages; u32 page_count; int ret; @@ -1851,13 +1846,12 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - osd_data = &obj_request->osd_req->r_data_in; osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, class_name, method_name, outbound, outbound_size); - ceph_osd_data_pages_init(osd_data, obj_request->pages, inbound_size, + osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, + obj_request->pages, inbound_size, 0, false, false); - osd_req_op_cls_response_data(obj_request->osd_req, 0, osd_data); rbd_osd_req_format(obj_request, false); ret = rbd_obj_request_submit(osdc, obj_request); @@ -2037,7 +2031,6 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; - struct ceph_osd_data *osd_data; struct page **pages = NULL; u32 page_count; size_t size; @@ -2061,14 +2054,13 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - osd_data = &obj_request->osd_req->r_data_in; osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, offset, length, 0, 0); - ceph_osd_data_pages_init(osd_data, obj_request->pages, + osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, false, + obj_request->pages, obj_request->length, obj_request->offset & ~PAGE_MASK, false, false); - osd_req_op_extent_osd_data(obj_request->osd_req, 0, osd_data); rbd_osd_req_format(obj_request, false); ret = rbd_obj_request_submit(osdc, obj_request); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index cc57104a7266..27d62070a8e9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -245,7 +245,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ - osd_data = &req->r_data_in; + osd_data = osd_req_op_extent_osd_data(req, 0, false); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); @@ -343,8 +343,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - BUG_ON(req->r_ops[0].extent.osd_data != &req->r_data_in); - ceph_osd_data_pages_init(req->r_ops[0].extent.osd_data, pages, len, 0, + osd_req_op_extent_osd_data_pages(req, 0, false, pages, len, 0, false, false); req->r_callback = finish_read; req->r_inode = inode; @@ -572,7 +571,7 @@ static void writepages_finish(struct ceph_osd_request *req, long writeback_stat; unsigned issued = ceph_caps_issued(ci); - osd_data = &req->r_data_out; + osd_data = osd_req_op_extent_osd_data(req, 0, true); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); @@ 
-917,9 +916,8 @@ get_more_pages: dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - BUG_ON(req->r_ops[0].extent.osd_data != &req->r_data_out); - ceph_osd_data_pages_init(req->r_ops[0].extent.osd_data, pages, - len, 0, !!pool, false); + osd_req_op_extent_osd_data_pages(req, 0, true, pages, len, 0, + !!pool, false); pages = NULL; /* request message now owns the pages array */ pool = NULL; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index cddc10fd7cf9..0f9c4095614b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -574,8 +574,7 @@ more: own_pages = true; } } - BUG_ON(req->r_ops[0].extent.osd_data != &req->r_data_out); - ceph_osd_data_pages_init(req->r_ops[0].extent.osd_data, pages, len, + osd_req_op_extent_osd_data_pages(req, 0, true, pages, len, page_align, false, own_pages); /* BUG_ON(vino.snap != CEPH_NOSNAP); */ diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 71c41575646d..f8a00b48e550 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -240,17 +240,39 @@ extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, u64 truncate_size, u32 truncate_seq); extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, unsigned int which, u64 length); -extern void osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + +extern struct ceph_osd_data *osd_req_op_extent_osd_data( + struct ceph_osd_request *osd_req, + unsigned int which, bool write_request); +extern struct ceph_osd_data *osd_req_op_cls_response_data( + struct ceph_osd_request *osd_req, + unsigned int which); + +extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, + unsigned int which, bool write_request, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); +extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *, + unsigned int which, bool write_request, + struct ceph_pagelist *pagelist); +#ifdef CONFIG_BLOCK +extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, + unsigned int which, bool write_request, + struct bio *bio, size_t bio_length); +#endif /* CONFIG_BLOCK */ + +extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, unsigned int which, - struct ceph_osd_data *osd_data); + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); + extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size); -extern void osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, - unsigned int which, - struct ceph_osd_data *response_data); extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 cookie, u64 version, int flag); @@ -290,17 +312,6 @@ static inline void ceph_osdc_put_request(struct ceph_osd_request *req) kref_put(&req->r_kref, ceph_osdc_release_request); } -extern void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, - struct page **pages, u64 length, - u32 alignment, bool pages_from_pool, - bool own_pages); -extern void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, - struct ceph_pagelist *pagelist); -#ifdef CONFIG_BLOCK -extern void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, - struct bio *bio, size_t bio_length); -#endif /* CONFIG_BLOCK */ - extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, bool nofail); diff 
--git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 932b8af8b8ee..86cb52404f17 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1,3 +1,4 @@ + #include <linux/ceph/ceph_debug.h> #include <linux/module.h> @@ -85,7 +86,7 @@ static void ceph_osd_data_init(struct ceph_osd_data *osd_data) osd_data->type = CEPH_OSD_DATA_TYPE_NONE; } -void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, +static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) { @@ -96,27 +97,131 @@ void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, osd_data->pages_from_pool = pages_from_pool; osd_data->own_pages = own_pages; } -EXPORT_SYMBOL(ceph_osd_data_pages_init); -void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, +static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, struct ceph_pagelist *pagelist) { osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST; osd_data->pagelist = pagelist; } -EXPORT_SYMBOL(ceph_osd_data_pagelist_init); #ifdef CONFIG_BLOCK -void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, +static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, struct bio *bio, size_t bio_length) { osd_data->type = CEPH_OSD_DATA_TYPE_BIO; osd_data->bio = bio; osd_data->bio_length = bio_length; } -EXPORT_SYMBOL(ceph_osd_data_bio_init); #endif /* CONFIG_BLOCK */ +struct ceph_osd_data * +osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + unsigned int which, bool write_request) +{ + BUG_ON(which >= osd_req->r_num_ops); + + /* return &osd_req->r_ops[which].extent.osd_data; */ + return write_request ? &osd_req->r_data_out : &osd_req->r_data_in; +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data); + +struct ceph_osd_data * +osd_req_op_cls_request_info(struct ceph_osd_request *osd_req, + unsigned int which) +{ + BUG_ON(which >= osd_req->r_num_ops); + + /* return &osd_req->r_ops[which].cls.request_info; */ + return &osd_req->r_data_out; /* Request data is outgoing */ +} +EXPORT_SYMBOL(osd_req_op_cls_request_info); /* ??? */ + +struct ceph_osd_data * +osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, + unsigned int which) +{ + BUG_ON(which >= osd_req->r_num_ops); + + /* return &osd_req->r_ops[which].cls.response_data; */ + return &osd_req->r_data_in; /* Response data is incoming */ +} +EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ???
*/ + +void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, bool write_request, + struct page **pages, u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); + + osd_req->r_ops[which].extent.osd_data = + osd_req_op_extent_osd_data(osd_req, which, write_request); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); + +void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, + unsigned int which, bool write_request, + struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); + ceph_osd_data_pagelist_init(osd_data, pagelist); + + osd_req->r_ops[which].extent.osd_data = + osd_req_op_extent_osd_data(osd_req, which, write_request); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); + +#ifdef CONFIG_BLOCK +void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, + unsigned int which, bool write_request, + struct bio *bio, size_t bio_length) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); + ceph_osd_data_bio_init(osd_data, bio, bio_length); + + osd_req->r_ops[which].extent.osd_data = + osd_req_op_extent_osd_data(osd_req, which, write_request); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); +#endif /* CONFIG_BLOCK */ + +static void osd_req_op_cls_request_info_pagelist( + struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_cls_request_info(osd_req, which); + ceph_osd_data_pagelist_init(osd_data, pagelist); + + osd_req->r_ops[which].cls.request_info = + osd_req_op_cls_request_info(osd_req, which); +} + +void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_cls_response_data(osd_req, which); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); + + osd_req->r_ops[which].cls.response_data = + osd_req_op_cls_response_data(osd_req, which); +} +EXPORT_SYMBOL(osd_req_op_cls_response_data_pages); + static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) { switch (osd_data->type) { @@ -385,15 +490,6 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_update); -void osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, - unsigned int which, - struct ceph_osd_data *osd_data) -{ - BUG_ON(which >= osd_req->r_num_ops); - osd_req->r_ops[which].extent.osd_data = osd_data; -} -EXPORT_SYMBOL(osd_req_op_extent_osd_data); - void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size) @@ -429,22 +525,13 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, ceph_pagelist_append(pagelist, request_data, request_data_size); payload_len += request_data_size; - op->cls.request_info = &osd_req->r_data_out; - ceph_osd_data_pagelist_init(op->cls.request_info, pagelist); + osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); op->cls.argc = 0; /* 
currently unused */ op->payload_len = payload_len; } EXPORT_SYMBOL(osd_req_op_cls_init); -void osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, - unsigned int which, - struct ceph_osd_data *response_data) -{ - BUG_ON(which >= osd_req->r_num_ops); - osd_req->r_ops[which].cls.response_data = response_data; -} -EXPORT_SYMBOL(osd_req_op_cls_response_data); void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, @@ -547,7 +634,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, bool use_mempool) { struct ceph_osd_request *req; - struct ceph_osd_data *osd_data; u64 objnum = 0; u64 objoff = 0; u64 objlen = 0; @@ -561,8 +647,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); - osd_data = opcode == CEPH_OSD_OP_WRITE ? &req->r_data_out - : &req->r_data_in; req->r_flags = flags; @@ -585,7 +669,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, osd_req_op_extent_init(req, 0, opcode, objoff, objlen, truncate_size, truncate_seq); - osd_req_op_extent_osd_data(req, 0, osd_data); /* * A second op in the ops array means the caller wants to @@ -2171,8 +2254,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, /* it may be a short read due to an object boundary */ - ceph_osd_data_pages_init(&req->r_data_in, pages, *plen, page_align, - false, false); + osd_req_op_extent_osd_data_pages(req, 0, false, + pages, *plen, page_align, false, false); dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", off, *plen, *plen, page_align); @@ -2214,7 +2297,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, return PTR_ERR(req); /* it may be a short write due to an object boundary */ - ceph_osd_data_pages_init(&req->r_data_out, pages, len, page_align, + osd_req_op_extent_osd_data_pages(req, 0, true, pages, len, page_align, false, false); dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); @@ -2308,8 +2391,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - struct ceph_osd_data *osd_data = &req->r_data_in; + struct ceph_osd_data *osd_data; + /* + * XXX This is assuming there is only one op containing + * XXX page data. Probably OK for reads, but this + * XXX ought to be done more generally. + */ + osd_data = osd_req_op_extent_osd_data(req, 0, false); if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { if (osd_data->pages && unlikely(osd_data->length < data_len)) { -- cgit v1.2.3 From ec9123c56787fa7fb2608f05b19d21c5e1912d87 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: libceph: set the data pointers when encoding ops Still using the osd request r_data_in and r_data_out pointer, but we're basically only referring to it via the data pointers in the osd ops. And we're transferring that information to the request or reply message only when the op indicates it's needed, in osd_req_encode_op(). To avoid a forward reference, ceph_osdc_msg_data_set() was moved up in the file. Don't bother calling ceph_osd_data_init(), in ceph_osd_alloc(), because the ops array will already be zeroed anyway. 
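A rough standalone sketch of that encode-time dispatch (stand-in types and names; the real code works on struct ceph_osd_req_op and the request/reply ceph_msg): each op decides, as it is being encoded, which message its data is attached to, so nothing gets attached for ops that carry no data.

#include <stdio.h>

enum op_type { OP_READ, OP_WRITE, OP_CALL };

struct osd_data { const char *what; };

struct op {
	enum op_type type;
	struct osd_data extent_data;	/* read/write payload */
	struct osd_data request_info;	/* CALL parameters (outgoing) */
	struct osd_data response_data;	/* CALL result (incoming) */
};

static void msg_data_set(const char *msg, const struct osd_data *d)
{
	printf("%s message gets %s\n", msg, d->what);
}

/* Data is attached only when the op being encoded calls for it. */
static void encode_op(const struct op *src)
{
	switch (src->type) {
	case OP_WRITE:
		msg_data_set("request", &src->extent_data);
		break;
	case OP_READ:
		msg_data_set("reply", &src->extent_data);
		break;
	case OP_CALL:
		msg_data_set("reply", &src->response_data);
		msg_data_set("request", &src->request_info);
		break;
	}
}

int main(void)
{
	struct op call = { OP_CALL, { "unused" }, { "class/method pagelist" },
			   { "response pages" } };

	encode_op(&call);
	return 0;
}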
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 2 +- net/ceph/osd_client.c | 63 ++++++++++++++++++++--------------------- 2 files changed, 32 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index f8a00b48e550..dd4ca4ba8cab 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -51,7 +51,7 @@ struct ceph_osd { #define CEPH_OSD_MAX_OP 2 enum ceph_osd_data_type { - CEPH_OSD_DATA_TYPE_NONE, + CEPH_OSD_DATA_TYPE_NONE = 0, CEPH_OSD_DATA_TYPE_PAGES, CEPH_OSD_DATA_TYPE_PAGELIST, #ifdef CONFIG_BLOCK diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 86cb52404f17..cc4003fdc01f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -339,9 +339,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } req->r_reply = msg; - ceph_osd_data_init(&req->r_data_in); - ceph_osd_data_init(&req->r_data_out); - /* create request message; allow space for oid */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); @@ -549,6 +546,28 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_watch_init); +static void ceph_osdc_msg_data_set(struct ceph_msg *msg, + struct ceph_osd_data *osd_data) +{ + u64 length = ceph_osd_data_length(osd_data); + + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + BUG_ON(length > (u64) SIZE_MAX); + if (length) + ceph_msg_data_set_pages(msg, osd_data->pages, + length, osd_data->alignment); + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { + BUG_ON(!length); + ceph_msg_data_set_pagelist(msg, osd_data->pagelist); +#ifdef CONFIG_BLOCK + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { + ceph_msg_data_set_bio(msg, osd_data->bio, length); +#endif + } else { + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); + } +} + static u64 osd_req_encode_op(struct ceph_osd_request *req, struct ceph_osd_op *dst, unsigned int which) { @@ -576,17 +595,24 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); - if (src->op == CEPH_OSD_OP_WRITE) + if (src->op == CEPH_OSD_OP_WRITE) { WARN_ON(src->extent.osd_data != &req->r_data_out); - else + ceph_osdc_msg_data_set(req->r_request, + src->extent.osd_data); + } else { WARN_ON(src->extent.osd_data != &req->r_data_in); + ceph_osdc_msg_data_set(req->r_reply, + src->extent.osd_data); + } break; case CEPH_OSD_OP_CALL: dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; dst->cls.indata_len = cpu_to_le32(src->cls.request_data_len); WARN_ON(src->cls.response_data != &req->r_data_in); + ceph_osdc_msg_data_set(req->r_reply, src->cls.response_data); WARN_ON(src->cls.request_info != &req->r_data_out); + ceph_osdc_msg_data_set(req->r_request, src->cls.request_info); BUG_ON(src->cls.request_info->type != CEPH_OSD_DATA_TYPE_PAGELIST); request_data_len = src->cls.request_info->pagelist->length; @@ -1930,28 +1956,6 @@ bad: return; } -static void ceph_osdc_msg_data_set(struct ceph_msg *msg, - struct ceph_osd_data *osd_data) -{ - u64 length = ceph_osd_data_length(osd_data); - - if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { - BUG_ON(length > (u64) SIZE_MAX); - if (length) - ceph_msg_data_set_pages(msg, osd_data->pages, - length, osd_data->alignment); - } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { - BUG_ON(!length); - ceph_msg_data_set_pagelist(msg, 
osd_data->pagelist); -#ifdef CONFIG_BLOCK - } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { - ceph_msg_data_set_bio(msg, osd_data->bio, length); -#endif - } else { - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); - } -} - /* * build new request AND message * @@ -1967,11 +1971,6 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 data_len; unsigned int i; - /* Set up response incoming data and request outgoing data fields */ - - ceph_osdc_msg_data_set(req->r_reply, &req->r_data_in); - ceph_osdc_msg_data_set(req->r_request, &req->r_data_out); - req->r_snapid = snap_id; req->r_snapc = ceph_get_snap_context(snapc); -- cgit v1.2.3 From 5476492fba9fd0b4118aacf5b924dd29b8cca56c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: libceph: kill off osd request r_data_in and r_data_out Finally! Convert the osd op data pointers into real structures, and make the switch over to using them instead of having all ops share the in and/or out data structures in the osd request. Set up a new function to traverse the set of ops and release any data associated with them (pages). This and the patches leading up to it resolve: http://tracker.ceph.com/issues/4657 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 9 ++--- net/ceph/osd_client.c | 79 ++++++++++++++++++++--------------------- 2 files changed, 42 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index dd4ca4ba8cab..4ec46c0ceaf7 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -87,14 +87,14 @@ struct ceph_osd_req_op { u64 offset, length; u64 truncate_size; u32 truncate_seq; - struct ceph_osd_data *osd_data; + struct ceph_osd_data osd_data; } extent; struct { const char *class_name; const char *method_name; const void *request_data; - struct ceph_osd_data *request_info; - struct ceph_osd_data *response_data; + struct ceph_osd_data request_info; + struct ceph_osd_data response_data; u32 request_data_len; __u8 class_len; __u8 method_len; @@ -164,9 +164,6 @@ struct ceph_osd_request { struct ceph_file_layout r_file_layout; struct ceph_snap_context *r_snapc; /* snap context for writes */ - - struct ceph_osd_data r_data_in; - struct ceph_osd_data r_data_out; }; struct ceph_osd_event { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cc4003fdc01f..2562e4e52245 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -121,8 +121,7 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, { BUG_ON(which >= osd_req->r_num_ops); - /* return &osd_req->r_ops[which].extent.osd_data; */ - return write_request ? &osd_req->r_data_out : &osd_req->r_data_in; + return &osd_req->r_ops[which].extent.osd_data; } EXPORT_SYMBOL(osd_req_op_extent_osd_data); @@ -132,8 +131,7 @@ osd_req_op_cls_request_info(struct ceph_osd_request *osd_req, { BUG_ON(which >= osd_req->r_num_ops); - /* return &osd_req->r_ops[which].cls.request_info; */ - return &osd_req->r_data_out; /* Request data is outgoing */ + return &osd_req->r_ops[which].cls.request_info; } EXPORT_SYMBOL(osd_req_op_cls_request_info); /* ??? 
*/ @@ -143,8 +141,7 @@ osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, { BUG_ON(which >= osd_req->r_num_ops); - /* return &osd_req->r_ops[which].cls.response_data; */ - return &osd_req->r_data_in; /* Response data is incoming */ + return &osd_req->r_ops[which].cls.response_data; } EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */ @@ -158,9 +155,6 @@ void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); - - osd_req->r_ops[which].extent.osd_data = - osd_req_op_extent_osd_data(osd_req, which, write_request); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); @@ -172,9 +166,6 @@ void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); ceph_osd_data_pagelist_init(osd_data, pagelist); - - osd_req->r_ops[which].extent.osd_data = - osd_req_op_extent_osd_data(osd_req, which, write_request); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); @@ -187,9 +178,6 @@ void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); ceph_osd_data_bio_init(osd_data, bio, bio_length); - - osd_req->r_ops[which].extent.osd_data = - osd_req_op_extent_osd_data(osd_req, which, write_request); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); #endif /* CONFIG_BLOCK */ @@ -202,9 +190,6 @@ static void osd_req_op_cls_request_info_pagelist( osd_data = osd_req_op_cls_request_info(osd_req, which); ceph_osd_data_pagelist_init(osd_data, pagelist); - - osd_req->r_ops[which].cls.request_info = - osd_req_op_cls_request_info(osd_req, which); } void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, @@ -216,9 +201,6 @@ void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, osd_data = osd_req_op_cls_response_data(osd_req, which); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); - - osd_req->r_ops[which].cls.response_data = - osd_req_op_cls_response_data(osd_req, which); } EXPORT_SYMBOL(osd_req_op_cls_response_data_pages); @@ -241,18 +223,39 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) } } + static void ceph_osd_data_release(struct ceph_osd_data *osd_data) { - if (osd_data->type != CEPH_OSD_DATA_TYPE_PAGES) - return; - - if (osd_data->own_pages) { + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) { int num_pages; num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); ceph_release_page_vector(osd_data->pages, num_pages); } + ceph_osd_data_init(osd_data); +} + +static void osd_req_op_data_release(struct ceph_osd_request *osd_req, + unsigned int which) +{ + struct ceph_osd_req_op *op; + + BUG_ON(which >= osd_req->r_num_ops); + op = &osd_req->r_ops[which]; + + switch (op->op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + ceph_osd_data_release(&op->extent.osd_data); + break; + case CEPH_OSD_OP_CALL: + ceph_osd_data_release(&op->cls.request_info); + ceph_osd_data_release(&op->cls.response_data); + break; + default: + break; + } } /* @@ -261,6 +264,7 @@ static void ceph_osd_data_release(struct ceph_osd_data *osd_data) void ceph_osdc_release_request(struct kref *kref) { struct ceph_osd_request *req; + unsigned int which; req = container_of(kref, struct ceph_osd_request, r_kref); if (req->r_request) @@ -270,8 +274,8 @@ void 
ceph_osdc_release_request(struct kref *kref) ceph_msg_put(req->r_reply); } - ceph_osd_data_release(&req->r_data_in); - ceph_osd_data_release(&req->r_data_out); + for (which = 0; which < req->r_num_ops; which++) + osd_req_op_data_release(req, which); ceph_put_snap_context(req->r_snapc); if (req->r_mempool) @@ -595,27 +599,22 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); - if (src->op == CEPH_OSD_OP_WRITE) { - WARN_ON(src->extent.osd_data != &req->r_data_out); + if (src->op == CEPH_OSD_OP_WRITE) ceph_osdc_msg_data_set(req->r_request, - src->extent.osd_data); - } else { - WARN_ON(src->extent.osd_data != &req->r_data_in); + &src->extent.osd_data); + else ceph_osdc_msg_data_set(req->r_reply, - src->extent.osd_data); - } + &src->extent.osd_data); break; case CEPH_OSD_OP_CALL: dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; dst->cls.indata_len = cpu_to_le32(src->cls.request_data_len); - WARN_ON(src->cls.response_data != &req->r_data_in); - ceph_osdc_msg_data_set(req->r_reply, src->cls.response_data); - WARN_ON(src->cls.request_info != &req->r_data_out); - ceph_osdc_msg_data_set(req->r_request, src->cls.request_info); - BUG_ON(src->cls.request_info->type != + ceph_osdc_msg_data_set(req->r_reply, &src->cls.response_data); + ceph_osdc_msg_data_set(req->r_request, &src->cls.request_info); + BUG_ON(src->cls.request_info.type != CEPH_OSD_DATA_TYPE_PAGELIST); - request_data_len = src->cls.request_info->pagelist->length; + request_data_len = src->cls.request_info.pagelist->length; break; case CEPH_OSD_OP_STARTSYNC: break; -- cgit v1.2.3 From ea96571f7b865edaf1acd472e6f2cddc9fb67892 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 14:46:01 -0500 Subject: libceph: fix possible CONFIG_BLOCK build problem This patch: 15a0d7b libceph: record message data length did not enclose some bio-specific code inside CONFIG_BLOCK as it should have. Fix that. 
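The pattern of the fix, reduced to a standalone toy (HAVE_BLOCK is a hypothetical stand-in for the kernel's CONFIG_BLOCK): the declaration and the definition carry the same guard, so a build without block support never references a bio symbol.

#include <stddef.h>
#include <stdio.h>

/*
 * Stand-in for CONFIG_BLOCK: compile with -DHAVE_BLOCK to get the
 * bio path, or without it to confirm no bio symbol is referenced.
 */
#ifdef HAVE_BLOCK
struct bio { int vcnt; };

static void msg_data_set_bio(struct bio *bio, size_t length)
{
	printf("bio: %d vecs, %zu bytes\n", bio->vcnt, length);
}
#endif /* HAVE_BLOCK */

int main(void)
{
#ifdef HAVE_BLOCK
	struct bio b = { 4 };

	msg_data_set_bio(&b, 4096);
#else
	puts("built without block support");
#endif /* HAVE_BLOCK */
	return 0;
}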
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 2 ++ net/ceph/messenger.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index b832c0ce899a..cdeebae03e0d 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -271,8 +271,10 @@ extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment); extern void ceph_msg_data_set_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist); +#ifdef CONFIG_BLOCK extern void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, size_t length); +#endif /* CONFIG_BLOCK */ extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index a6fda9532102..994192beda02 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -817,7 +817,7 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data *data, size_t bytes) return true; } -#endif +#endif /* CONFIG_BLOCK */ /* * For a page array, a piece comes from the first page in the array @@ -3011,6 +3011,7 @@ void ceph_msg_data_set_pagelist(struct ceph_msg *msg, } EXPORT_SYMBOL(ceph_msg_data_set_pagelist); +#ifdef CONFIG_BLOCK void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, size_t length) { @@ -3028,6 +3029,7 @@ void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, msg->data_length = length; } EXPORT_SYMBOL(ceph_msg_data_set_bio); +#endif /* CONFIG_BLOCK */ /* * construct a new message with given type, size -- cgit v1.2.3 From f759ebb968dbf185fc079dd2e824b1aa3a3d71aa Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 14:46:01 -0500 Subject: libceph: skip message if too big to receive We know the length of our message buffers. If we get a message that's too long, just dump it and ignore it. If skip was set then con->in_msg won't be valid, so be careful not to dereference a null pointer in the process. This resolves: http://tracker.ceph.com/issues/4664 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 994192beda02..cb5b4e6733f0 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2207,10 +2207,18 @@ static int read_partial_message(struct ceph_connection *con) ret = ceph_con_in_msg_alloc(con, &skip); if (ret < 0) return ret; + + BUG_ON(!con->in_msg ^ skip); + if (con->in_msg && data_len > con->in_msg->data_length) { + pr_warning("%s skipping long message (%u > %zd)\n", + __func__, data_len, con->in_msg->data_length); + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + skip = 1; + } if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); - BUG_ON(con->in_msg); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; -- cgit v1.2.3 From c851c49591ebf000c610711e39eea7da5ff05b21 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 14:46:01 -0500 Subject: libceph: record bio length The bio is the only data item type that doesn't record its full length. Fix that. 
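A condensed sketch of the resulting shape (stand-in types; C11 anonymous struct and union members, which the kernel header relies on as well): the bio arm of the data union grows a recorded byte count, mirroring the length the pages arm already carries.

#include <stddef.h>
#include <stdio.h>

struct bio;	/* opaque here; only the pointer matters for the sketch */

struct msg_data {
	enum { DATA_PAGES, DATA_BIO } type;
	union {
		struct {		/* bio now records its byte count */
			struct bio *bio;
			size_t bio_length;
		};
		struct {
			void **pages;
			size_t length;	/* pages always carried one */
		};
	};
};

int main(void)
{
	struct msg_data d = { .type = DATA_BIO };

	d.bio = NULL;
	d.bio_length = 8192;
	printf("bio data item: %zu bytes\n", d.bio_length);
	return 0;
}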
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 5 ++++- net/ceph/messenger.c | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index cdeebae03e0d..4fb870a5b5fc 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -116,7 +116,10 @@ struct ceph_msg_data { enum ceph_msg_data_type type; union { #ifdef CONFIG_BLOCK - struct bio *bio; + struct { + struct bio *bio; + size_t bio_length; + }; #endif /* CONFIG_BLOCK */ struct { struct page **pages; /* NOT OWNER. */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index cb5b4e6733f0..731bb9efa2c6 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3032,6 +3032,7 @@ void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); BUG_ON(!data); data->bio = bio; + data->bio_length = length; msg->data = data; msg->data_length = length; -- cgit v1.2.3 From 36153ec9dd6287d7cedf6afb51453c445d946cee Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:06 -0500 Subject: libceph: move cursor into message A message will only be processing a single data item at a time, so there's no need for each data item to have its own cursor. Move the cursor embedded in the message data structure into the message itself. To minimize the impact, keep the data->cursor field, but make it be a pointer to the cursor in the message. Move the definition of ceph_msg_data above ceph_msg_data_cursor so the cursor can point to the data without a forward definition rather than vice-versa. This and the upcoming patches are part of: http://tracker.ceph.com/issues/3761 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 43 +++++++++++++++++++++--------------------- net/ceph/messenger.c | 35 ++++++++++++++++++---------------- 2 files changed, 41 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 4fb870a5b5fc..e7557242817c 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -88,6 +88,25 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) } } +struct ceph_msg_data { + enum ceph_msg_data_type type; + union { +#ifdef CONFIG_BLOCK + struct { + struct bio *bio; + size_t bio_length; + }; +#endif /* CONFIG_BLOCK */ + struct { + struct page **pages; /* NOT OWNER. */ + size_t length; /* total # bytes */ + unsigned int alignment; /* first page */ + }; + struct ceph_pagelist *pagelist; + }; + struct ceph_msg_data_cursor *cursor; +}; + struct ceph_msg_data_cursor { size_t resid; /* bytes not yet consumed */ bool last_piece; /* now at last piece of data item */ @@ -112,25 +131,6 @@ struct ceph_msg_data_cursor { }; }; -struct ceph_msg_data { - enum ceph_msg_data_type type; - union { -#ifdef CONFIG_BLOCK - struct { - struct bio *bio; - size_t bio_length; - }; -#endif /* CONFIG_BLOCK */ - struct { - struct page **pages; /* NOT OWNER. */ - size_t length; /* total # bytes */ - unsigned int alignment; /* first page */ - }; - struct ceph_pagelist *pagelist; - }; - struct ceph_msg_data_cursor cursor; /* pagelist only */ -}; - /* * a single message. 
it contains a header (src, dest, message type, etc.), * footer (crc values, mainly), a "front" message body, and possibly a @@ -142,8 +142,9 @@ struct ceph_msg { struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; - size_t data_length; - struct ceph_msg_data *data; /* data payload */ + size_t data_length; + struct ceph_msg_data *data; + struct ceph_msg_data_cursor cursor; struct ceph_connection *con; struct list_head list_head; /* links for connection lists */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 731bb9efa2c6..4626da34a5c3 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -725,7 +725,7 @@ static void con_out_kvec_add(struct ceph_connection *con, static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data *data, size_t length) { - struct ceph_msg_data_cursor *cursor = &data->cursor; + struct ceph_msg_data_cursor *cursor = data->cursor; struct bio *bio; BUG_ON(data->type != CEPH_MSG_DATA_BIO); @@ -745,7 +745,7 @@ static struct page *ceph_msg_data_bio_next(struct ceph_msg_data *data, size_t *page_offset, size_t *length) { - struct ceph_msg_data_cursor *cursor = &data->cursor; + struct ceph_msg_data_cursor *cursor = data->cursor; struct bio *bio; struct bio_vec *bio_vec; unsigned int index; @@ -774,7 +774,7 @@ static struct page *ceph_msg_data_bio_next(struct ceph_msg_data *data, static bool ceph_msg_data_bio_advance(struct ceph_msg_data *data, size_t bytes) { - struct ceph_msg_data_cursor *cursor = &data->cursor; + struct ceph_msg_data_cursor *cursor = data->cursor; struct bio *bio; struct bio_vec *bio_vec; unsigned int index; @@ -826,7 +826,7 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data *data, size_t bytes) static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data *data, size_t length) { - struct ceph_msg_data_cursor *cursor = &data->cursor; + struct ceph_msg_data_cursor *cursor = data->cursor; int page_count; BUG_ON(data->type != CEPH_MSG_DATA_PAGES); @@ -849,7 +849,7 @@ static struct page *ceph_msg_data_pages_next(struct ceph_msg_data *data, size_t *page_offset, size_t *length) { - struct ceph_msg_data_cursor *cursor = &data->cursor; + struct ceph_msg_data_cursor *cursor = data->cursor; BUG_ON(data->type != CEPH_MSG_DATA_PAGES); @@ -868,7 +868,7 @@ static struct page *ceph_msg_data_pages_next(struct ceph_msg_data *data, static bool ceph_msg_data_pages_advance(struct ceph_msg_data *data, size_t bytes) { - struct ceph_msg_data_cursor *cursor = &data->cursor; + struct ceph_msg_data_cursor *cursor = data->cursor; BUG_ON(data->type != CEPH_MSG_DATA_PAGES); @@ -897,7 +897,7 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data *data, static void ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data *data, size_t length) { - struct ceph_msg_data_cursor *cursor = &data->cursor; + struct ceph_msg_data_cursor *cursor = data->cursor; struct ceph_pagelist *pagelist; struct page *page; @@ -923,7 +923,7 @@ static struct page *ceph_msg_data_pagelist_next(struct ceph_msg_data *data, size_t *page_offset, size_t *length) { - struct ceph_msg_data_cursor *cursor = &data->cursor; + struct ceph_msg_data_cursor *cursor = data->cursor; struct ceph_pagelist *pagelist; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); @@ -941,13 +941,13 @@ static struct page *ceph_msg_data_pagelist_next(struct ceph_msg_data *data, else *length = PAGE_SIZE - *page_offset; - return data->cursor.page; + return data->cursor->page; } static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data *data, size_t bytes) { - struct 
ceph_msg_data_cursor *cursor = &data->cursor; + struct ceph_msg_data_cursor *cursor = data->cursor; struct ceph_pagelist *pagelist; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); @@ -1003,7 +1003,7 @@ static void ceph_msg_data_cursor_init(struct ceph_msg_data *data, /* BUG(); */ break; } - data->cursor.need_crc = true; + data->cursor->need_crc = true; } /* @@ -1039,7 +1039,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data *data, BUG_ON(*page_offset + *length > PAGE_SIZE); BUG_ON(!*length); if (last_piece) - *last_piece = data->cursor.last_piece; + *last_piece = data->cursor->last_piece; return page; } @@ -1050,7 +1050,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data *data, */ static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) { - struct ceph_msg_data_cursor *cursor = &data->cursor; + struct ceph_msg_data_cursor *cursor = data->cursor; bool new_piece; BUG_ON(bytes > cursor->resid); @@ -1071,7 +1071,7 @@ static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) BUG(); break; } - data->cursor.need_crc = new_piece; + data->cursor->need_crc = new_piece; return new_piece; } @@ -1404,7 +1404,7 @@ static u32 ceph_crc32c_page(u32 crc, struct page *page, static int write_partial_message_data(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; - struct ceph_msg_data_cursor *cursor = &msg->data->cursor; + struct ceph_msg_data_cursor *cursor = msg->data->cursor; bool do_datacrc = !con->msgr->nocrc; u32 crc; @@ -2102,7 +2102,7 @@ static int read_partial_message_section(struct ceph_connection *con, static int read_partial_msg_data(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; - struct ceph_msg_data_cursor *cursor = &msg->data->cursor; + struct ceph_msg_data_cursor *cursor = msg->data->cursor; const bool do_datacrc = !con->msgr->nocrc; struct page *page; size_t page_offset; @@ -2991,6 +2991,7 @@ void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); BUG_ON(!data); + data->cursor = &msg->cursor; data->pages = pages; data->length = length; data->alignment = alignment & ~PAGE_MASK; @@ -3012,6 +3013,7 @@ void ceph_msg_data_set_pagelist(struct ceph_msg *msg, data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); BUG_ON(!data); + data->cursor = &msg->cursor; data->pagelist = pagelist; msg->data = data; @@ -3031,6 +3033,7 @@ void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); BUG_ON(!data); + data->cursor = &msg->cursor; data->bio = bio; data->bio_length = length; -- cgit v1.2.3 From 8ae4f4f5c056150d5480710ab356801e84d01a3d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:06 -0500 Subject: libceph: have cursor point to data Rather than having a ceph message data item point to the cursor it's associated with, have the cursor point to a data item. This will allow a message cursor to be used for more than one data item. 
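A minimal model of the inversion (stand-in types): the single per-message cursor records which data item it is currently walking, instead of each item pointing back at the cursor, so one cursor can serve several items in turn.

#include <stddef.h>
#include <stdio.h>

struct msg_data {
	const char *name;
	size_t length;
};

struct msg_cursor {
	struct msg_data *data;	/* the item this cursor is walking */
	size_t resid;		/* bytes of it not yet consumed */
};

static void cursor_init(struct msg_cursor *cursor, struct msg_data *data)
{
	cursor->data = data;
	cursor->resid = data->length;
}

int main(void)
{
	struct msg_data items[] = { { "pagelist", 512 }, { "pages", 4096 } };
	struct msg_cursor cursor;	/* one cursor, many items */
	size_t i;

	for (i = 0; i < sizeof(items) / sizeof(items[0]); i++) {
		cursor_init(&cursor, &items[i]);
		printf("walking %s: %zu bytes\n", cursor.data->name,
		       cursor.resid);
	}
	return 0;
}

The list-based patch that follows depends on exactly this property: the one cursor can be re-pointed at each entry of the message's data list.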
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 8 +-- net/ceph/messenger.c | 113 ++++++++++++++++++++--------------------- 2 files changed, 59 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index e7557242817c..8846ff610502 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -104,13 +104,13 @@ struct ceph_msg_data { }; struct ceph_pagelist *pagelist; }; - struct ceph_msg_data_cursor *cursor; }; struct ceph_msg_data_cursor { - size_t resid; /* bytes not yet consumed */ - bool last_piece; /* now at last piece of data item */ - bool need_crc; /* new piece; crc update needed */ + struct ceph_msg_data *data; /* data item this describes */ + size_t resid; /* bytes not yet consumed */ + bool last_piece; /* current is last piece */ + bool need_crc; /* crc update needed */ union { #ifdef CONFIG_BLOCK struct { /* bio */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4626da34a5c3..3aa0f30c3c5e 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -722,10 +722,10 @@ static void con_out_kvec_add(struct ceph_connection *con, * entry in the current bio iovec, or the first entry in the next * bio in the list. */ -static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data *data, +static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { - struct ceph_msg_data_cursor *cursor = data->cursor; + struct ceph_msg_data *data = cursor->data; struct bio *bio; BUG_ON(data->type != CEPH_MSG_DATA_BIO); @@ -741,11 +741,11 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data *data, cursor->last_piece = length <= bio->bi_io_vec[0].bv_len; } -static struct page *ceph_msg_data_bio_next(struct ceph_msg_data *data, +static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { - struct ceph_msg_data_cursor *cursor = data->cursor; + struct ceph_msg_data *data = cursor->data; struct bio *bio; struct bio_vec *bio_vec; unsigned int index; @@ -772,14 +772,14 @@ static struct page *ceph_msg_data_bio_next(struct ceph_msg_data *data, return bio_vec->bv_page; } -static bool ceph_msg_data_bio_advance(struct ceph_msg_data *data, size_t bytes) +static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) { - struct ceph_msg_data_cursor *cursor = data->cursor; struct bio *bio; struct bio_vec *bio_vec; unsigned int index; - BUG_ON(data->type != CEPH_MSG_DATA_BIO); + BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO); bio = cursor->bio; BUG_ON(!bio); @@ -823,10 +823,10 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data *data, size_t bytes) * For a page array, a piece comes from the first page in the array * that has not already been fully consumed. 
*/ -static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data *data, +static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { - struct ceph_msg_data_cursor *cursor = data->cursor; + struct ceph_msg_data *data = cursor->data; int page_count; BUG_ON(data->type != CEPH_MSG_DATA_PAGES); @@ -845,11 +845,11 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data *data, cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE; } -static struct page *ceph_msg_data_pages_next(struct ceph_msg_data *data, - size_t *page_offset, - size_t *length) +static struct page * +ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length) { - struct ceph_msg_data_cursor *cursor = data->cursor; + struct ceph_msg_data *data = cursor->data; BUG_ON(data->type != CEPH_MSG_DATA_PAGES); @@ -865,12 +865,10 @@ static struct page *ceph_msg_data_pages_next(struct ceph_msg_data *data, return data->pages[cursor->page_index]; } -static bool ceph_msg_data_pages_advance(struct ceph_msg_data *data, +static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { - struct ceph_msg_data_cursor *cursor = data->cursor; - - BUG_ON(data->type != CEPH_MSG_DATA_PAGES); + BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES); BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); @@ -894,10 +892,11 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data *data, * For a pagelist, a piece is whatever remains to be consumed in the * first page in the list, or the front of the next page. */ -static void ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data *data, +static void +ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { - struct ceph_msg_data_cursor *cursor = data->cursor; + struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; struct page *page; @@ -919,11 +918,11 @@ static void ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data *data, cursor->last_piece = length <= PAGE_SIZE; } -static struct page *ceph_msg_data_pagelist_next(struct ceph_msg_data *data, - size_t *page_offset, - size_t *length) +static struct page * +ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length) { - struct ceph_msg_data_cursor *cursor = data->cursor; + struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); @@ -941,13 +940,13 @@ static struct page *ceph_msg_data_pagelist_next(struct ceph_msg_data *data, else *length = PAGE_SIZE - *page_offset; - return data->cursor->page; + return cursor->page; } -static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data *data, +static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { - struct ceph_msg_data_cursor *cursor = data->cursor; + struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); @@ -983,19 +982,21 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data *data, * be processed in that piece. It also tracks whether the current * piece is the last one in the data item. 
*/ -static void ceph_msg_data_cursor_init(struct ceph_msg_data *data, - size_t length) +static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) { - switch (data->type) { + struct ceph_msg_data_cursor *cursor = &msg->cursor; + + cursor->data = msg->data; + switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: - ceph_msg_data_pagelist_cursor_init(data, length); + ceph_msg_data_pagelist_cursor_init(cursor, length); break; case CEPH_MSG_DATA_PAGES: - ceph_msg_data_pages_cursor_init(data, length); + ceph_msg_data_pages_cursor_init(cursor, length); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: - ceph_msg_data_bio_cursor_init(data, length); + ceph_msg_data_bio_cursor_init(cursor, length); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_NONE: @@ -1003,7 +1004,7 @@ static void ceph_msg_data_cursor_init(struct ceph_msg_data *data, /* BUG(); */ break; } - data->cursor->need_crc = true; + cursor->need_crc = true; } /* @@ -1011,23 +1012,22 @@ static void ceph_msg_data_cursor_init(struct ceph_msg_data *data, * data item, and supply the page offset and length of that piece. * Indicate whether this is the last piece in this data item. */ -static struct page *ceph_msg_data_next(struct ceph_msg_data *data, - size_t *page_offset, - size_t *length, +static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, bool *last_piece) { struct page *page; - switch (data->type) { + switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: - page = ceph_msg_data_pagelist_next(data, page_offset, length); + page = ceph_msg_data_pagelist_next(cursor, page_offset, length); break; case CEPH_MSG_DATA_PAGES: - page = ceph_msg_data_pages_next(data, page_offset, length); + page = ceph_msg_data_pages_next(cursor, page_offset, length); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: - page = ceph_msg_data_bio_next(data, page_offset, length); + page = ceph_msg_data_bio_next(cursor, page_offset, length); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_NONE: @@ -1039,7 +1039,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data *data, BUG_ON(*page_offset + *length > PAGE_SIZE); BUG_ON(!*length); if (last_piece) - *last_piece = data->cursor->last_piece; + *last_piece = cursor->last_piece; return page; } @@ -1048,22 +1048,22 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data *data, * Returns true if the result moves the cursor on to the next piece * of the data item. 
*/ -static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) +static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) { - struct ceph_msg_data_cursor *cursor = data->cursor; bool new_piece; BUG_ON(bytes > cursor->resid); - switch (data->type) { + switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: - new_piece = ceph_msg_data_pagelist_advance(data, bytes); + new_piece = ceph_msg_data_pagelist_advance(cursor, bytes); break; case CEPH_MSG_DATA_PAGES: - new_piece = ceph_msg_data_pages_advance(data, bytes); + new_piece = ceph_msg_data_pages_advance(cursor, bytes); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: - new_piece = ceph_msg_data_bio_advance(data, bytes); + new_piece = ceph_msg_data_bio_advance(cursor, bytes); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_NONE: @@ -1071,7 +1071,7 @@ static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes) BUG(); break; } - data->cursor->need_crc = new_piece; + cursor->need_crc = new_piece; return new_piece; } @@ -1083,7 +1083,7 @@ static void prepare_message_data(struct ceph_msg *msg, u32 data_len) /* Initialize data cursor */ - ceph_msg_data_cursor_init(msg->data, (size_t)data_len); + ceph_msg_data_cursor_init(msg, (size_t)data_len); } /* @@ -1404,7 +1404,7 @@ static u32 ceph_crc32c_page(u32 crc, struct page *page, static int write_partial_message_data(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; - struct ceph_msg_data_cursor *cursor = msg->data->cursor; + struct ceph_msg_data_cursor *cursor = &msg->cursor; bool do_datacrc = !con->msgr->nocrc; u32 crc; @@ -1430,7 +1430,7 @@ static int write_partial_message_data(struct ceph_connection *con) bool need_crc; int ret; - page = ceph_msg_data_next(msg->data, &page_offset, &length, + page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, &last_piece); ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, last_piece); @@ -1442,7 +1442,7 @@ static int write_partial_message_data(struct ceph_connection *con) } if (do_datacrc && cursor->need_crc) crc = ceph_crc32c_page(crc, page, page_offset, length); - need_crc = ceph_msg_data_advance(msg->data, (size_t)ret); + need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret); } dout("%s %p msg %p done\n", __func__, con, msg); @@ -2102,7 +2102,7 @@ static int read_partial_message_section(struct ceph_connection *con, static int read_partial_msg_data(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; - struct ceph_msg_data_cursor *cursor = msg->data->cursor; + struct ceph_msg_data_cursor *cursor = &msg->cursor; const bool do_datacrc = !con->msgr->nocrc; struct page *page; size_t page_offset; @@ -2117,7 +2117,7 @@ static int read_partial_msg_data(struct ceph_connection *con) if (do_datacrc) crc = con->in_data_crc; while (cursor->resid) { - page = ceph_msg_data_next(msg->data, &page_offset, &length, + page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, NULL); ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); if (ret <= 0) { @@ -2129,7 +2129,7 @@ static int read_partial_msg_data(struct ceph_connection *con) if (do_datacrc) crc = ceph_crc32c_page(crc, page, page_offset, ret); - (void) ceph_msg_data_advance(msg->data, (size_t)ret); + (void) ceph_msg_data_advance(&msg->cursor, (size_t)ret); } if (do_datacrc) con->in_data_crc = crc; @@ -2991,7 +2991,6 @@ void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); BUG_ON(!data); - data->cursor = 
&msg->cursor; data->pages = pages; data->length = length; data->alignment = alignment & ~PAGE_MASK; @@ -3013,7 +3012,6 @@ void ceph_msg_data_set_pagelist(struct ceph_msg *msg, data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); BUG_ON(!data); - data->cursor = &msg->cursor; data->pagelist = pagelist; msg->data = data; @@ -3033,7 +3031,6 @@ void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); BUG_ON(!data); - data->cursor = &msg->cursor; data->bio = bio; data->bio_length = length; -- cgit v1.2.3 From 5240d9f95dfe0f0701b35fbff1cb5b70825ad23f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:06 -0500 Subject: libceph: replace message data pointer with list In place of the message data pointer, use a list head which links through message data items. For now we only support a single entry on that list. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 3 ++- net/ceph/messenger.c | 46 ++++++++++++++++++++++++++++-------------- 2 files changed, 33 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 8846ff610502..318da0170a1e 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -89,6 +89,7 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) } struct ceph_msg_data { + struct list_head links; /* ceph_msg->data */ enum ceph_msg_data_type type; union { #ifdef CONFIG_BLOCK @@ -143,7 +144,7 @@ struct ceph_msg { struct ceph_buffer *middle; size_t data_length; - struct ceph_msg_data *data; + struct list_head data; struct ceph_msg_data_cursor cursor; struct ceph_connection *con; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3aa0f30c3c5e..8bfe7d34bc85 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -985,8 +985,10 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) { struct ceph_msg_data_cursor *cursor = &msg->cursor; + struct ceph_msg_data *data; - cursor->data = msg->data; + data = list_first_entry(&msg->data, struct ceph_msg_data, links); + cursor->data = data; switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: ceph_msg_data_pagelist_cursor_init(cursor, length); @@ -1410,7 +1412,7 @@ static int write_partial_message_data(struct ceph_connection *con) dout("%s %p msg %p\n", __func__, con, msg); - if (WARN_ON(!msg->data)) + if (list_empty(&msg->data)) return -EINVAL; /* @@ -2111,7 +2113,7 @@ static int read_partial_msg_data(struct ceph_connection *con) int ret; BUG_ON(!msg); - if (!msg->data) + if (list_empty(&msg->data)) return -EIO; if (do_datacrc) @@ -2963,6 +2965,7 @@ static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) data = kzalloc(sizeof (*data), GFP_NOFS); if (data) data->type = type; + INIT_LIST_HEAD(&data->links); return data; } @@ -2972,6 +2975,7 @@ static void ceph_msg_data_destroy(struct ceph_msg_data *data) if (!data) return; + WARN_ON(!list_empty(&data->links)); if (data->type == CEPH_MSG_DATA_PAGELIST) { ceph_pagelist_release(data->pagelist); kfree(data->pagelist); @@ -2987,7 +2991,7 @@ void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, BUG_ON(!pages); BUG_ON(!length); BUG_ON(msg->data_length); - BUG_ON(msg->data != NULL); + BUG_ON(!list_empty(&msg->data)); data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); BUG_ON(!data); @@ -2995,8 +2999,9 @@ 
void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, data->length = length; data->alignment = alignment & ~PAGE_MASK; - msg->data = data; - msg->data_length = length; + BUG_ON(!list_empty(&msg->data)); + list_add_tail(&data->links, &msg->data); + msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_set_pages); @@ -3008,14 +3013,14 @@ void ceph_msg_data_set_pagelist(struct ceph_msg *msg, BUG_ON(!pagelist); BUG_ON(!pagelist->length); BUG_ON(msg->data_length); - BUG_ON(msg->data != NULL); + BUG_ON(!list_empty(&msg->data)); data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); BUG_ON(!data); data->pagelist = pagelist; - msg->data = data; - msg->data_length = pagelist->length; + list_add_tail(&data->links, &msg->data); + msg->data_length += pagelist->length; } EXPORT_SYMBOL(ceph_msg_data_set_pagelist); @@ -3027,15 +3032,15 @@ void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, BUG_ON(!bio); BUG_ON(msg->data_length); - BUG_ON(msg->data != NULL); + BUG_ON(!list_empty(&msg->data)); data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); BUG_ON(!data); data->bio = bio; data->bio_length = length; - msg->data = data; - msg->data_length = length; + list_add_tail(&data->links, &msg->data); + msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_set_bio); #endif /* CONFIG_BLOCK */ @@ -3059,6 +3064,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, INIT_LIST_HEAD(&m->list_head); kref_init(&m->kref); + INIT_LIST_HEAD(&m->data); /* front */ m->front_max = front_len; @@ -3204,6 +3210,9 @@ void ceph_msg_kfree(struct ceph_msg *m) void ceph_msg_last_put(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); + LIST_HEAD(data); + struct list_head *links; + struct list_head *next; dout("ceph_msg_put last one on %p\n", m); WARN_ON(!list_empty(&m->list_head)); @@ -3213,8 +3222,15 @@ void ceph_msg_last_put(struct kref *kref) ceph_buffer_put(m->middle); m->middle = NULL; } - ceph_msg_data_destroy(m->data); - m->data = NULL; + + list_splice_init(&m->data, &data); + list_for_each_safe(links, next, &data) { + struct ceph_msg_data *data; + + data = list_entry(links, struct ceph_msg_data, links); + list_del_init(links); + ceph_msg_data_destroy(data); + } m->data_length = 0; if (m->pool) @@ -3227,7 +3243,7 @@ EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, - msg->front_max, msg->data->length); + msg->front_max, msg->data_length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); -- cgit v1.2.3 From ca8b3a69174b04376722672d7dd6b666a7f17c50 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 14:46:01 -0500 Subject: libceph: implement multiple data items in a message This patch adds support to the messenger for more than one data item in its data list. A message data cursor has two more fields to support this: - a count of the number of bytes left to be consumed across all data items in the list, "total_resid" - a pointer to the head of the list (for validation only) The cursor initialization routine has been split into two parts: the outer one, which initializes the cursor for traversing the entire list of data items; and the inner one, which initializes the cursor to start processing a single data item. When a message cursor is first initialized, the outer initialization routine sets total_resid to the length provided. The data pointer is initialized to the first data item on the list. 
From there, the inner initialization routine finishes by setting up to process the data item the cursor points to. Advancing the cursor consumes bytes in total_resid. If the resid field reaches zero, it means the current data item is fully consumed. If total_resid indicates there is more data, the cursor is advanced to point to the next data item, and then the inner initialization routine prepares for using that. (A check is made at this point to make sure we don't wrap around the front of the list.) The type-specific init routines are modified so they can be given a length that's larger than what the data item can support. The resid field is initialized to the smaller of the provided length and the length of the entire data item. When total_resid reaches zero, we're done. This resolves: http://tracker.ceph.com/issues/3761 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 5 ++++- net/ceph/messenger.c | 48 ++++++++++++++++++++++++++++-------------- 2 files changed, 36 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 318da0170a1e..de1d2e1ecce2 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -108,7 +108,10 @@ struct ceph_msg_data { }; struct ceph_msg_data_cursor { - struct ceph_msg_data *data; /* data item this describes */ + size_t total_resid; /* across all data items */ + struct list_head *data_head; /* = &ceph_msg->data */ + + struct ceph_msg_data *data; /* current data item */ size_t resid; /* bytes not yet consumed */ bool last_piece; /* current is last piece */ bool need_crc; /* crc update needed */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 8bfe7d34bc85..84703e550c26 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -734,7 +734,7 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, BUG_ON(!bio); BUG_ON(!bio->bi_vcnt); - cursor->resid = length; + cursor->resid = min(length, data->bio_length); cursor->bio = bio; cursor->vector_index = 0; cursor->vector_offset = 0; @@ -833,9 +833,8 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, BUG_ON(!data->pages); BUG_ON(!data->length); - BUG_ON(length > data->length); /* short reads are OK */ - cursor->resid = length; + cursor->resid = min(length, data->length); page_count = calc_pages_for(data->alignment, (u64)data->length); cursor->page_offset = data->alignment & ~PAGE_MASK; cursor->page_index = 0; @@ -904,7 +903,6 @@ ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, pagelist = data->pagelist; BUG_ON(!pagelist); - BUG_ON(length > pagelist->length); /* short reads are OK */ if (!length) return; /* pagelist can be assigned but empty */ @@ -912,7 +910,7 @@ ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, BUG_ON(list_empty(&pagelist->head)); page = list_first_entry(&pagelist->head, struct page, lru); - cursor->resid = length; + cursor->resid = min(length, pagelist->length); cursor->page = page; cursor->offset = 0; cursor->last_piece = length <= PAGE_SIZE; @@ -982,13 +980,10 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, * be processed in that piece. It also tracks whether the current * piece is the last one in the data item. 
*/ -static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) +static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) { - struct ceph_msg_data_cursor *cursor = &msg->cursor; - struct ceph_msg_data *data; + size_t length = cursor->total_resid; - data = list_first_entry(&msg->data, struct ceph_msg_data, links); - cursor->data = data; switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: ceph_msg_data_pagelist_cursor_init(cursor, length); @@ -1009,6 +1004,25 @@ static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) cursor->need_crc = true; } +static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) +{ + struct ceph_msg_data_cursor *cursor = &msg->cursor; + struct ceph_msg_data *data; + + BUG_ON(!length); + BUG_ON(length > msg->data_length); + BUG_ON(list_empty(&msg->data)); + + data = list_first_entry(&msg->data, struct ceph_msg_data, links); + + cursor->data_head = &msg->data; + cursor->total_resid = length; + data = list_first_entry(&msg->data, struct ceph_msg_data, links); + cursor->data = data; + + __ceph_msg_data_cursor_init(cursor); +} + /* * Return the page containing the next piece to process for a given * data item, and supply the page offset and length of that piece. @@ -1073,8 +1087,16 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, BUG(); break; } + cursor->total_resid -= bytes; cursor->need_crc = new_piece; + if (!cursor->resid && cursor->total_resid) { + WARN_ON(!cursor->last_piece); + BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); + cursor->data = list_entry_next(cursor->data, links); + __ceph_msg_data_cursor_init(cursor); + } + return new_piece; } @@ -2990,8 +3012,6 @@ void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, BUG_ON(!pages); BUG_ON(!length); - BUG_ON(msg->data_length); - BUG_ON(!list_empty(&msg->data)); data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); BUG_ON(!data); @@ -3012,8 +3032,6 @@ void ceph_msg_data_set_pagelist(struct ceph_msg *msg, BUG_ON(!pagelist); BUG_ON(!pagelist->length); - BUG_ON(msg->data_length); - BUG_ON(!list_empty(&msg->data)); data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); BUG_ON(!data); @@ -3031,8 +3049,6 @@ void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, struct ceph_msg_data *data; BUG_ON(!bio); - BUG_ON(msg->data_length); - BUG_ON(!list_empty(&msg->data)); data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); BUG_ON(!data); -- cgit v1.2.3 From 90af36022aecdeeb1b9c0755461187de717c86dd Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 14:46:01 -0500 Subject: libceph: add, don't set data for a message Change the names of the functions that put data on a pagelist to reflect that we're adding to whatever's already there rather than just setting it to the one thing. Currently only one data item is ever added to a message, but that's about to change. 
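To make the additive semantics concrete, here is a minimal caller-side sketch (hypothetical message and buffers, not part of this series); each add call appends one data item to the message's list and bumps its data_length:

	/* Sketch: attach two data items to an already-prepared msg. */
	ceph_msg_data_add_pagelist(msg, pagelist);	/* first item on msg->data */
	ceph_msg_data_add_pages(msg, pages, len, 0);	/* appended after it */

	/* msg->data_length is now pagelist->length + len */

The cursor code added in the previous patch then walks these items in order, re-running the inner init routine as each item is consumed.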
This resolves: http://tracker.ceph.com/issues/2770 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- fs/ceph/mds_client.c | 4 ++-- include/linux/ceph/messenger.h | 6 +++--- net/ceph/messenger.c | 12 ++++++------ net/ceph/osd_client.c | 16 ++++++++-------- 4 files changed, 19 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 13ae44eaa980..4f22671a5bd4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1724,7 +1724,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_data_len) { /* outbound data set only by ceph_sync_setxattr() */ BUG_ON(!req->r_pages); - ceph_msg_data_set_pages(msg, req->r_pages, req->r_data_len, 0); + ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0); } msg->hdr.data_len = cpu_to_le32(req->r_data_len); @@ -2608,7 +2608,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, if (pagelist->length) { /* set up outbound data if we have any */ reply->hdr.data_len = cpu_to_le32(pagelist->length); - ceph_msg_data_set_pagelist(reply, pagelist); + ceph_msg_data_add_pagelist(reply, pagelist); } ceph_con_send(&session->s_con, reply); diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index de1d2e1ecce2..7c1420bb1dce 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -275,12 +275,12 @@ extern void ceph_msg_revoke_incoming(struct ceph_msg *msg); extern void ceph_con_keepalive(struct ceph_connection *con); -extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, +extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment); -extern void ceph_msg_data_set_pagelist(struct ceph_msg *msg, +extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist); #ifdef CONFIG_BLOCK -extern void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, +extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, size_t length); #endif /* CONFIG_BLOCK */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 84703e550c26..a36d98d8073e 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3005,7 +3005,7 @@ static void ceph_msg_data_destroy(struct ceph_msg_data *data) kfree(data); } -void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, +void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment) { struct ceph_msg_data *data; @@ -3023,9 +3023,9 @@ void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, list_add_tail(&data->links, &msg->data); msg->data_length += length; } -EXPORT_SYMBOL(ceph_msg_data_set_pages); +EXPORT_SYMBOL(ceph_msg_data_add_pages); -void ceph_msg_data_set_pagelist(struct ceph_msg *msg, +void ceph_msg_data_add_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist) { struct ceph_msg_data *data; @@ -3040,10 +3040,10 @@ void ceph_msg_data_set_pagelist(struct ceph_msg *msg, list_add_tail(&data->links, &msg->data); msg->data_length += pagelist->length; } -EXPORT_SYMBOL(ceph_msg_data_set_pagelist); +EXPORT_SYMBOL(ceph_msg_data_add_pagelist); #ifdef CONFIG_BLOCK -void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, +void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, size_t length) { struct ceph_msg_data *data; @@ -3058,7 +3058,7 @@ void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, list_add_tail(&data->links, &msg->data); 
msg->data_length += length; } -EXPORT_SYMBOL(ceph_msg_data_set_bio); +EXPORT_SYMBOL(ceph_msg_data_add_bio); #endif /* CONFIG_BLOCK */ /* diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2562e4e52245..73227853d845 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -550,7 +550,7 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_watch_init); -static void ceph_osdc_msg_data_set(struct ceph_msg *msg, +static void ceph_osdc_msg_data_add(struct ceph_msg *msg, struct ceph_osd_data *osd_data) { u64 length = ceph_osd_data_length(osd_data); @@ -558,14 +558,14 @@ static void ceph_osdc_msg_data_set(struct ceph_msg *msg, if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { BUG_ON(length > (u64) SIZE_MAX); if (length) - ceph_msg_data_set_pages(msg, osd_data->pages, + ceph_msg_data_add_pages(msg, osd_data->pages, length, osd_data->alignment); } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { BUG_ON(!length); - ceph_msg_data_set_pagelist(msg, osd_data->pagelist); + ceph_msg_data_add_pagelist(msg, osd_data->pagelist); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { - ceph_msg_data_set_bio(msg, osd_data->bio, length); + ceph_msg_data_add_bio(msg, osd_data->bio, length); #endif } else { BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); @@ -600,18 +600,18 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); if (src->op == CEPH_OSD_OP_WRITE) - ceph_osdc_msg_data_set(req->r_request, + ceph_osdc_msg_data_add(req->r_request, &src->extent.osd_data); else - ceph_osdc_msg_data_set(req->r_reply, + ceph_osdc_msg_data_add(req->r_reply, &src->extent.osd_data); break; case CEPH_OSD_OP_CALL: dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; dst->cls.indata_len = cpu_to_le32(src->cls.request_data_len); - ceph_osdc_msg_data_set(req->r_reply, &src->cls.response_data); - ceph_osdc_msg_data_set(req->r_request, &src->cls.request_info); + ceph_osdc_msg_data_add(req->r_reply, &src->cls.response_data); + ceph_osdc_msg_data_add(req->r_request, &src->cls.request_info); BUG_ON(src->cls.request_info.type != CEPH_OSD_DATA_TYPE_PAGELIST); request_data_len = src->cls.request_info.pagelist->length; -- cgit v1.2.3 From 04017e29bbcf0673d8a6af616c56e395d05f5971 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 14:46:02 -0500 Subject: libceph: make method call data be a separate data item Right now the data for a method call is specified via a pointer and length, and it's copied--along with the class and method name--into a pagelist data item to be sent to the osd. Instead, encode the data in a data item separate from the class and method names. This will allow large amounts of data to be supplied to methods without copying. Only rbd uses the class functionality right now, and when it really needs this it will probably need to use a page array rather than a page list. But this simple implementation demonstrates the functionality on the osd client, and that's enough for now. 
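As a rough sketch of the resulting call pattern (mirroring the rbd change below; allocation and error handling of the pagelist are elided), the outbound method data now rides in its own data item instead of being copied in after the class and method names:

	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL,
				class_name, method_name);

	/* outbound method data becomes a separate data item */
	ceph_pagelist_init(pagelist);
	ceph_pagelist_append(pagelist, outbound, outbound_size);
	osd_req_op_cls_request_data_pagelist(osd_req, 0, pagelist);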
This resolves: http://tracker.ceph.com/issues/4104 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 15 ++++++++-- include/linux/ceph/osd_client.h | 10 +++---- net/ceph/osd_client.c | 62 +++++++++++++++++++++++++++++------------ 3 files changed, 62 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6f7a52cf75c7..11b7987cb75f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1847,8 +1847,19 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, goto out; osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, - class_name, method_name, - outbound, outbound_size); + class_name, method_name); + if (outbound_size) { + struct ceph_pagelist *pagelist; + + pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); + if (!pagelist) + goto out; + + ceph_pagelist_init(pagelist); + ceph_pagelist_append(pagelist, outbound, outbound_size); + osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, + pagelist); + } osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, obj_request->pages, inbound_size, 0, false, false); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4ec46c0ceaf7..2a68a7465c18 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -92,10 +92,9 @@ struct ceph_osd_req_op { struct { const char *class_name; const char *method_name; - const void *request_data; struct ceph_osd_data request_info; + struct ceph_osd_data request_data; struct ceph_osd_data response_data; - u32 request_data_len; __u8 class_len; __u8 method_len; __u8 argc; @@ -259,6 +258,9 @@ extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, struct bio *bio, size_t bio_length); #endif /* CONFIG_BLOCK */ +extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, + unsigned int which, + struct ceph_pagelist *pagelist); extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, unsigned int which, struct page **pages, u64 length, @@ -267,9 +269,7 @@ extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, - const char *class, const char *method, - const void *request_data, - size_t request_data_size); + const char *class, const char *method); extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 cookie, u64 version, int flag); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 73227853d845..939be67199ca 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -135,6 +135,16 @@ osd_req_op_cls_request_info(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_cls_request_info); /* ??? */ +struct ceph_osd_data * +osd_req_op_cls_request_data(struct ceph_osd_request *osd_req, + unsigned int which) +{ + BUG_ON(which >= osd_req->r_num_ops); + + return &osd_req->r_ops[which].cls.request_data; +} +EXPORT_SYMBOL(osd_req_op_cls_request_data); /* ??? 
*/ + struct ceph_osd_data * osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, unsigned int which) @@ -192,6 +202,17 @@ static void osd_req_op_cls_request_info_pagelist( ceph_osd_data_pagelist_init(osd_data, pagelist); } +void osd_req_op_cls_request_data_pagelist( + struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_cls_request_data(osd_req, which); + ceph_osd_data_pagelist_init(osd_data, pagelist); +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); + void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) @@ -251,6 +272,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, break; case CEPH_OSD_OP_CALL: ceph_osd_data_release(&op->cls.request_info); + ceph_osd_data_release(&op->cls.request_data); ceph_osd_data_release(&op->cls.response_data); break; default: @@ -492,8 +514,7 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, EXPORT_SYMBOL(osd_req_op_extent_update); void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, - u16 opcode, const char *class, const char *method, - const void *request_data, size_t request_data_size) + u16 opcode, const char *class, const char *method) { struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode); struct ceph_pagelist *pagelist; @@ -520,12 +541,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, ceph_pagelist_append(pagelist, method, size); payload_len += size; - op->cls.request_data = request_data; - BUG_ON(request_data_size > (size_t) U32_MAX); - op->cls.request_data_len = (u32) request_data_size; - ceph_pagelist_append(pagelist, request_data, request_data_size); - payload_len += request_data_size; - osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); op->cls.argc = 0; /* currently unused */ @@ -576,7 +591,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, struct ceph_osd_op *dst, unsigned int which) { struct ceph_osd_req_op *src; + struct ceph_osd_data *osd_data; u64 request_data_len = 0; + u64 data_length; BUG_ON(which >= req->r_num_ops); src = &req->r_ops[which]; @@ -599,22 +616,31 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); + osd_data = &src->extent.osd_data; if (src->op == CEPH_OSD_OP_WRITE) - ceph_osdc_msg_data_add(req->r_request, - &src->extent.osd_data); + ceph_osdc_msg_data_add(req->r_request, osd_data); else - ceph_osdc_msg_data_add(req->r_reply, - &src->extent.osd_data); + ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_CALL: dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; - dst->cls.indata_len = cpu_to_le32(src->cls.request_data_len); - ceph_osdc_msg_data_add(req->r_reply, &src->cls.response_data); - ceph_osdc_msg_data_add(req->r_request, &src->cls.request_info); - BUG_ON(src->cls.request_info.type != - CEPH_OSD_DATA_TYPE_PAGELIST); - request_data_len = src->cls.request_info.pagelist->length; + osd_data = &src->cls.request_info; + ceph_osdc_msg_data_add(req->r_request, osd_data); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST); + request_data_len = osd_data->pagelist->length; + + osd_data = &src->cls.request_data; + data_length = ceph_osd_data_length(osd_data); + if (data_length) { + 
BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); + dst->cls.indata_len = cpu_to_le32(data_length); + ceph_osdc_msg_data_add(req->r_request, osd_data); + src->payload_len += data_length; + request_data_len += data_length; + } + osd_data = &src->cls.response_data; + ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_STARTSYNC: break; -- cgit v1.2.3 From 26be88087ae8a04a5b576aa2f490597b649fc132 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 15 Apr 2013 11:20:42 -0500 Subject: libceph: change how "safe" callback is used An osd request currently has two callbacks. They inform the initiator of the request when we've received confirmation from the target osd that a request was received, and when the osd indicates all changes described by the request are durable. The only time the second callback is used is in the ceph file system for a synchronous write. There's a race that makes some handling of this case unsafe. This patch addresses this problem. The error handling for this callback is also kind of gross, and this patch changes that as well. In ceph_sync_write(), if a safe callback is requested we want to add the request to the ceph inode's unsafe items list. Because items on this list must have their tid set (by ceph_osd_start_request()), the request is added *after* the call to that function returns. The problem with this is that there's a race between starting the request and adding it to the unsafe items list; the request may already be complete before ceph_sync_write() even begins to put it on the list. To address this, we change the way the "safe" callback is used. Rather than just calling it when the request is "safe", we use it to notify the initiator of the bounds (start and end) of the period during which the request is *unsafe*. So the initiator gets notified just before the request gets sent to the osd (when it is "unsafe"), and again when it's known the results are durable (it's no longer unsafe). The first call will get made in __send_request(), just before the request message gets sent to the messenger for the first time. That function is only called by __send_queued(), which is always called with the osd client's request mutex held. We then have this callback function insert the request on the ceph inode's unsafe list when we're told the request is unsafe. This will avoid the race because this call will be made under protection of the osd client's request mutex. It also nicely groups the setup and cleanup of the state associated with managing unsafe requests. The name of the "safe" callback field is changed to "unsafe" to better reflect its new purpose. It has a Boolean "unsafe" parameter to indicate whether the request is becoming unsafe or is now safe. Because the "msg" parameter wasn't used, we drop that. This resolves the original problem reported in: http://tracker.ceph.com/issues/4706 Reported-by: Yan, Zheng Signed-off-by: Alex Elder Reviewed-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/file.c | 52 ++++++++++++++++++++++------------------- include/linux/ceph/osd_client.h | 4 +++- net/ceph/osd_client.c | 12 +++++++--- 3 files changed, 40 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ae23e31a8f38..a65acf355384 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -446,19 +446,35 @@ done: } /* - * Write commit callback, called if we requested both an ACK and - * ONDISK commit reply from the OSD. 
+ * Write commit request unsafe callback, called to tell us when a + * request is unsafe (that is, in flight--has been handed to the + * messenger to send to its target osd). It is called again when + * we've received a response message indicating the request is + * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request + * is completed early (and unsuccessfully) due to a timeout or + * interrupt. + * + * This is used if we requested both an ACK and ONDISK commit reply + * from the OSD. */ -static void sync_write_commit(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_inode); - dout("sync_write_commit %p tid %llu\n", req, req->r_tid); - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid, + unsafe ? "un" : ""); + if (unsafe) { + ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_item, + &ci->i_unsafe_writes); + spin_unlock(&ci->i_unsafe_lock); + } else { + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + } } /* @@ -570,7 +586,8 @@ more: if ((file->f_flags & O_SYNC) == 0) { /* get a second commit callback */ - req->r_safe_callback = sync_write_commit; + req->r_unsafe_callback = ceph_sync_write_unsafe; + req->r_inode = inode; own_pages = true; } } @@ -581,21 +598,8 @@ more: ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) { - if (req->r_safe_callback) { - /* - * Add to inode unsafe list only after we - * start_request so that a tid has been assigned. - */ - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_item, - &ci->i_unsafe_writes); - spin_unlock(&ci->i_unsafe_lock); - ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); - } - + if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); - } if (file->f_flags & O_DIRECT) ceph_put_page_vector(pages, num_pages, false); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 2a68a7465c18..0d3358ef5285 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -29,6 +29,7 @@ struct ceph_authorizer; */ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, struct ceph_msg *); +typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); /* a given osd we're communicating with */ struct ceph_osd { @@ -149,7 +150,8 @@ struct ceph_osd_request { struct kref r_kref; bool r_mempool; struct completion r_completion, r_safe_completion; - ceph_osdc_callback_t r_callback, r_safe_callback; + ceph_osdc_callback_t r_callback; + ceph_osdc_unsafe_callback_t r_unsafe_callback; struct ceph_eversion r_reassert_version; struct list_head r_unsafe_item; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 939be67199ca..0c5bf2fb5075 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1314,8 +1314,14 @@ static void __send_request(struct ceph_osd_client *osdc, list_move_tail(&req->r_req_lru_item, &osdc->req_lru); ceph_msg_get(req->r_request); /* send consumes a ref */ - ceph_con_send(&req->r_osd->o_con, req->r_request); + + /* Mark the request unsafe if this is the first time it's being sent. 
*/ + + if (!req->r_sent && req->r_unsafe_callback) + req->r_unsafe_callback(req, true); req->r_sent = req->r_osd->o_incarnation; + + ceph_con_send(&req->r_osd->o_con, req->r_request); } /* @@ -1403,8 +1409,8 @@ static void handle_osds_timeout(struct work_struct *work) static void complete_request(struct ceph_osd_request *req) { - if (req->r_safe_callback) - req->r_safe_callback(req, NULL); + if (req->r_unsafe_callback) + req->r_unsafe_callback(req, false); complete_all(&req->r_safe_completion); /* fsync waiter */ } -- cgit v1.2.3 From 406e2c9f9286fc93ae2191a7abf477dea05aadc9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 15 Apr 2013 14:50:36 -0500 Subject: libceph: kill off osd data write_request parameters In the incremental move toward supporting distinct data items in an osd request some of the functions had "write_request" parameters to indicate, basically, whether the data belonged to in_data or the out_data. Now that we maintain the data fields in the op structure there is no need to indicate the direction, so get rid of the "write_request" parameters. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 ++-- fs/ceph/addr.c | 9 ++++----- fs/ceph/file.c | 4 ++-- include/linux/ceph/osd_client.h | 8 ++++---- net/ceph/osd_client.c | 25 +++++++++++-------------- 5 files changed, 23 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 13a381b2a779..8e8b876e83c3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1779,7 +1779,7 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 0, 0); - osd_req_op_extent_osd_data_bio(osd_req, 0, write_request, + osd_req_op_extent_osd_data_bio(osd_req, 0, obj_request->bio_list, obj_request->length); rbd_osd_req_format(obj_request, write_request); @@ -2281,7 +2281,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, offset, length, 0, 0); - osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, false, + osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, obj_request->pages, obj_request->length, obj_request->offset & ~PAGE_MASK, diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 2d6466b5fe82..3e68ac101040 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -245,7 +245,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ - osd_data = osd_req_op_extent_osd_data(req, 0, false); + osd_data = osd_req_op_extent_osd_data(req, 0); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); @@ -343,8 +343,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - osd_req_op_extent_osd_data_pages(req, 0, false, pages, len, 0, - false, false); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); req->r_callback = finish_read; req->r_inode = inode; @@ -571,7 +570,7 @@ static void writepages_finish(struct ceph_osd_request *req, long writeback_stat; unsigned issued = ceph_caps_issued(ci); - osd_data = osd_req_op_extent_osd_data(req, 0, true); + osd_data = osd_req_op_extent_osd_data(req, 0); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); @@ 
-916,7 +915,7 @@ get_more_pages: dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - osd_req_op_extent_osd_data_pages(req, 0, true, pages, len, 0, + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, !!pool, false); pages = NULL; /* request message now owns the pages array */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7e94dcb66d92..d70830c66833 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -585,8 +585,8 @@ more: own_pages = true; } } - osd_req_op_extent_osd_data_pages(req, 0, true, pages, len, - page_align, false, own_pages); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, own_pages); /* BUG_ON(vino.snap != CEPH_NOSNAP); */ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 0d3358ef5285..0e406934a551 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -241,22 +241,22 @@ extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, extern struct ceph_osd_data *osd_req_op_extent_osd_data( struct ceph_osd_request *osd_req, - unsigned int which, bool write_request); + unsigned int which); extern struct ceph_osd_data *osd_req_op_cls_response_data( struct ceph_osd_request *osd_req, unsigned int which); extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, - unsigned int which, bool write_request, + unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages); extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *, - unsigned int which, bool write_request, + unsigned int which, struct ceph_pagelist *pagelist); #ifdef CONFIG_BLOCK extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, - unsigned int which, bool write_request, + unsigned int which, struct bio *bio, size_t bio_length); #endif /* CONFIG_BLOCK */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0c5bf2fb5075..409c443c8d1f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -117,7 +117,7 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, struct ceph_osd_data * osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, - unsigned int which, bool write_request) + unsigned int which) { BUG_ON(which >= osd_req->r_num_ops); @@ -156,37 +156,34 @@ osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? 
*/ void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, - unsigned int which, bool write_request, - struct page **pages, u64 length, u32 alignment, + unsigned int which, struct page **pages, + u64 length, u32 alignment, bool pages_from_pool, bool own_pages) { struct ceph_osd_data *osd_data; - osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); + osd_data = osd_req_op_extent_osd_data(osd_req, which); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, - unsigned int which, bool write_request, - struct ceph_pagelist *pagelist) + unsigned int which, struct ceph_pagelist *pagelist) { struct ceph_osd_data *osd_data; - osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); + osd_data = osd_req_op_extent_osd_data(osd_req, which); ceph_osd_data_pagelist_init(osd_data, pagelist); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); #ifdef CONFIG_BLOCK void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, - unsigned int which, bool write_request, - struct bio *bio, size_t bio_length) + unsigned int which, struct bio *bio, size_t bio_length) { struct ceph_osd_data *osd_data; - - osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); + osd_data = osd_req_op_extent_osd_data(osd_req, which); ceph_osd_data_bio_init(osd_data, bio, bio_length); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); @@ -2284,7 +2281,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, /* it may be a short read due to an object boundary */ - osd_req_op_extent_osd_data_pages(req, 0, false, + osd_req_op_extent_osd_data_pages(req, 0, pages, *plen, page_align, false, false); dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", @@ -2327,7 +2324,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, return PTR_ERR(req); /* it may be a short write due to an object boundary */ - osd_req_op_extent_osd_data_pages(req, 0, true, pages, len, page_align, + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, false, false); dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); @@ -2428,7 +2425,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, * XXX page data. Probably OK for reads, but this * XXX ought to be done more generally. */ - osd_data = osd_req_op_extent_osd_data(req, 0, false); + osd_data = osd_req_op_extent_osd_data(req, 0); if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { if (osd_data->pages && unlikely(osd_data->length < data_len)) { -- cgit v1.2.3 From 863c7eb590c154c7c2cfac40914f5bedcad1a166 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 15 Apr 2013 14:50:36 -0500 Subject: libceph: clean up osd data field access functions There are a bunch of functions defined to encapsulate getting the address of a data field for a particular op in an osd request. They're all defined the same way, so create a macro to take the place of all of them. Two of these are used outside the osd client code, so preserve them (but convert them to use the new macro internally). Stop exporting the ones that aren't used elsewhere. 
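The replacement relies on a GCC statement expression, so every accessor keeps its bounds check while the near-identical function bodies collapse into a single definition (this is the macro as added in the diff below):

	#define osd_req_op_data(oreq, whch, typ, fld)		\
		({						\
			BUG_ON(whch >= (oreq)->r_num_ops);	\
			&(oreq)->r_ops[whch].typ.fld;		\
		})

	/* so, for example, this call ... */
	osd_data = osd_req_op_data(osd_req, which, cls, response_data);
	/* ... yields a checked &osd_req->r_ops[which].cls.response_data */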
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 48 +++++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 409c443c8d1f..3c0715977de3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -115,43 +115,25 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, } #endif /* CONFIG_BLOCK */ +#define osd_req_op_data(oreq, whch, typ, fld) \ + ({ \ + BUG_ON(whch >= (oreq)->r_num_ops); \ + &(oreq)->r_ops[whch].typ.fld; \ + }) + struct ceph_osd_data * osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, unsigned int which) { - BUG_ON(which >= osd_req->r_num_ops); - - return &osd_req->r_ops[which].extent.osd_data; + return osd_req_op_data(osd_req, which, extent, osd_data); } EXPORT_SYMBOL(osd_req_op_extent_osd_data); -struct ceph_osd_data * -osd_req_op_cls_request_info(struct ceph_osd_request *osd_req, - unsigned int which) -{ - BUG_ON(which >= osd_req->r_num_ops); - - return &osd_req->r_ops[which].cls.request_info; -} -EXPORT_SYMBOL(osd_req_op_cls_request_info); /* ??? */ - -struct ceph_osd_data * -osd_req_op_cls_request_data(struct ceph_osd_request *osd_req, - unsigned int which) -{ - BUG_ON(which >= osd_req->r_num_ops); - - return &osd_req->r_ops[which].cls.request_data; -} -EXPORT_SYMBOL(osd_req_op_cls_request_data); /* ??? */ - struct ceph_osd_data * osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, unsigned int which) { - BUG_ON(which >= osd_req->r_num_ops); - - return &osd_req->r_ops[which].cls.response_data; + return osd_req_op_data(osd_req, which, cls, response_data); } EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */ @@ -162,7 +144,7 @@ void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, { struct ceph_osd_data *osd_data; - osd_data = osd_req_op_extent_osd_data(osd_req, which); + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); } @@ -173,7 +155,7 @@ void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, { struct ceph_osd_data *osd_data; - osd_data = osd_req_op_extent_osd_data(osd_req, which); + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); ceph_osd_data_pagelist_init(osd_data, pagelist); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); @@ -183,7 +165,8 @@ void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, unsigned int which, struct bio *bio, size_t bio_length) { struct ceph_osd_data *osd_data; - osd_data = osd_req_op_extent_osd_data(osd_req, which); + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); ceph_osd_data_bio_init(osd_data, bio, bio_length); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); @@ -195,7 +178,7 @@ static void osd_req_op_cls_request_info_pagelist( { struct ceph_osd_data *osd_data; - osd_data = osd_req_op_cls_request_info(osd_req, which); + osd_data = osd_req_op_data(osd_req, which, cls, request_info); ceph_osd_data_pagelist_init(osd_data, pagelist); } @@ -205,7 +188,7 @@ void osd_req_op_cls_request_data_pagelist( { struct ceph_osd_data *osd_data; - osd_data = osd_req_op_cls_request_data(osd_req, which); + osd_data = osd_req_op_data(osd_req, which, cls, request_data); ceph_osd_data_pagelist_init(osd_data, pagelist); } EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); @@ -216,7 +199,7 @@ void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, { struct 
ceph_osd_data *osd_data; - osd_data = osd_req_op_cls_response_data(osd_req, which); + osd_data = osd_req_op_data(osd_req, which, cls, response_data); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); } @@ -241,7 +224,6 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) } } - static void ceph_osd_data_release(struct ceph_osd_data *osd_data) { if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) { -- cgit v1.2.3 From 49719778bfa5371ec9b5a7d989bb29000e3ac5df Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Feb 2013 12:33:24 -0600 Subject: libceph: support raw data requests Allow osd request ops that aren't otherwise structured (not class, extent, or watch ops) to specify "raw" data to be used to hold incoming data for the op. Make use of this capability for the osd STAT op. Prefix the name of the private function osd_req_op_init() with "_", and expose a new function by that (earlier) name whose purpose is to initialize osd ops with (only) implied data. For now we'll just support the use of a page array for an osd op with incoming raw data. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 10 ++++++++++ net/ceph/osd_client.c | 38 ++++++++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 0e406934a551..4d84a2b44f18 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -84,6 +84,7 @@ struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ u32 payload_len; union { + struct ceph_osd_data raw_data_in; struct { u64 offset, length; u64 truncate_size; @@ -232,6 +233,15 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +extern void osd_req_op_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode); + +extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *, + unsigned int which, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); + extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 offset, u64 length, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3c0715977de3..c842e877d504 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -121,6 +121,14 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, &(oreq)->r_ops[whch].typ.fld; \ }) +static struct ceph_osd_data * +osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which) +{ + BUG_ON(which >= osd_req->r_num_ops); + + return &osd_req->r_ops[which].raw_data_in; +} + struct ceph_osd_data * osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, unsigned int which) @@ -137,6 +145,19 @@ osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? 
*/ +void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, + u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_raw_data_in(osd_req, which); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_raw_data_in_pages); + void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, @@ -437,7 +458,7 @@ static bool osd_req_opcode_valid(u16 opcode) * common init routine for all the other init functions, below. */ static struct ceph_osd_req_op * -osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, +_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode) { struct ceph_osd_req_op *op; @@ -452,12 +473,19 @@ osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, return op; } +void osd_req_op_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode) +{ + (void)_osd_req_op_init(osd_req, which, opcode); +} +EXPORT_SYMBOL(osd_req_op_init); + void osd_req_op_extent_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq) { - struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); size_t payload_len = 0; BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); @@ -495,7 +523,7 @@ EXPORT_SYMBOL(osd_req_op_extent_update); void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method) { - struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); struct ceph_pagelist *pagelist; size_t payload_len = 0; size_t size; @@ -532,7 +560,7 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 cookie, u64 version, int flag) { - struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); @@ -584,6 +612,8 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, switch (src->op) { case CEPH_OSD_OP_STAT: + osd_data = &src->raw_data_in; + ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: -- cgit v1.2.3 From a51b272e9e99f912e8e07d4c9f58c1d433afea7c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 19 Apr 2013 15:34:49 -0500 Subject: libceph: fix two messenger bugs This patch makes four small changes in the ceph messenger. While getting copyup functionality working I found two bugs in the messenger. Existing paths through the code did not trigger these problems, but they're fixed here: - In ceph_msg_data_pagelist_cursor_init(), the cursor's last_piece field was being checked against the length supplied. This was OK until this commit: ccba6d98 libceph: implement multiple data items in a message That commit changed the cursor init routines to allow lengths to be supplied that exceeded the size of the current data item. Because of this, we have to use the assigned cursor resid field rather than the provided length in determining whether the cursor points to the last piece of a data item. 
- In ceph_msg_data_add_pages(), a BUG_ON() was erroneously catching attempts to add page data to a message if the message already had data assigned to it. That was OK until that same commit, at which point it was fine for messages to have multiple data items. It slipped through because that BUG_ON() call was present twice in that function. (You can never be too careful.) In addition two other minor things are changed: - In ceph_msg_data_cursor_init(), the local variable "data" was getting assigned twice. - In ceph_msg_data_advance(), it was assumed that the type-specific advance routine would set new_piece to true after it advanced past the last piece. That may have been fine, but since we check for that case we might as well set it explicitly in ceph_msg_data_advance(). This resolves: http://tracker.ceph.com/issues/4762 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index a36d98d8073e..91dd45113c7b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -913,7 +913,7 @@ ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, cursor->resid = min(length, pagelist->length); cursor->page = page; cursor->offset = 0; - cursor->last_piece = length <= PAGE_SIZE; + cursor->last_piece = cursor->resid <= PAGE_SIZE; } static struct page * @@ -1013,8 +1013,6 @@ static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) BUG_ON(length > msg->data_length); BUG_ON(list_empty(&msg->data)); - data = list_first_entry(&msg->data, struct ceph_msg_data, links); - cursor->data_head = &msg->data; cursor->total_resid = length; data = list_first_entry(&msg->data, struct ceph_msg_data, links); @@ -1088,14 +1086,15 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, break; } cursor->total_resid -= bytes; - cursor->need_crc = new_piece; if (!cursor->resid && cursor->total_resid) { WARN_ON(!cursor->last_piece); BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); cursor->data = list_entry_next(cursor->data, links); __ceph_msg_data_cursor_init(cursor); + new_piece = true; } + cursor->need_crc = new_piece; return new_piece; } @@ -3019,7 +3018,6 @@ void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, data->length = length; data->alignment = alignment & ~PAGE_MASK; - BUG_ON(!list_empty(&msg->data)); list_add_tail(&data->links, &msg->data); msg->data_length += length; } -- cgit v1.2.3 From 6c57b5545d46e276381a15a59283c984cf3f94e3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 19 Apr 2013 15:34:49 -0500 Subject: libceph: support pages for class request data Add the ability to provide an array of pages as outbound request data for object class method calls. 
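A hypothetical caller of the new helper (the page array, length, and op index here are illustrative only):

	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL,
				class_name, method_name);
	osd_req_op_cls_request_data_pages(osd_req, 0, pages, outbound_size,
				0,	/* alignment */
				false,	/* pages_from_pool */
				false);	/* own_pages */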
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 5 +++++ net/ceph/osd_client.c | 12 ++++++++++++ 2 files changed, 17 insertions(+) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4d84a2b44f18..4191cd2c55e5 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -273,6 +273,11 @@ extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, unsigned int which, struct ceph_pagelist *pagelist); +extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *, + unsigned int which, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, unsigned int which, struct page **pages, u64 length, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index c842e877d504..467020c560b2 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -214,6 +214,18 @@ void osd_req_op_cls_request_data_pagelist( } EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); +void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, request_data); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); + void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) -- cgit v1.2.3 From 9ef1ee5a1b6ccb3220fb822523716e56c3629dbe Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 21 Apr 2013 16:51:50 -0500 Subject: libceph: fix byte order mismatch A WATCH op includes an object version. The version that's supplied is incorrectly byte-swapped in osd_req_op_watch_init() where it's first assigned (it's been this way since that code was first added). The result is that the version sent to the osd is wrong, because that value gets byte-swapped again in osd_req_encode_op(). This is the source of a sparse warning related to improper byte order in the assignment. The approach of using the version to avoid a race is deprecated (see http://tracker.ceph.com/issues/3871), and the watch parameter is no longer even examined by the osd. So fix the assignment in osd_req_op_watch_init() so it no longer does the byte swap.
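In isolation, the bug pattern looks like this (a hedged illustration, not code from the patch; the value and variable names are made up). A field bound for the little-endian wire format must be byte-swapped exactly once, at encode time:

	static void example_encode(void)
	{
		u64 version = 5;	/* host byte order */
		u64 staged;
		__le64 wire;

		staged = cpu_to_le64(version);	/* the old bug: swapped at assignment */
		wire = cpu_to_le64(staged);	/* ...then swapped again at encode time */

		/* On little-endian hosts both swaps are no-ops, hiding the
		 * bug; on big-endian hosts the two swaps cancel and native-
		 * order bytes go out on the wire. Sparse warns on the first
		 * assignment because a __le64 value lands in a plain u64.
		 * The fix stages the value in host order and swaps once: */
		staged = version;
		wire = cpu_to_le64(staged);
		(void)wire;
	}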
This resolves: http://tracker.ceph.com/issues/3847 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 467020c560b2..57d8db5b055a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -577,8 +577,7 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req, BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); op->watch.cookie = cookie; - /* op->watch.ver = version; */ /* XXX 3847 */ - op->watch.ver = cpu_to_le64(version); + op->watch.ver = version; if (opcode == CEPH_OSD_OP_WATCH && flag) op->watch.flag = (u8)1; } -- cgit v1.2.3 From 4f0dcb10cf1454a1c38aeaa04cb2757535e4905e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:32 -0500 Subject: libceph: create source file "net/ceph/snapshot.c" This creates a new source file "net/ceph/snapshot.c" to contain utility routines related to ceph snapshot contexts. The main motivation was to define ceph_create_snap_context() as a common way to create these structures, but I've moved the definitions of ceph_get_snap_context() and ceph_put_snap_context() there too. (The benefit of inlining those is very small, and I'd rather keep this collection of functions together.) Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/libceph.h | 30 +++-------------- net/ceph/Makefile | 2 +- net/ceph/snapshot.c | 78 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 26 deletions(-) create mode 100644 net/ceph/snapshot.c (limited to 'net') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 5493d7b86423..2e3024881a5e 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -157,31 +157,11 @@ struct ceph_snap_context { u64 snaps[]; }; -static inline struct ceph_snap_context * -ceph_get_snap_context(struct ceph_snap_context *sc) -{ - /* - printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)+1); - */ - if (sc) - atomic_inc(&sc->nref); - return sc; -} - -static inline void ceph_put_snap_context(struct ceph_snap_context *sc) -{ - if (!sc) - return; - /* - printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)-1); - */ - if (atomic_dec_and_test(&sc->nref)) { - /*printk(" deleting snap_context %p\n", sc);*/ - kfree(sc); - } -} +extern struct ceph_snap_context *ceph_create_snap_context(u32 snap_count, + gfp_t gfp_flags); +extern struct ceph_snap_context *ceph_get_snap_context( + struct ceph_snap_context *sc); +extern void ceph_put_snap_context(struct ceph_snap_context *sc); /* * calculate the number of pages a given length and offset map onto, diff --git a/net/ceph/Makefile b/net/ceph/Makefile index e87ef435e11b..958d9856912c 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_fs.o ceph_strings.o ceph_hash.o \ - pagevec.o + pagevec.o snapshot.o diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c new file mode 100644 index 000000000000..154683f5f14c --- /dev/null +++ b/net/ceph/snapshot.c @@ -0,0 +1,78 @@ +/* + * snapshot.c Ceph snapshot context utility routines (part of libceph) + * + * Copyright (C) 2013 Inktank Storage, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <stdbool.h> + +#include <linux/types.h> +#include <linux/export.h> +#include <linux/ceph/libceph.h> + +/* + * Ceph snapshot contexts are reference counted objects, and the + * returned structure holds a single reference. Acquire additional + * references with ceph_get_snap_context(), and release them with + * ceph_put_snap_context(). When the reference count reaches zero + * the entire structure is freed. + */ + +/* + * Create a new ceph snapshot context large enough to hold the + * indicated number of snapshot ids (which can be 0). Caller has + * to fill in snapc->seq and snapc->snaps[0..snap_count-1]. + * + * Returns a null pointer if an error occurs. + */ +struct ceph_snap_context *ceph_create_snap_context(u32 snap_count, + gfp_t gfp_flags) +{ + struct ceph_snap_context *snapc; + size_t size; + + size = sizeof (struct ceph_snap_context); + size += snap_count * sizeof (snapc->snaps[0]); + snapc = kzalloc(size, gfp_flags); + if (!snapc) + return NULL; + + atomic_set(&snapc->nref, 1); + snapc->num_snaps = snap_count; + + return snapc; +} +EXPORT_SYMBOL(ceph_create_snap_context); + +struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc) +{ + if (sc) + atomic_inc(&sc->nref); + return sc; +} +EXPORT_SYMBOL(ceph_get_snap_context); + +void ceph_put_snap_context(struct ceph_snap_context *sc) +{ + if (!sc) + return; + if (atomic_dec_and_test(&sc->nref)) { + /*printk(" deleting snap_context %p\n", sc);*/ + kfree(sc); + } +} +EXPORT_SYMBOL(ceph_put_snap_context); -- cgit v1.2.3
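The new file's comment block spells out the reference-counting contract; here is a hedged usage sketch (the snap count, seq, and snapshot ids are invented for illustration). Snapshot ids are stored in decreasing order, newest first:

	static int example_snap_context(void)
	{
		struct ceph_snap_context *snapc;

		snapc = ceph_create_snap_context(2, GFP_KERNEL);
		if (!snapc)
			return -ENOMEM;
		snapc->seq = 103;	/* caller fills in seq... */
		snapc->snaps[0] = 103;	/* ...and the ids, newest first */
		snapc->snaps[1] = 97;

		ceph_get_snap_context(snapc);	/* take a second reference */
		ceph_put_snap_context(snapc);	/* drop it again */
		ceph_put_snap_context(snapc);	/* last put frees the structure */
		return 0;
	}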
From e3d5d6380482b4a5e2e9d0d662f2ef6d56504aef Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 1 May 2013 12:43:04 -0500 Subject: libceph: allocate ceph messages with a slab allocator Create a slab cache to manage ceph_msg structure allocation. This is part of: http://tracker.ceph.com/issues/3926 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 29 +++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 91dd45113c7b..bc1ba4c2605d 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -152,6 +152,10 @@ static bool con_flag_test_and_set(struct ceph_connection *con, return test_and_set_bit(con_flag, &con->flags); } +/* Slab caches for frequently-allocated structures */ + +static struct kmem_cache *ceph_msg_cache; + /* static tag bytes (protocol control messages) */ static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; @@ -226,6 +230,22 @@ static void encode_my_addr(struct ceph_messenger *msgr) */ static struct workqueue_struct *ceph_msgr_wq; +static int ceph_msgr_slab_init(void) +{ + BUG_ON(ceph_msg_cache); + ceph_msg_cache = kmem_cache_create("ceph_msg", + sizeof (struct ceph_msg), + __alignof__(struct ceph_msg), 0, NULL); + return ceph_msg_cache ? 0 : -ENOMEM; +} + +static void ceph_msgr_slab_exit(void) +{ + BUG_ON(!ceph_msg_cache); + kmem_cache_destroy(ceph_msg_cache); + ceph_msg_cache = NULL; +} + static void _ceph_msgr_exit(void) { if (ceph_msgr_wq) { @@ -233,6 +253,8 @@ static void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } + ceph_msgr_slab_exit(); + BUG_ON(zero_page == NULL); kunmap(zero_page); page_cache_release(zero_page); @@ -245,6 +267,9 @@ int ceph_msgr_init(void) zero_page = ZERO_PAGE(0); page_cache_get(zero_page); + if (ceph_msgr_slab_init()) + return -ENOMEM; + ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); if (ceph_msgr_wq) return 0; @@ -3068,7 +3093,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, { struct ceph_msg *m; - m = kzalloc(sizeof(*m), flags); + m = kmem_cache_zalloc(ceph_msg_cache, flags); if (m == NULL) goto out; @@ -3215,7 +3240,7 @@ void ceph_msg_kfree(struct ceph_msg *m) vfree(m->front.iov_base); else kfree(m->front.iov_base); - kfree(m); + kmem_cache_free(ceph_msg_cache, m); } /* -- cgit v1.2.3 From 81b36be4c56299ac4c4c786908cb117ad232b62e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 1 May 2013 12:43:04 -0500 Subject: libceph: allocate ceph message data with a slab allocator Create a slab cache to manage ceph_msg_data structure allocation. This is part of: http://tracker.ceph.com/issues/3926 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/messenger.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index bc1ba4c2605d..eb0a46a49bd4 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -155,6 +155,7 @@ static bool con_flag_test_and_set(struct ceph_connection *con, /* Slab caches for frequently-allocated structures */ static struct kmem_cache *ceph_msg_cache; +static struct kmem_cache *ceph_msg_data_cache; /* static tag bytes (protocol control messages) */ static char tag_msg = CEPH_MSGR_TAG_MSG; @@ -236,11 +237,30 @@ static int ceph_msgr_slab_init(void) ceph_msg_cache = kmem_cache_create("ceph_msg", sizeof (struct ceph_msg), __alignof__(struct ceph_msg), 0, NULL); - return ceph_msg_cache ?
0 : -ENOMEM; + + if (!ceph_msg_cache) + return -ENOMEM; + + BUG_ON(ceph_msg_data_cache); + ceph_msg_data_cache = kmem_cache_create("ceph_msg_data", + sizeof (struct ceph_msg_data), + __alignof__(struct ceph_msg_data), + 0, NULL); + if (ceph_msg_data_cache) + return 0; + + kmem_cache_destroy(ceph_msg_cache); + ceph_msg_cache = NULL; + + return -ENOMEM; } static void ceph_msgr_slab_exit(void) { + BUG_ON(!ceph_msg_data_cache); + kmem_cache_destroy(ceph_msg_data_cache); + ceph_msg_data_cache = NULL; + BUG_ON(!ceph_msg_cache); kmem_cache_destroy(ceph_msg_cache); ceph_msg_cache = NULL; @@ -3008,7 +3028,7 @@ static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) if (WARN_ON(!ceph_msg_data_type_valid(type))) return NULL; - data = kzalloc(sizeof (*data), GFP_NOFS); + data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS); if (data) data->type = type; INIT_LIST_HEAD(&data->links); @@ -3026,7 +3046,7 @@ static void ceph_msg_data_destroy(struct ceph_msg_data *data) ceph_pagelist_release(data->pagelist); kfree(data->pagelist); } - kfree(data); + kmem_cache_free(ceph_msg_data_cache, data); } void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, -- cgit v1.2.3 From 5522ae0b68421e2645303ff010e27afc5292e0ab Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 1 May 2013 12:43:04 -0500 Subject: libceph: use slab cache for osd client requests Create a slab cache to manage allocation of ceph_osd_request structures. This resolves: http://tracker.ceph.com/issues/3926 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 3 +++ net/ceph/ceph_common.c | 7 +++++++ net/ceph/osd_client.c | 27 +++++++++++++++++++++++++-- 3 files changed, 35 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4191cd2c55e5..186db0bf4951 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -224,6 +224,9 @@ struct ceph_osd_client { struct workqueue_struct *notify_wq; }; +extern int ceph_osdc_setup(void); +extern void ceph_osdc_cleanup(void); + extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); extern void ceph_osdc_stop(struct ceph_osd_client *osdc); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index e65e6e4be38b..34b11ee8124e 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -606,11 +606,17 @@ static int __init init_ceph_lib(void) if (ret < 0) goto out_crypto; + ret = ceph_osdc_setup(); + if (ret < 0) + goto out_msgr; + pr_info("loaded (mon/osd proto %d/%d)\n", CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL); return 0; +out_msgr: + ceph_msgr_exit(); out_crypto: ceph_crypto_shutdown(); out_debugfs: @@ -622,6 +628,7 @@ out: static void __exit exit_ceph_lib(void) { dout("exit_ceph_lib\n"); + ceph_osdc_cleanup(); ceph_msgr_exit(); ceph_crypto_shutdown(); ceph_debugfs_cleanup(); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 57d8db5b055a..a3395fdfbd4f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -22,6 +22,8 @@ #define OSD_OP_FRONT_LEN 4096 #define OSD_OPREPLY_FRONT_LEN 512 +static struct kmem_cache *ceph_osd_request_cache; + static const struct ceph_connection_operations osd_con_ops; static void __send_queued(struct ceph_osd_client *osdc); @@ -315,7 +317,8 @@ void ceph_osdc_release_request(struct kref *kref) if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else - kfree(req); + kmem_cache_free(ceph_osd_request_cache, req); + }
EXPORT_SYMBOL(ceph_osdc_release_request); @@ -346,7 +349,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, req = mempool_alloc(osdc->req_mempool, gfp_flags); memset(req, 0, sizeof(*req)); } else { - req = kzalloc(sizeof(*req), gfp_flags); + req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags); } if (req == NULL) return NULL; @@ -2365,6 +2368,26 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, } EXPORT_SYMBOL(ceph_osdc_writepages); +int ceph_osdc_setup(void) +{ + BUG_ON(ceph_osd_request_cache); + ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", + sizeof (struct ceph_osd_request), + __alignof__(struct ceph_osd_request), + 0, NULL); + + return ceph_osd_request_cache ? 0 : -ENOMEM; +} +EXPORT_SYMBOL(ceph_osdc_setup); + +void ceph_osdc_cleanup(void) +{ + BUG_ON(!ceph_osd_request_cache); + kmem_cache_destroy(ceph_osd_request_cache); + ceph_osd_request_cache = NULL; +} +EXPORT_SYMBOL(ceph_osdc_cleanup); + /* * handle incoming message */ -- cgit v1.2.3
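The three slab-cache patches above all follow the same create/use/destroy shape. As a generic, hedged sketch (the "example" names are invented; only the kmem_cache_* calls are the real kernel API):

	struct example {
		u64 id;
	};

	static struct kmem_cache *example_cache;

	static int example_setup(void)
	{
		example_cache = kmem_cache_create("example",
						  sizeof (struct example),
						  __alignof__(struct example),
						  0, NULL);
		return example_cache ? 0 : -ENOMEM;
	}

	static void example_cleanup(void)
	{
		kmem_cache_destroy(example_cache);
		example_cache = NULL;
	}

	static struct example *example_alloc(void)
	{
		/* replaces kzalloc(sizeof (struct example), GFP_NOFS) */
		return kmem_cache_zalloc(example_cache, GFP_NOFS);
	}

	static void example_free(struct example *obj)
	{
		/* replaces kfree(obj) */
		kmem_cache_free(example_cache, obj);
	}

A dedicated cache lets the allocator pack same-sized objects densely and exposes per-type allocation counts in /proc/slabinfo, which is why the frequently-allocated message, message-data, and osd request structures each get their own cache in this series.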