From 3e699bd865527004773012da38febdf444fd5fa8 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 30 Jun 2020 03:52:15 -0400 Subject: ceph: add check_session_state() helper and make it global And remove the unsed mdsc parameter to simplify the code. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 47 +++++++++++++++++++++++++++-------------------- fs/ceph/mds_client.h | 3 +++ 2 files changed, 30 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a50497142e59..58c54d41aa40 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1785,8 +1785,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc, /* * send a session close request */ -static int request_close_session(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +static int request_close_session(struct ceph_mds_session *session) { struct ceph_msg *msg; @@ -1809,7 +1808,7 @@ static int __close_session(struct ceph_mds_client *mdsc, if (session->s_state >= CEPH_MDS_SESSION_CLOSING) return 0; session->s_state = CEPH_MDS_SESSION_CLOSING; - return request_close_session(mdsc, session); + return request_close_session(session); } static bool drop_negative_children(struct dentry *dentry) @@ -4263,6 +4262,29 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) ceph_force_reconnect(fsc->sb); } +bool check_session_state(struct ceph_mds_session *s) +{ + if (s->s_state == CEPH_MDS_SESSION_CLOSING) { + dout("resending session close request for mds%d\n", + s->s_mds); + request_close_session(s); + return false; + } + if (s->s_ttl && time_after(jiffies, s->s_ttl)) { + if (s->s_state == CEPH_MDS_SESSION_OPEN) { + s->s_state = CEPH_MDS_SESSION_HUNG; + pr_info("mds%d hung\n", s->s_mds); + } + } + if (s->s_state == CEPH_MDS_SESSION_NEW || + s->s_state == CEPH_MDS_SESSION_RESTARTING || + s->s_state == CEPH_MDS_SESSION_REJECTED) + /* this mds is failed or recovering, just wait */ + return false; + + return true; +} + /* * delayed work -- periodically trim expired leases, renew caps with mds */ @@ -4294,23 +4316,8 @@ static void delayed_work(struct work_struct *work) struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); if (!s) continue; - if (s->s_state == CEPH_MDS_SESSION_CLOSING) { - dout("resending session close request for mds%d\n", - s->s_mds); - request_close_session(mdsc, s); - ceph_put_mds_session(s); - continue; - } - if (s->s_ttl && time_after(jiffies, s->s_ttl)) { - if (s->s_state == CEPH_MDS_SESSION_OPEN) { - s->s_state = CEPH_MDS_SESSION_HUNG; - pr_info("mds%d hung\n", s->s_mds); - } - } - if (s->s_state == CEPH_MDS_SESSION_NEW || - s->s_state == CEPH_MDS_SESSION_RESTARTING || - s->s_state == CEPH_MDS_SESSION_REJECTED) { - /* this mds is failed or recovering, just wait */ + + if (!check_session_state(s)) { ceph_put_mds_session(s); continue; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 5e0c4073a6be..6147ff0a1cdf 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -18,6 +18,7 @@ #include #include "metric.h" +#include "super.h" /* The first 8 bits are reserved for old ceph releases */ enum ceph_feature_type { @@ -476,6 +477,8 @@ struct ceph_mds_client { extern const char *ceph_mds_op_name(int op); +extern bool check_session_state(struct ceph_mds_session *s); + extern struct ceph_mds_session * __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); -- cgit v1.2.3 From 4f1d756def68588b88068af1d5a4a3b6dc7e6e2a Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 30 Jun 2020 03:52:16 -0400 Subject: ceph: add global total_caps to count the mdsc's total caps number This will help to reduce using the global mdsc->mutex lock in many places. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 ++ fs/ceph/debugfs.c | 14 ++------------ fs/ceph/mds_client.c | 1 + fs/ceph/metric.c | 1 + fs/ceph/metric.h | 1 + 5 files changed, 7 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 972c13aa4225..5f4894063a73 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -668,6 +668,7 @@ void ceph_add_cap(struct inode *inode, spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; + atomic64_inc(&mdsc->metric.total_caps); spin_unlock(&session->s_cap_lock); } else { spin_lock(&session->s_cap_lock); @@ -1161,6 +1162,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) } else { list_del_init(&cap->session_caps); session->s_nr_caps--; + atomic64_dec(&mdsc->metric.total_caps); cap->session = NULL; removed = 1; } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 070ed8481340..3030f5585085 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -145,7 +145,7 @@ static int metric_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_client_metric *m = &mdsc->metric; - int i, nr_caps = 0; + int nr_caps = 0; s64 total, sum, avg, min, max, sq; seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); @@ -190,17 +190,7 @@ static int metric_show(struct seq_file *s, void *p) percpu_counter_sum(&m->d_lease_mis), percpu_counter_sum(&m->d_lease_hit)); - mutex_lock(&mdsc->mutex); - for (i = 0; i < mdsc->max_sessions; i++) { - struct ceph_mds_session *s; - - s = __ceph_lookup_mds_session(mdsc, i); - if (!s) - continue; - nr_caps += s->s_nr_caps; - ceph_put_mds_session(s); - } - mutex_unlock(&mdsc->mutex); + nr_caps = atomic64_read(&m->total_caps); seq_printf(s, "%-14s%-16d%-16lld%lld\n", "caps", nr_caps, percpu_counter_sum(&m->i_caps_mis), percpu_counter_sum(&m->i_caps_hit)); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 58c54d41aa40..f3c71230df93 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1485,6 +1485,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, cap->session = NULL; list_del_init(&cap->session_caps); session->s_nr_caps--; + atomic64_dec(&session->s_mdsc->metric.total_caps); if (cap->queue_release) __ceph_queue_cap_release(session, cap); else diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 9217f35bc2b9..269eacbd2a15 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -22,6 +22,7 @@ int ceph_metric_init(struct ceph_client_metric *m) if (ret) goto err_d_lease_mis; + atomic64_set(&m->total_caps, 0); ret = percpu_counter_init(&m->i_caps_hit, 0, GFP_KERNEL); if (ret) goto err_i_caps_hit; diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index ccd81285a450..23a3373d5a3d 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -12,6 +12,7 @@ struct ceph_client_metric { struct percpu_counter d_lease_hit; struct percpu_counter d_lease_mis; + atomic64_t total_caps; struct percpu_counter i_caps_hit; struct percpu_counter i_caps_mis; -- cgit v1.2.3 From b682c6d41bc23353b5d80e02ca4961ac67624f4c Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 30 Jun 2020 03:52:18 -0400 Subject: ceph: switch to WARN_ON_ONCE in encode_supported_features() ...and let the errnos bubble up to the callers. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f3c71230df93..d5e523cc40e6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1168,7 +1168,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED; #define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8) -static void encode_supported_features(void **p, void *end) +static int encode_supported_features(void **p, void *end) { static const size_t count = ARRAY_SIZE(feature_bits); @@ -1176,16 +1176,22 @@ static void encode_supported_features(void **p, void *end) size_t i; size_t size = FEATURE_BYTES(count); - BUG_ON(*p + 4 + size > end); + if (WARN_ON_ONCE(*p + 4 + size > end)) + return -ERANGE; + ceph_encode_32(p, size); memset(*p, 0, size); for (i = 0; i < count; i++) ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8); *p += size; } else { - BUG_ON(*p + 4 > end); + if (WARN_ON_ONCE(*p + 4 > end)) + return -ERANGE; + ceph_encode_32(p, 0); } + + return 0; } /* @@ -1203,6 +1209,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; size_t size, count; void *p, *end; + int ret; const char* metadata[][2] = { {"hostname", mdsc->nodename}, @@ -1232,7 +1239,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 GFP_NOFS, false); if (!msg) { pr_err("create_session_msg ENOMEM creating msg\n"); - return NULL; + return ERR_PTR(-ENOMEM); } p = msg->front.iov_base; end = p + msg->front.iov_len; @@ -1269,7 +1276,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 p += val_len; } - encode_supported_features(&p, end); + ret = encode_supported_features(&p, end); + if (ret) { + pr_err("encode_supported_features failed!\n"); + ceph_msg_put(msg); + return ERR_PTR(ret); + } + msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -1297,8 +1310,8 @@ static int __open_session(struct ceph_mds_client *mdsc, /* send connect message */ msg = create_session_open_msg(mdsc, session->s_seq); - if (!msg) - return -ENOMEM; + if (IS_ERR(msg)) + return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); return 0; } @@ -1312,6 +1325,7 @@ static struct ceph_mds_session * __open_export_target_session(struct ceph_mds_client *mdsc, int target) { struct ceph_mds_session *session; + int ret; session = __ceph_lookup_mds_session(mdsc, target); if (!session) { @@ -1320,8 +1334,11 @@ __open_export_target_session(struct ceph_mds_client *mdsc, int target) return session; } if (session->s_state == CEPH_MDS_SESSION_NEW || - session->s_state == CEPH_MDS_SESSION_CLOSING) - __open_session(mdsc, session); + session->s_state == CEPH_MDS_SESSION_CLOSING) { + ret = __open_session(mdsc, session); + if (ret) + return ERR_PTR(ret); + } return session; } @@ -2520,7 +2537,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ceph_encode_copy(&p, &ts, sizeof(ts)); } - BUG_ON(p > end); + if (WARN_ON_ONCE(p > end)) { + ceph_msg_put(msg); + msg = ERR_PTR(-ERANGE); + goto out_free2; + } + msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -2756,7 +2778,9 @@ static void __do_request(struct ceph_mds_client *mdsc, } if (session->s_state == CEPH_MDS_SESSION_NEW || session->s_state == CEPH_MDS_SESSION_CLOSING) { - __open_session(mdsc, session); + err = __open_session(mdsc, session); + if (err) + goto out_session; /* retry the same mds later */ if (random) req->r_resend_mds = mds; -- cgit v1.2.3 From fa9967734227b44acb1b6918033f9122dc7825b9 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 1 Jul 2020 01:52:48 -0400 Subject: ceph: fix potential mdsc use-after-free crash Make sure the delayed work stopped before releasing the resources. cancel_delayed_work_sync() will only guarantee that the work finishes executing if the work is already in the ->worklist. That means after the cancel_delayed_work_sync() returns, it will leave the work requeued if it was rearmed at the end. That can lead to a use after free once the work struct is freed. Fix it by flushing the delayed work instead of trying to cancel it, and ensure that the work doesn't rearm if the mdsc is stopping. URL: https://tracker.ceph.com/issues/46293 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d5e523cc40e6..9a09d12569bd 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4330,6 +4330,9 @@ static void delayed_work(struct work_struct *work) dout("mdsc delayed_work\n"); + if (mdsc->stopping) + return; + mutex_lock(&mdsc->mutex); renew_interval = mdsc->mdsmap->m_session_timeout >> 2; renew_caps = time_after_eq(jiffies, HZ*renew_interval + @@ -4689,7 +4692,16 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) { dout("stop\n"); - cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ + /* + * Make sure the delayed work stopped before releasing + * the resources. + * + * Because the cancel_delayed_work_sync() will only + * guarantee that the work finishes executing. But the + * delayed work will re-arm itself again after that. + */ + flush_delayed_work(&mdsc->delayed_work); + if (mdsc->mdsmap) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); -- cgit v1.2.3 From 585d72f33e7083972030fac7792ea3050a4a8dff Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 30 Jun 2020 15:36:21 -0400 Subject: ceph: clean up and optimize ceph_check_delayed_caps() Make this loop look a bit more sane. Also optimize away the spinlock release/reacquire if we can't get an inode reference. Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5f4894063a73..55ccccf77cea 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4189,10 +4189,8 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) struct ceph_inode_info *ci; dout("check_delayed_caps\n"); - while (1) { - spin_lock(&mdsc->cap_delay_lock); - if (list_empty(&mdsc->cap_delay_list)) - break; + spin_lock(&mdsc->cap_delay_lock); + while (!list_empty(&mdsc->cap_delay_list)) { ci = list_first_entry(&mdsc->cap_delay_list, struct ceph_inode_info, i_cap_delay_list); @@ -4202,13 +4200,13 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) list_del_init(&ci->i_cap_delay_list); inode = igrab(&ci->vfs_inode); - spin_unlock(&mdsc->cap_delay_lock); - if (inode) { + spin_unlock(&mdsc->cap_delay_lock); dout("check_delayed_caps on %p\n", inode); ceph_check_caps(ci, 0, NULL); /* avoid calling iput_final() in tick thread */ ceph_async_iput(inode); + spin_lock(&mdsc->cap_delay_lock); } } spin_unlock(&mdsc->cap_delay_lock); -- cgit v1.2.3 From d1d9655052606fd9078e896668ec90191372d513 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 6 Jul 2020 08:51:35 -0400 Subject: ceph: do not access the kiocb after aio requests In aio case, if the completion comes very fast just before the ceph_read_iter() returns to fs/aio.c, the kiocb will be freed in the completion callback, then if ceph_read_iter() access again we will potentially hit the use-after-free bug. [ jlayton: initialize direct_lock early, and use it everywhere ] URL: https://tracker.ceph.com/issues/45649 Signed-off-by: Xiubo Li Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 160644ddaeed..d51c3f2fdca0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1538,6 +1538,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct page *pinned_page = NULL; + bool direct_lock = iocb->ki_flags & IOCB_DIRECT; ssize_t ret; int want, got = 0; int retry_op = 0, read = 0; @@ -1546,7 +1547,7 @@ again: dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_start_io_direct(inode); else ceph_start_io_read(inode); @@ -1603,7 +1604,7 @@ again: } ceph_put_cap_refs(ci, got); - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_end_io_direct(inode); else ceph_end_io_read(inode); -- cgit v1.2.3 From c00e4522adff010141ce86839bb4fe494c853077 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Wed, 8 Jul 2020 07:03:22 +0000 Subject: ceph: remove unnecessary cast in kfree() Remove unnecassary casts in the argument to kfree. Signed-off-by: Xu Wang Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 71ee34d160c3..3a733ac33d9b 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -497,10 +497,10 @@ static int __set_xattr(struct ceph_inode_info *ci, kfree(*newxattr); *newxattr = NULL; if (xattr->should_free_val) - kfree((void *)xattr->val); + kfree(xattr->val); if (update_xattr) { - kfree((void *)name); + kfree(name); name = xattr->name; } ci->i_xattrs.names_size -= xattr->name_len; @@ -566,9 +566,9 @@ static void __free_xattr(struct ceph_inode_xattr *xattr) BUG_ON(!xattr); if (xattr->should_free_name) - kfree((void *)xattr->name); + kfree(xattr->name); if (xattr->should_free_val) - kfree((void *)xattr->val); + kfree(xattr->val); kfree(xattr); } @@ -582,9 +582,9 @@ static int __remove_xattr(struct ceph_inode_info *ci, rb_erase(&xattr->node, &ci->i_xattrs.index); if (xattr->should_free_name) - kfree((void *)xattr->name); + kfree(xattr->name); if (xattr->should_free_val) - kfree((void *)xattr->val); + kfree(xattr->val); ci->i_xattrs.names_size -= xattr->name_len; ci->i_xattrs.vals_size -= xattr->val_len; -- cgit v1.2.3 From 94f17c00d6687993101372f996cf6690ec9adf83 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Wed, 8 Jul 2020 08:53:28 +0200 Subject: libceph: replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. [ idryomov: Do the same for the CRUSH paper and replace ceph.newdream.net with ceph.io. ] Signed-off-by: Alexander A. Klimov Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/Kconfig | 2 +- include/linux/crush/crush.h | 2 +- net/ceph/Kconfig | 2 +- net/ceph/ceph_hash.c | 2 +- net/ceph/crush/hash.c | 2 +- net/ceph/crush/mapper.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index cf235f6eacf9..471e40156065 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -13,7 +13,7 @@ config CEPH_FS scalable file system designed to provide high performance, reliable access to petabytes of storage. - More information at http://ceph.newdream.net/. + More information at https://ceph.io/. If unsure, say N. diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 33c16f2de7f6..2f811baf78d2 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -17,7 +17,7 @@ * The algorithm was originally described in detail in this paper * (although the algorithm has evolved somewhat since then): * - * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf + * https://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf * * LGPL2 */ diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index d7bec7adc267..f36f9a3a4e20 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -13,7 +13,7 @@ config CEPH_LIB common functionality to both the Ceph filesystem and to the rados block device (rbd). - More information at http://ceph.newdream.net/. + More information at https://ceph.io/. If unsure, say N. diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c index 9a5850f264ed..81e1e006c540 100644 --- a/net/ceph/ceph_hash.c +++ b/net/ceph/ceph_hash.c @@ -4,7 +4,7 @@ /* * Robert Jenkin's hash function. - * http://burtleburtle.net/bob/hash/evahash.html + * https://burtleburtle.net/bob/hash/evahash.html * This is in the public domain. */ #define mix(a, b, c) \ diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c index e5cc603cdb17..fe79f6d2d0db 100644 --- a/net/ceph/crush/hash.c +++ b/net/ceph/crush/hash.c @@ -7,7 +7,7 @@ /* * Robert Jenkins' function for mixing 32-bit values - * http://burtleburtle.net/bob/hash/evahash.html + * https://burtleburtle.net/bob/hash/evahash.html * a, b = random bits, c = input and output */ #define crush_hashmix(a, b, c) do { \ diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 3f323ed9df52..07e5614eb3f1 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -298,7 +298,7 @@ static __u64 crush_ln(unsigned int xin) * * for reference, see: * - * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables + * https://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables * */ -- cgit v1.2.3 From aaf5a476201bf93bdab75d6922340516ee63f7e2 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 17 Jul 2020 09:25:13 -0400 Subject: ceph: check the sesion state and return false in case it is closed If the session is already in closed state, we should skip it. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9a09d12569bd..ef8a1179171b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4303,6 +4303,7 @@ bool check_session_state(struct ceph_mds_session *s) } if (s->s_state == CEPH_MDS_SESSION_NEW || s->s_state == CEPH_MDS_SESSION_RESTARTING || + s->s_state == CEPH_MDS_SESSION_CLOSED || s->s_state == CEPH_MDS_SESSION_REJECTED) /* this mds is failed or recovering, just wait */ return false; -- cgit v1.2.3 From 18f473b384a64cef69f166a3e2b73d3d2eca82c6 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 16 Jul 2020 10:05:57 -0400 Subject: ceph: periodically send perf metrics to MDSes This will send the caps/read/write/metadata metrics to any available MDS once per second, which will be the same as the userland client. It will skip the MDS sessions which don't support the metric collection, as the MDSs will close socket connections when they get an unknown type message. We can disable the metric sending via the disable_send_metrics module parameter. [ jlayton: fix up endianness bug in ceph_mdsc_send_metrics() ] URL: https://tracker.ceph.com/issues/43215 Signed-off-by: Xiubo Li Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 3 + fs/ceph/mds_client.h | 4 +- fs/ceph/metric.c | 148 +++++++++++++++++++++++++++++++++++++++++++ fs/ceph/metric.h | 77 ++++++++++++++++++++++ fs/ceph/super.c | 42 ++++++++++++ fs/ceph/super.h | 2 + include/linux/ceph/ceph_fs.h | 1 + 7 files changed, 276 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ef8a1179171b..d6cd2e4f0bc8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3334,6 +3334,8 @@ static void handle_session(struct ceph_mds_session *session, session->s_state = CEPH_MDS_SESSION_OPEN; session->s_features = features; renewed_caps(mdsc, session, 0); + if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) + metric_schedule_delayed(&mdsc->metric); wake = 1; if (mdsc->stopping) __close_session(mdsc, session); @@ -4725,6 +4727,7 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc) ceph_metric_destroy(&mdsc->metric); + flush_delayed_work(&mdsc->metric.delayed_work); fsc->mdsc = NULL; kfree(mdsc); dout("mdsc_destroy %p done\n", mdsc); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 6147ff0a1cdf..bc9e95937d7c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -28,8 +28,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_LAZY_CAP_WANTED, CEPHFS_FEATURE_MULTI_RECONNECT, CEPHFS_FEATURE_DELEG_INO, + CEPHFS_FEATURE_METRIC_COLLECT, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT, }; /* @@ -43,6 +44,7 @@ enum ceph_feature_type { CEPHFS_FEATURE_LAZY_CAP_WANTED, \ CEPHFS_FEATURE_MULTI_RECONNECT, \ CEPHFS_FEATURE_DELEG_INO, \ + CEPHFS_FEATURE_METRIC_COLLECT, \ \ CEPHFS_FEATURE_MAX, \ } diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 269eacbd2a15..2466b261fba2 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -1,10 +1,150 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include #include #include #include #include "metric.h" +#include "mds_client.h" + +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + struct ceph_metric_head *head; + struct ceph_metric_cap *cap; + struct ceph_metric_read_latency *read; + struct ceph_metric_write_latency *write; + struct ceph_metric_metadata_latency *meta; + struct ceph_client_metric *m = &mdsc->metric; + u64 nr_caps = atomic64_read(&m->total_caps); + struct ceph_msg *msg; + struct timespec64 ts; + s64 sum; + s32 items = 0; + s32 len; + + len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) + + sizeof(*meta); + + msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); + if (!msg) { + pr_err("send metrics to mds%d, failed to allocate message\n", + s->s_mds); + return false; + } + + head = msg->front.iov_base; + + /* encode the cap metric */ + cap = (struct ceph_metric_cap *)(head + 1); + cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO); + cap->ver = 1; + cap->compat = 1; + cap->data_len = cpu_to_le32(sizeof(*cap) - 10); + cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit)); + cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis)); + cap->total = cpu_to_le64(nr_caps); + items++; + + /* encode the read latency metric */ + read = (struct ceph_metric_read_latency *)(cap + 1); + read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); + read->ver = 1; + read->compat = 1; + read->data_len = cpu_to_le32(sizeof(*read) - 10); + sum = m->read_latency_sum; + jiffies_to_timespec64(sum, &ts); + read->sec = cpu_to_le32(ts.tv_sec); + read->nsec = cpu_to_le32(ts.tv_nsec); + items++; + + /* encode the write latency metric */ + write = (struct ceph_metric_write_latency *)(read + 1); + write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); + write->ver = 1; + write->compat = 1; + write->data_len = cpu_to_le32(sizeof(*write) - 10); + sum = m->write_latency_sum; + jiffies_to_timespec64(sum, &ts); + write->sec = cpu_to_le32(ts.tv_sec); + write->nsec = cpu_to_le32(ts.tv_nsec); + items++; + + /* encode the metadata latency metric */ + meta = (struct ceph_metric_metadata_latency *)(write + 1); + meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); + meta->ver = 1; + meta->compat = 1; + meta->data_len = cpu_to_le32(sizeof(*meta) - 10); + sum = m->metadata_latency_sum; + jiffies_to_timespec64(sum, &ts); + meta->sec = cpu_to_le32(ts.tv_sec); + meta->nsec = cpu_to_le32(ts.tv_nsec); + items++; + + put_unaligned_le32(items, &head->num); + msg->front.iov_len = len; + msg->hdr.version = cpu_to_le16(1); + msg->hdr.compat_version = cpu_to_le16(1); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("client%llu send metrics to mds%d\n", + ceph_client_gid(mdsc->fsc->client), s->s_mds); + ceph_con_send(&s->s_con, msg); + + return true; +} + + +static void metric_get_session(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_session *s; + int i; + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + s = __ceph_lookup_mds_session(mdsc, i); + if (!s) + continue; + + /* + * Skip it if MDS doesn't support the metric collection, + * or the MDS will close the session's socket connection + * directly when it get this message. + */ + if (check_session_state(s) && + test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) { + mdsc->metric.session = s; + break; + } + + ceph_put_mds_session(s); + } + mutex_unlock(&mdsc->mutex); +} + +static void metric_delayed_work(struct work_struct *work) +{ + struct ceph_client_metric *m = + container_of(work, struct ceph_client_metric, delayed_work.work); + struct ceph_mds_client *mdsc = + container_of(m, struct ceph_mds_client, metric); + + if (mdsc->stopping) + return; + + if (!m->session || !check_session_state(m->session)) { + if (m->session) { + ceph_put_mds_session(m->session); + m->session = NULL; + } + metric_get_session(mdsc); + } + if (m->session) { + ceph_mdsc_send_metrics(mdsc, m->session); + metric_schedule_delayed(m); + } +} int ceph_metric_init(struct ceph_client_metric *m) { @@ -52,6 +192,9 @@ int ceph_metric_init(struct ceph_client_metric *m) m->total_metadatas = 0; m->metadata_latency_sum = 0; + m->session = NULL; + INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work); + return 0; err_i_caps_mis: @@ -73,6 +216,11 @@ void ceph_metric_destroy(struct ceph_client_metric *m) percpu_counter_destroy(&m->i_caps_hit); percpu_counter_destroy(&m->d_lease_mis); percpu_counter_destroy(&m->d_lease_hit); + + cancel_delayed_work_sync(&m->delayed_work); + + if (m->session) + ceph_put_mds_session(m->session); } static inline void __update_latency(ktime_t *totalp, ktime_t *lsump, diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index 23a3373d5a3d..fe5d07d2e63a 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -6,6 +6,71 @@ #include #include +extern bool disable_send_metrics; + +enum ceph_metric_type { + CLIENT_METRIC_TYPE_CAP_INFO, + CLIENT_METRIC_TYPE_READ_LATENCY, + CLIENT_METRIC_TYPE_WRITE_LATENCY, + CLIENT_METRIC_TYPE_METADATA_LATENCY, + CLIENT_METRIC_TYPE_DENTRY_LEASE, + + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE, +}; + +/* metric caps header */ +struct ceph_metric_cap { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(hit + mis + total) */ + __le64 hit; + __le64 mis; + __le64 total; +} __packed; + +/* metric read latency header */ +struct ceph_metric_read_latency { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(sec + nsec) */ + __le32 sec; + __le32 nsec; +} __packed; + +/* metric write latency header */ +struct ceph_metric_write_latency { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(sec + nsec) */ + __le32 sec; + __le32 nsec; +} __packed; + +/* metric metadata latency header */ +struct ceph_metric_metadata_latency { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(sec + nsec) */ + __le32 sec; + __le32 nsec; +} __packed; + +struct ceph_metric_head { + __le32 num; /* the number of metrics that will be sent */ +} __packed; + /* This is the global metrics */ struct ceph_client_metric { atomic64_t total_dentries; @@ -36,8 +101,20 @@ struct ceph_client_metric { ktime_t metadata_latency_sq_sum; ktime_t metadata_latency_min; ktime_t metadata_latency_max; + + struct ceph_mds_session *session; + struct delayed_work delayed_work; /* delayed work */ }; +static inline void metric_schedule_delayed(struct ceph_client_metric *m) +{ + if (disable_send_metrics) + return; + + /* per second */ + schedule_delayed_work(&m->delayed_work, round_jiffies_relative(HZ)); +} + extern int ceph_metric_init(struct ceph_client_metric *m); extern void ceph_metric_destroy(struct ceph_client_metric *m); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c9784eb1159a..933f5df5da7d 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -27,6 +27,9 @@ #include #include +static DEFINE_SPINLOCK(ceph_fsc_lock); +static LIST_HEAD(ceph_fsc_list); + /* * Ceph superblock operations * @@ -691,6 +694,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, if (!fsc->wb_pagevec_pool) goto fail_cap_wq; + spin_lock(&ceph_fsc_lock); + list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list); + spin_unlock(&ceph_fsc_lock); + return fsc; fail_cap_wq: @@ -717,6 +724,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); + spin_lock(&ceph_fsc_lock); + list_del(&fsc->metric_wakeup); + spin_unlock(&ceph_fsc_lock); + ceph_mdsc_destroy(fsc); destroy_workqueue(fsc->inode_wq); destroy_workqueue(fsc->cap_wq); @@ -1282,6 +1293,37 @@ static void __exit exit_ceph(void) destroy_caches(); } +static int param_set_metrics(const char *val, const struct kernel_param *kp) +{ + struct ceph_fs_client *fsc; + int ret; + + ret = param_set_bool(val, kp); + if (ret) { + pr_err("Failed to parse sending metrics switch value '%s'\n", + val); + return ret; + } else if (!disable_send_metrics) { + // wake up all the mds clients + spin_lock(&ceph_fsc_lock); + list_for_each_entry(fsc, &ceph_fsc_list, metric_wakeup) { + metric_schedule_delayed(&fsc->mdsc->metric); + } + spin_unlock(&ceph_fsc_lock); + } + + return 0; +} + +static const struct kernel_param_ops param_ops_metrics = { + .set = param_set_metrics, + .get = param_get_bool, +}; + +bool disable_send_metrics = false; +module_param_cb(disable_send_metrics, ¶m_ops_metrics, &disable_send_metrics, 0644); +MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)"); + module_init(init_ceph); module_exit(exit_ceph); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5a6cdd39bc10..2dcb6a90c636 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -101,6 +101,8 @@ struct ceph_mount_options { struct ceph_fs_client { struct super_block *sb; + struct list_head metric_wakeup; + struct ceph_mount_options *mount_options; struct ceph_client *client; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index ebf5ba62b772..455e9b9e2adf 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -130,6 +130,7 @@ struct ceph_dir_layout { #define CEPH_MSG_CLIENT_REQUEST 24 #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25 #define CEPH_MSG_CLIENT_REPLY 26 +#define CEPH_MSG_CLIENT_METRICS 29 #define CEPH_MSG_CLIENT_CAPS 0x310 #define CEPH_MSG_CLIENT_LEASE 0x311 #define CEPH_MSG_CLIENT_SNAP 0x312 -- cgit v1.2.3 From 3b4168dd8b1d3e0bb129cf41e6bb50e217fe7781 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 16 Jul 2020 10:05:58 -0400 Subject: ceph: send client provided metric flags in client metadata Send metric flags to the MDS, indicating what metrics the client supports. Currently that consists of cap statistics, and read, write and metadata latencies. URL: https://tracker.ceph.com/issues/43435 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/ceph/metric.h | 13 ++++++++++++ 2 files changed, 71 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d6cd2e4f0bc8..af7221d1c610 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1194,6 +1194,48 @@ static int encode_supported_features(void **p, void *end) return 0; } +static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED; +#define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8) +static int encode_metric_spec(void **p, void *end) +{ + static const size_t count = ARRAY_SIZE(metric_bits); + + /* header */ + if (WARN_ON_ONCE(*p + 2 > end)) + return -ERANGE; + + ceph_encode_8(p, 1); /* version */ + ceph_encode_8(p, 1); /* compat */ + + if (count > 0) { + size_t i; + size_t size = METRIC_BYTES(count); + + if (WARN_ON_ONCE(*p + 4 + 4 + size > end)) + return -ERANGE; + + /* metric spec info length */ + ceph_encode_32(p, 4 + size); + + /* metric spec */ + ceph_encode_32(p, size); + memset(*p, 0, size); + for (i = 0; i < count; i++) + ((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8); + *p += size; + } else { + if (WARN_ON_ONCE(*p + 4 + 4 > end)) + return -ERANGE; + + /* metric spec info length */ + ceph_encode_32(p, 4); + /* metric spec */ + ceph_encode_32(p, 0); + } + + return 0; +} + /* * session message, specialization for CEPH_SESSION_REQUEST_OPEN * to include additional client metadata fields. @@ -1234,6 +1276,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 size = FEATURE_BYTES(count); extra_bytes += 4 + size; + /* metric spec */ + size = 0; + count = ARRAY_SIZE(metric_bits); + if (count > 0) + size = METRIC_BYTES(count); + extra_bytes += 2 + 4 + 4 + size; + /* Allocate the message */ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, GFP_NOFS, false); @@ -1252,9 +1301,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 * Serialize client metadata into waiting buffer space, using * the format that userspace expects for map * - * ClientSession messages with metadata are v3 + * ClientSession messages with metadata are v4 */ - msg->hdr.version = cpu_to_le16(3); + msg->hdr.version = cpu_to_le16(4); msg->hdr.compat_version = cpu_to_le16(1); /* The write pointer, following the session_head structure */ @@ -1283,6 +1332,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 return ERR_PTR(ret); } + ret = encode_metric_spec(&p, end); + if (ret) { + pr_err("encode_metric_spec failed!\n"); + ceph_msg_put(msg); + return ERR_PTR(ret); + } + msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index fe5d07d2e63a..1d0959d669d7 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -18,6 +18,19 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE, }; +/* + * This will always have the highest metric bit value + * as the last element of the array. + */ +#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ + CLIENT_METRIC_TYPE_CAP_INFO, \ + CLIENT_METRIC_TYPE_READ_LATENCY, \ + CLIENT_METRIC_TYPE_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_METADATA_LATENCY, \ + \ + CLIENT_METRIC_TYPE_MAX, \ +} + /* metric caps header */ struct ceph_metric_cap { __le32 type; /* ceph metric type */ -- cgit v1.2.3 From f1f565a26976612121f97464f9245307422d0ce8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Jul 2020 16:36:04 -0700 Subject: ceph: delete repeated words in fs/ceph/ Drop duplicated words "down" and "the" in fs/ceph/. [ idryomov: merge into a single patch ] Signed-off-by: Randy Dunlap Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 2 +- fs/ceph/super.h | 2 +- include/linux/ceph/ceph_features.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 933f5df5da7d..585aecea5cad 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -839,7 +839,7 @@ static void destroy_caches(void) } /* - * ceph_umount_begin - initiate forced umount. Tear down down the + * ceph_umount_begin - initiate forced umount. Tear down the * mount, skipping steps that may hang while waiting for server(s). */ static void ceph_umount_begin(struct super_block *sb) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 2dcb6a90c636..9001a896ae8c 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -355,7 +355,7 @@ struct ceph_inode_info { unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ /* - * Link to the the auth cap's session's s_cap_dirty list. s_cap_dirty + * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty * is protected by the mdsc->cap_dirty_lock, but each individual item * is also protected by the inode's i_ceph_lock. Walking s_cap_dirty * requires the mdsc->cap_dirty_lock. List presence for an item can diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 39e6f4c57580..fcd84e8d88f4 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -58,7 +58,7 @@ * because 10.2.z (jewel) did not care if its peers advertised this * feature bit. * - * - In the second phase we stop advertising the the bit and call it + * - In the second phase we stop advertising the bit and call it * RETIRED. This can normally be done in the *next* major release * following the one in which we marked the feature DEPRECATED. In * the above example, for 12.0.z (luminous) we can say: -- cgit v1.2.3 From 8e298deb8d8c449d87acceab8c8bfef71b67b08d Mon Sep 17 00:00:00 2001 From: Jia Yang Date: Thu, 23 Jul 2020 10:25:52 +0800 Subject: ceph: remove unused variables in ceph_mdsmap_decode() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build warnings: fs/ceph/mdsmap.c: In function ‘ceph_mdsmap_decode’: fs/ceph/mdsmap.c:192:7: warning: variable ‘info_cv’ set but not used [-Wunused-but-set-variable] fs/ceph/mdsmap.c:177:7: warning: variable ‘state_seq’ set but not used [-Wunused-but-set-variable] fs/ceph/mdsmap.c:123:15: warning: variable ‘mdsmap_cv’ set but not used [-Wunused-but-set-variable] Note that p is increased in ceph_decode_*. Signed-off-by: Jia Yang Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/mdsmap.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 889627817e52..e4aba6c6d3b5 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -120,7 +120,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) const void *start = *p; int i, j, n; int err; - u8 mdsmap_v, mdsmap_cv; + u8 mdsmap_v; u16 mdsmap_ev; m = kzalloc(sizeof(*m), GFP_NOFS); @@ -129,7 +129,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ceph_decode_need(p, end, 1 + 1, bad); mdsmap_v = ceph_decode_8(p); - mdsmap_cv = ceph_decode_8(p); + *p += sizeof(u8); /* mdsmap_cv */ if (mdsmap_v >= 4) { u32 mdsmap_len; ceph_decode_32_safe(p, end, mdsmap_len, bad); @@ -174,7 +174,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u64 global_id; u32 namelen; s32 mds, inc, state; - u64 state_seq; u8 info_v; void *info_end = NULL; struct ceph_entity_addr addr; @@ -189,9 +188,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info_v= ceph_decode_8(p); if (info_v >= 4) { u32 info_len; - u8 info_cv; ceph_decode_need(p, end, 1 + sizeof(u32), bad); - info_cv = ceph_decode_8(p); + *p += sizeof(u8); /* info_cv */ info_len = ceph_decode_32(p); info_end = *p + info_len; if (info_end > end) @@ -210,7 +208,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) mds = ceph_decode_32(p); inc = ceph_decode_32(p); state = ceph_decode_32(p); - state_seq = ceph_decode_64(p); + *p += sizeof(u64); /* state_seq */ err = ceph_decode_entity_addr(p, end, &addr); if (err) goto corrupt; -- cgit v1.2.3 From a7caa88f8b72c136f9a401f498471b8a8e35370d Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 23 Jul 2020 15:32:25 +0800 Subject: ceph: fix use-after-free for fsc->mdsc If the ceph_mdsc_init() fails, it will free the mdsc already. Reported-by: syzbot+b57f46d8d6ea51960b8c@syzkaller.appspotmail.com Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index af7221d1c610..590822fab767 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4453,7 +4453,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) goto err_mdsc; } - fsc->mdsc = mdsc; init_completion(&mdsc->safe_umount_waiters); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); @@ -4508,6 +4507,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) strscpy(mdsc->nodename, utsname()->nodename, sizeof(mdsc->nodename)); + + fsc->mdsc = mdsc; return 0; err_mdsmap: -- cgit v1.2.3 From 2c81ef286c42c5bfc0d0a60219b781791d4bd55c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 23 Jul 2020 16:22:40 +0100 Subject: ceph: remove redundant initialization of variable mds The variable mds is being initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3030f5585085..97539b497e4c 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -262,7 +262,7 @@ static int mds_sessions_show(struct seq_file *s, void *ptr) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_auth_client *ac = fsc->client->monc.auth; struct ceph_options *opt = fsc->client->options; - int mds = -1; + int mds; mutex_lock(&mdsc->mutex); -- cgit v1.2.3 From b748fc7a8763a5b3f8149f12c45711cd73ef8176 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 28 Jul 2020 10:34:20 -0400 Subject: ceph: set sec_context xattr on symlink creation Symlink inodes should have the security context set in their xattrs on creation. We already set the context on creation, but we don't attach the pagelist. The effect is that symlink inodes don't get an SELinux context set on them at creation, so they end up unlabeled instead of inheriting the proper context. Make it do so. Cc: stable@vger.kernel.org Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 39f5311404b0..060bdcc5ce32 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -930,6 +930,10 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, req->r_num_caps = 2; req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + if (as_ctx.pagelist) { + req->r_pagelist = as_ctx.pagelist; + as_ctx.pagelist = NULL; + } err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); -- cgit v1.2.3 From a0102bda5bc0991c5c8c7c07770b236894a810fd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 30 Jul 2020 11:03:55 -0400 Subject: ceph: move sb->wb_pagevec_pool to be a global mempool When doing some testing recently, I hit some page allocation failures on mount, when creating the wb_pagevec_pool for the mount. That requires 128k (32 contiguous pages), and after thrashing the memory during an xfstests run, sometimes that would fail. 128k for each mount seems like a lot to hold in reserve for a rainy day, so let's change this to a global mempool that gets allocated when the module is plugged in. Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 23 +++++++++++------------ fs/ceph/super.c | 22 ++++++++-------------- fs/ceph/super.h | 2 -- include/linux/ceph/libceph.h | 1 + 4 files changed, 20 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 01ad09733ac7..6ea761c84494 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -862,8 +862,7 @@ static void writepages_finish(struct ceph_osd_request *req) osd_data = osd_req_op_extent_osd_data(req, 0); if (osd_data->pages_from_pool) - mempool_free(osd_data->pages, - ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); + mempool_free(osd_data->pages, ceph_wb_pagevec_pool); else kfree(osd_data->pages); ceph_osdc_put_request(req); @@ -955,10 +954,10 @@ retry: int num_ops = 0, op_idx; unsigned i, pvec_pages, max_pages, locked_pages = 0; struct page **pages = NULL, **data_pages; - mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; pgoff_t strip_unit_end = 0; u64 offset = 0, len = 0; + bool from_pool = false; max_pages = wsize >> PAGE_SHIFT; @@ -1057,16 +1056,16 @@ get_more_pages: sizeof(*pages), GFP_NOFS); if (!pages) { - pool = fsc->wb_pagevec_pool; - pages = mempool_alloc(pool, GFP_NOFS); + from_pool = true; + pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); BUG_ON(!pages); } len = 0; } else if (page->index != (offset + len) >> PAGE_SHIFT) { - if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS : - CEPH_OSD_MAX_OPS)) { + if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS : + CEPH_OSD_MAX_OPS)) { redirty_page_for_writepage(wbc, page); unlock_page(page); break; @@ -1161,7 +1160,7 @@ new_request: offset, len); osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 0, - !!pool, false); + from_pool, false); osd_req_op_extent_update(req, op_idx, len); len = 0; @@ -1188,12 +1187,12 @@ new_request: dout("writepages got pages at %llu~%llu\n", offset, len); osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, - 0, !!pool, false); + 0, from_pool, false); osd_req_op_extent_update(req, op_idx, len); BUG_ON(op_idx + 1 != req->r_num_ops); - pool = NULL; + from_pool = false; if (i < locked_pages) { BUG_ON(num_ops <= req->r_num_ops); num_ops -= req->r_num_ops; @@ -1204,8 +1203,8 @@ new_request: pages = kmalloc_array(locked_pages, sizeof(*pages), GFP_NOFS); if (!pages) { - pool = fsc->wb_pagevec_pool; - pages = mempool_alloc(pool, GFP_NOFS); + from_pool = true; + pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); BUG_ON(!pages); } memcpy(pages, data_pages + i, diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 585aecea5cad..7ec0e6d03d10 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -637,8 +637,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - int page_count; - size_t size; int err; fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); @@ -686,22 +684,12 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, if (!fsc->cap_wq) goto fail_inode_wq; - /* set up mempools */ - err = -ENOMEM; - page_count = fsc->mount_options->wsize >> PAGE_SHIFT; - size = sizeof (struct page *) * (page_count ? page_count : 1); - fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size); - if (!fsc->wb_pagevec_pool) - goto fail_cap_wq; - spin_lock(&ceph_fsc_lock); list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list); spin_unlock(&ceph_fsc_lock); return fsc; -fail_cap_wq: - destroy_workqueue(fsc->cap_wq); fail_inode_wq: destroy_workqueue(fsc->inode_wq); fail_client: @@ -732,8 +720,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) destroy_workqueue(fsc->inode_wq); destroy_workqueue(fsc->cap_wq); - mempool_destroy(fsc->wb_pagevec_pool); - destroy_mount_options(fsc->mount_options); ceph_destroy_client(fsc->client); @@ -752,6 +738,7 @@ struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; struct kmem_cache *ceph_dir_file_cachep; struct kmem_cache *ceph_mds_request_cachep; +mempool_t *ceph_wb_pagevec_pool; static void ceph_inode_init_once(void *foo) { @@ -796,6 +783,10 @@ static int __init init_caches(void) if (!ceph_mds_request_cachep) goto bad_mds_req; + ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT); + if (!ceph_wb_pagevec_pool) + goto bad_pagevec_pool; + error = ceph_fscache_register(); if (error) goto bad_fscache; @@ -804,6 +795,8 @@ static int __init init_caches(void) bad_fscache: kmem_cache_destroy(ceph_mds_request_cachep); +bad_pagevec_pool: + mempool_destroy(ceph_wb_pagevec_pool); bad_mds_req: kmem_cache_destroy(ceph_dir_file_cachep); bad_dir_file: @@ -834,6 +827,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_file_cachep); kmem_cache_destroy(ceph_dir_file_cachep); kmem_cache_destroy(ceph_mds_request_cachep); + mempool_destroy(ceph_wb_pagevec_pool); ceph_fscache_unregister(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 9001a896ae8c..4c3c964b1c54 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -118,8 +118,6 @@ struct ceph_fs_client { struct ceph_mds_client *mdsc; - /* writeback */ - mempool_t *wb_pagevec_pool; atomic_long_t writeback_count; struct workqueue_struct *inode_wq; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index e5ed1c541e7f..c8645f0b797d 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -282,6 +282,7 @@ extern struct kmem_cache *ceph_dentry_cachep; extern struct kmem_cache *ceph_file_cachep; extern struct kmem_cache *ceph_dir_file_cachep; extern struct kmem_cache *ceph_mds_request_cachep; +extern mempool_t *ceph_wb_pagevec_pool; /* ceph_common.c */ extern bool libceph_compatible(void *data); -- cgit v1.2.3 From 224c7b6778fe08e1880ef88867051bec0a154d6c Mon Sep 17 00:00:00 2001 From: Yanhu Cao Date: Fri, 31 Jul 2020 16:25:13 +0800 Subject: ceph: use frag's MDS in either mode When doing some tests with multiple mds, we were seeing many mds forwarding requests between them, causing clients to resend. If the request is a modification operation and the mode is set to USE_AUTH_MDS, then the auth mds should be selected to handle the request. If auth mds for frag is already set, then it should be returned directly without further processing. The current logic is wrong because it only returns directly if mode is USE_AUTH_MDS, but we want to do that for all modes. If we don't, then when the frag's mds is not equal to cap session's mds, the request will get sent to the wrong MDS needlessly. Drop the mode check in this condition. Signed-off-by: Yanhu Cao Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 590822fab767..1095802ad9bd 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1103,8 +1103,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, frag.frag, mds); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE) { - if (mode == USE_ANY_MDS && - !ceph_mdsmap_is_laggy(mdsc->mdsmap, + if (!ceph_mdsmap_is_laggy(mdsc->mdsmap, mds)) goto out; } -- cgit v1.2.3 From 02e37571f9e79022498fd0525c073b07e9d9ac69 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 4 Aug 2020 12:31:56 -0400 Subject: ceph: handle zero-length feature mask in session messages Most session messages contain a feature mask, but the MDS will routinely send a REJECT message with one that is zero-length. Commit 0fa8263367db ("ceph: fix endianness bug when handling MDS session feature bits") fixed the decoding of the feature mask, but failed to account for the MDS sending a zero-length feature mask. This causes REJECT message decoding to fail. Skip trying to decode a feature mask if the word count is zero. Cc: stable@vger.kernel.org URL: https://tracker.ceph.com/issues/46823 Fixes: 0fa8263367db ("ceph: fix endianness bug when handling MDS session feature bits") Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Tested-by: Patrick Donnelly Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1095802ad9bd..4a26862d7667 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3358,8 +3358,10 @@ static void handle_session(struct ceph_mds_session *session, goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); - ceph_decode_64_safe(&p, end, features, bad); - p += len - sizeof(features); + if (len) { + ceph_decode_64_safe(&p, end, features, bad); + p += len - sizeof(features); + } } mutex_lock(&mdsc->mutex); -- cgit v1.2.3