From 03bc06c7b0bd8d86b9f17f459acaeb1283ba2700 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 26 Feb 2022 06:33:03 -0500 Subject: ceph: add new mount option to enable sparse reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new mount option that has the client issue sparse reads instead of normal ones. The callers now preallocate an sparse extent buffer that the libceph receive code can populate and hand back after the operation completes. After a successful sparse read, we can't use the req->r_result value to determine the amount of data "read", so instead we set the received length to be from the end of the last extent in the buffer. Any interstitial holes will have been filled by the receive code. [ xiubli: fix a double free on req reported by Ilya ] Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Reviewed-and-tested-by: Luís Henriques Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'fs/ceph/super.c') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a5f52013314d..d6a1790f6923 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -165,6 +165,7 @@ enum { Opt_copyfrom, Opt_wsync, Opt_pagecache, + Opt_sparseread, }; enum ceph_recover_session_mode { @@ -207,6 +208,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), fsparam_flag_no ("pagecache", Opt_pagecache), + fsparam_flag_no ("sparseread", Opt_sparseread), {} }; @@ -576,6 +578,12 @@ static int ceph_parse_mount_param(struct fs_context *fc, else fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE; break; + case Opt_sparseread: + if (result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_SPARSEREAD; + else + fsopt->flags |= CEPH_MOUNT_OPT_SPARSEREAD; + break; default: BUG(); } @@ -710,9 +718,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) seq_puts(m, ",wsync"); - if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) seq_puts(m, ",nopagecache"); + if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD) + seq_puts(m, ",sparseread"); if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); @@ -1296,6 +1305,11 @@ static int ceph_reconfigure_fc(struct fs_context *fc) else ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD) + ceph_set_mount_opt(fsc, SPARSEREAD); + else + ceph_clear_mount_opt(fsc, SPARSEREAD); + if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) { kfree(fsc->mount_options->mon_addr); fsc->mount_options->mon_addr = fsopt->mon_addr; -- cgit v1.2.3 From 2d332d5bc424404911540006a8bb450fbb96b178 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 27 Jul 2020 10:16:09 -0400 Subject: ceph: fscrypt_auth handling for ceph MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most fscrypt-enabled filesystems store the crypto context in an xattr, but that's problematic for ceph as xatts are governed by the XATTR cap, but we really want the crypto context as part of the AUTH cap. Because of this, the MDS has added two new inode metadata fields: fscrypt_auth and fscrypt_file. The former is used to hold the crypto context, and the latter is used to track the real file size. Parse new fscrypt_auth and fscrypt_file fields in inode traces. For now, we don't use fscrypt_file, but fscrypt_auth is used to hold the fscrypt context. Allow the client to use a setattr request for setting the fscrypt_auth field. Since this is not a standard setattr request from the VFS, we add a new field to __ceph_setattr that carries ceph-specific inode attrs. Have the set_context op do a setattr that sets the fscrypt_auth value, and get_context just return the contents of that field (since it should always be available). Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Reviewed-and-tested-by: Luís Henriques Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/Makefile | 1 + fs/ceph/acl.c | 4 +- fs/ceph/caps.c | 78 ++++++++++++++++++++++++----- fs/ceph/crypto.c | 77 +++++++++++++++++++++++++++++ fs/ceph/crypto.h | 36 ++++++++++++++ fs/ceph/inode.c | 64 +++++++++++++++++++++++- fs/ceph/mds_client.c | 114 +++++++++++++++++++++++++++++++++++++++---- fs/ceph/mds_client.h | 7 +++ fs/ceph/super.c | 3 ++ fs/ceph/super.h | 15 +++++- include/linux/ceph/ceph_fs.h | 21 +++++--- 11 files changed, 385 insertions(+), 35 deletions(-) create mode 100644 fs/ceph/crypto.c create mode 100644 fs/ceph/crypto.h (limited to 'fs/ceph/super.c') diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 50c635dc7f71..1f77ca04c426 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -12,3 +12,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ ceph-$(CONFIG_CEPH_FSCACHE) += cache.o ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o +ceph-$(CONFIG_FS_ENCRYPTION) += crypto.o diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 6945a938d396..8a56f979c7cb 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -140,7 +140,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, newattrs.ia_ctime = current_time(inode); newattrs.ia_mode = new_mode; newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - ret = __ceph_setattr(inode, &newattrs); + ret = __ceph_setattr(inode, &newattrs, NULL); if (ret) goto out_free; } @@ -151,7 +151,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, newattrs.ia_ctime = old_ctime; newattrs.ia_mode = old_mode; newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - __ceph_setattr(inode, &newattrs); + __ceph_setattr(inode, &newattrs, NULL); } goto out_free; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index e2bb0d0072da..1c62ef339bc6 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -14,6 +14,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include #include @@ -1216,15 +1217,12 @@ struct cap_msg_args { umode_t mode; bool inline_data; bool wake; + u32 fscrypt_auth_len; + u32 fscrypt_file_len; + u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context + u8 fscrypt_file[sizeof(u64)]; // for size }; -/* - * cap struct size + flock buffer size + inline version + inline data size + - * osd_epoch_barrier + oldest_flush_tid - */ -#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \ - 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4) - /* Marshal up the cap msg to the MDS */ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) { @@ -1240,7 +1238,7 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) arg->size, arg->max_size, arg->xattr_version, arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); - msg->hdr.version = cpu_to_le16(10); + msg->hdr.version = cpu_to_le16(12); msg->hdr.tid = cpu_to_le64(arg->flush_tid); fc = msg->front.iov_base; @@ -1311,6 +1309,21 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) /* Advisory flags (version 10) */ ceph_encode_32(&p, arg->flags); + + /* dirstats (version 11) - these are r/o on the client */ + ceph_encode_64(&p, 0); + ceph_encode_64(&p, 0); + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + /* fscrypt_auth and fscrypt_file (version 12) */ + ceph_encode_32(&p, arg->fscrypt_auth_len); + ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len); + ceph_encode_32(&p, arg->fscrypt_file_len); + ceph_encode_copy(&p, arg->fscrypt_file, arg->fscrypt_file_len); +#else /* CONFIG_FS_ENCRYPTION */ + ceph_encode_32(&p, 0); + ceph_encode_32(&p, 0); +#endif /* CONFIG_FS_ENCRYPTION */ } /* @@ -1432,7 +1445,37 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, } } arg->flags = flags; +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (ci->fscrypt_auth_len && + WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) { + /* Don't set this if it's too big */ + arg->fscrypt_auth_len = 0; + } else { + arg->fscrypt_auth_len = ci->fscrypt_auth_len; + memcpy(arg->fscrypt_auth, ci->fscrypt_auth, + min_t(size_t, ci->fscrypt_auth_len, + sizeof(arg->fscrypt_auth))); + } + /* FIXME: use this to track "real" size */ + arg->fscrypt_file_len = 0; +#endif /* CONFIG_FS_ENCRYPTION */ +} + +#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4) + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static inline int cap_msg_size(struct cap_msg_args *arg) +{ + return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len + + arg->fscrypt_file_len; } +#else +static inline int cap_msg_size(struct cap_msg_args *arg) +{ + return CAP_MSG_FIXED_FIELDS; +} +#endif /* CONFIG_FS_ENCRYPTION */ /* * Send a cap msg on the given inode. @@ -1444,7 +1487,8 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) struct ceph_msg *msg; struct inode *inode = &ci->netfs.inode; - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS, + false); if (!msg) { pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n", ceph_vinop(inode), ceph_cap_string(arg->dirty), @@ -1470,10 +1514,6 @@ static inline int __send_flush_snap(struct inode *inode, struct cap_msg_args arg; struct ceph_msg *msg; - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); - if (!msg) - return -ENOMEM; - arg.session = session; arg.ino = ceph_vino(inode).ino; arg.cid = 0; @@ -1511,6 +1551,18 @@ static inline int __send_flush_snap(struct inode *inode, arg.flags = 0; arg.wake = false; + /* + * No fscrypt_auth changes from a capsnap. It will need + * to update fscrypt_file on size changes (TODO). + */ + arg.fscrypt_auth_len = 0; + arg.fscrypt_file_len = 0; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg), + GFP_NOFS, false); + if (!msg) + return -ENOMEM; + encode_cap_msg(msg, &arg); ceph_con_send(&arg.session->s_con, msg); return 0; diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c new file mode 100644 index 000000000000..b17a6ee16cf0 --- /dev/null +++ b/fs/ceph/crypto.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include "super.h" +#include "crypto.h" + +static int ceph_crypt_get_context(struct inode *inode, void *ctx, size_t len) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fscrypt_auth *cfa = (struct ceph_fscrypt_auth *)ci->fscrypt_auth; + u32 ctxlen; + + /* Non existent or too short? */ + if (!cfa || (ci->fscrypt_auth_len < (offsetof(struct ceph_fscrypt_auth, cfa_blob) + 1))) + return -ENOBUFS; + + /* Some format we don't recognize? */ + if (le32_to_cpu(cfa->cfa_version) != CEPH_FSCRYPT_AUTH_VERSION) + return -ENOBUFS; + + ctxlen = le32_to_cpu(cfa->cfa_blob_len); + if (len < ctxlen) + return -ERANGE; + + memcpy(ctx, cfa->cfa_blob, ctxlen); + return ctxlen; +} + +static int ceph_crypt_set_context(struct inode *inode, const void *ctx, + size_t len, void *fs_data) +{ + int ret; + struct iattr attr = { }; + struct ceph_iattr cia = { }; + struct ceph_fscrypt_auth *cfa; + + WARN_ON_ONCE(fs_data); + + if (len > FSCRYPT_SET_CONTEXT_MAX_SIZE) + return -EINVAL; + + cfa = kzalloc(sizeof(*cfa), GFP_KERNEL); + if (!cfa) + return -ENOMEM; + + cfa->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION); + cfa->cfa_blob_len = cpu_to_le32(len); + memcpy(cfa->cfa_blob, ctx, len); + + cia.fscrypt_auth = cfa; + + ret = __ceph_setattr(inode, &attr, &cia); + if (ret == 0) + inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED); + kfree(cia.fscrypt_auth); + return ret; +} + +static bool ceph_crypt_empty_dir(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + return ci->i_rsubdirs + ci->i_rfiles == 1; +} + +static struct fscrypt_operations ceph_fscrypt_ops = { + .get_context = ceph_crypt_get_context, + .set_context = ceph_crypt_set_context, + .empty_dir = ceph_crypt_empty_dir, +}; + +void ceph_fscrypt_set_ops(struct super_block *sb) +{ + fscrypt_set_ops(sb, &ceph_fscrypt_ops); +} diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h new file mode 100644 index 000000000000..6dca674f79b8 --- /dev/null +++ b/fs/ceph/crypto.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Ceph fscrypt functionality + */ + +#ifndef _CEPH_CRYPTO_H +#define _CEPH_CRYPTO_H + +#include + +struct ceph_fscrypt_auth { + __le32 cfa_version; + __le32 cfa_blob_len; + u8 cfa_blob[FSCRYPT_SET_CONTEXT_MAX_SIZE]; +} __packed; + +#define CEPH_FSCRYPT_AUTH_VERSION 1 +static inline u32 ceph_fscrypt_auth_len(struct ceph_fscrypt_auth *fa) +{ + u32 ctxsize = le32_to_cpu(fa->cfa_blob_len); + + return offsetof(struct ceph_fscrypt_auth, cfa_blob) + ctxsize; +} + +#ifdef CONFIG_FS_ENCRYPTION +void ceph_fscrypt_set_ops(struct super_block *sb); + +#else /* CONFIG_FS_ENCRYPTION */ + +static inline void ceph_fscrypt_set_ops(struct super_block *sb) +{ +} + +#endif /* CONFIG_FS_ENCRYPTION */ + +#endif diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e23172024707..a3aa7870a6a2 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -14,10 +14,12 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include /* @@ -617,6 +619,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_work, ceph_inode_work); ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); +#ifdef CONFIG_FS_ENCRYPTION + ci->fscrypt_auth = NULL; + ci->fscrypt_auth_len = 0; +#endif return &ci->netfs.inode; } @@ -625,6 +631,9 @@ void ceph_free_inode(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); kfree(ci->i_symlink); +#ifdef CONFIG_FS_ENCRYPTION + kfree(ci->fscrypt_auth); +#endif kmem_cache_free(ceph_inode_cachep, ci); } @@ -645,6 +654,7 @@ void ceph_evict_inode(struct inode *inode) clear_inode(inode); ceph_fscache_unregister_inode_cookie(ci); + fscrypt_put_encryption_info(inode); __ceph_remove_caps(ci); @@ -935,6 +945,17 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); +#ifdef CONFIG_FS_ENCRYPTION + if (iinfo->fscrypt_auth_len && (inode->i_state & I_NEW)) { + kfree(ci->fscrypt_auth); + ci->fscrypt_auth_len = iinfo->fscrypt_auth_len; + ci->fscrypt_auth = iinfo->fscrypt_auth; + iinfo->fscrypt_auth = NULL; + iinfo->fscrypt_auth_len = 0; + inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED); + } +#endif + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = mode; @@ -2079,7 +2100,8 @@ static const struct inode_operations ceph_symlink_iops = { .listxattr = ceph_listxattr, }; -int __ceph_setattr(struct inode *inode, struct iattr *attr) +int __ceph_setattr(struct inode *inode, struct iattr *attr, + struct ceph_iattr *cia) { struct ceph_inode_info *ci = ceph_inode(inode); unsigned int ia_valid = attr->ia_valid; @@ -2119,6 +2141,43 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (cia && cia->fscrypt_auth) { + u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth); + + if (len > sizeof(*cia->fscrypt_auth)) { + err = -EINVAL; + spin_unlock(&ci->i_ceph_lock); + goto out; + } + + dout("setattr %llx:%llx fscrypt_auth len %u to %u)\n", + ceph_vinop(inode), ci->fscrypt_auth_len, len); + + /* It should never be re-set once set */ + WARN_ON_ONCE(ci->fscrypt_auth); + + if (issued & CEPH_CAP_AUTH_EXCL) { + dirtied |= CEPH_CAP_AUTH_EXCL; + kfree(ci->fscrypt_auth); + ci->fscrypt_auth = (u8 *)cia->fscrypt_auth; + ci->fscrypt_auth_len = len; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + ci->fscrypt_auth_len != len || + memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) { + req->r_fscrypt_auth = cia->fscrypt_auth; + mask |= CEPH_SETATTR_FSCRYPT_AUTH; + release |= CEPH_CAP_AUTH_SHARED; + } + cia->fscrypt_auth = NULL; + } +#else + if (cia && cia->fscrypt_auth) { + err = -EINVAL; + spin_unlock(&ci->i_ceph_lock); + goto out; + } +#endif /* CONFIG_FS_ENCRYPTION */ if (ia_valid & ATTR_UID) { dout("setattr %p uid %d -> %d\n", inode, @@ -2282,6 +2341,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) req->r_stamp = attr->ia_ctime; err = ceph_mdsc_do_request(mdsc, NULL, req); } +out: dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); @@ -2322,7 +2382,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size)) return -EDQUOT; - err = __ceph_setattr(inode, attr); + err = __ceph_setattr(inode, attr, NULL); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 77b5e7b2e5dd..c3927dab5a3b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -15,6 +15,7 @@ #include "super.h" #include "mds_client.h" +#include "crypto.h" #include #include @@ -184,8 +185,54 @@ static int parse_reply_info_in(void **p, void *end, info->rsnaps = 0; } + if (struct_v >= 5) { + u32 alen; + + ceph_decode_32_safe(p, end, alen, bad); + + while (alen--) { + u32 len; + + /* key */ + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_skip_n(p, end, len, bad); + /* value */ + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_skip_n(p, end, len, bad); + } + } + + /* fscrypt flag -- ignore */ + if (struct_v >= 6) + ceph_decode_skip_8(p, end, bad); + + info->fscrypt_auth = NULL; + info->fscrypt_auth_len = 0; + info->fscrypt_file = NULL; + info->fscrypt_file_len = 0; + if (struct_v >= 7) { + ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad); + if (info->fscrypt_auth_len) { + info->fscrypt_auth = kmalloc(info->fscrypt_auth_len, + GFP_KERNEL); + if (!info->fscrypt_auth) + return -ENOMEM; + ceph_decode_copy_safe(p, end, info->fscrypt_auth, + info->fscrypt_auth_len, bad); + } + ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad); + if (info->fscrypt_file_len) { + info->fscrypt_file = kmalloc(info->fscrypt_file_len, + GFP_KERNEL); + if (!info->fscrypt_file) + return -ENOMEM; + ceph_decode_copy_safe(p, end, info->fscrypt_file, + info->fscrypt_file_len, bad); + } + } *p = end; } else { + /* legacy (unversioned) struct */ if (features & CEPH_FEATURE_MDS_INLINE_DATA) { ceph_decode_64_safe(p, end, info->inline_version, bad); ceph_decode_32_safe(p, end, info->inline_len, bad); @@ -651,8 +698,21 @@ out_bad: static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { + int i; + + kfree(info->diri.fscrypt_auth); + kfree(info->diri.fscrypt_file); + kfree(info->targeti.fscrypt_auth); + kfree(info->targeti.fscrypt_file); if (!info->dir_entries) return; + + for (i = 0; i < info->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; + + kfree(rde->inode.fscrypt_auth); + kfree(rde->inode.fscrypt_file); + } free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } @@ -966,6 +1026,7 @@ void ceph_mdsc_release_request(struct kref *kref) put_cred(req->r_cred); if (req->r_pagelist) ceph_pagelist_release(req->r_pagelist); + kfree(req->r_fscrypt_auth); put_request_session(req); ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); WARN_ON_ONCE(!list_empty(&req->r_wait)); @@ -2543,8 +2604,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, return r; } -static void encode_timestamp_and_gids(void **p, - const struct ceph_mds_request *req) +static void encode_mclientrequest_tail(void **p, + const struct ceph_mds_request *req) { struct ceph_timespec ts; int i; @@ -2557,6 +2618,20 @@ static void encode_timestamp_and_gids(void **p, for (i = 0; i < req->r_cred->group_info->ngroups; i++) ceph_encode_64(p, from_kgid(&init_user_ns, req->r_cred->group_info->gid[i])); + + /* v5: altname (TODO: skip for now) */ + ceph_encode_32(p, 0); + + /* v6: fscrypt_auth and fscrypt_file */ + if (req->r_fscrypt_auth) { + u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth); + + ceph_encode_32(p, authlen); + ceph_encode_copy(p, req->r_fscrypt_auth, authlen); + } else { + ceph_encode_32(p, 0); + } + ceph_encode_32(p, 0); // fscrypt_file for now } /* @@ -2605,12 +2680,14 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, goto out_free1; } + /* head */ len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head); - len += pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + - sizeof(struct ceph_timespec); - len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups); - /* calculate (max) length for cap releases */ + /* filepaths */ + len += 2 * (1 + sizeof(u32) + sizeof(u64)); + len += pathlen1 + pathlen2; + + /* cap releases */ len += sizeof(struct ceph_mds_request_release) * (!!req->r_inode_drop + !!req->r_dentry_drop + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); @@ -2620,6 +2697,25 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, if (req->r_old_dentry_drop) len += pathlen2; + /* MClientRequest tail */ + + /* req->r_stamp */ + len += sizeof(struct ceph_timespec); + + /* gid list */ + len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups); + + /* alternate name */ + len += sizeof(u32); // TODO + + /* fscrypt_auth */ + len += sizeof(u32); // fscrypt_auth + if (req->r_fscrypt_auth) + len += ceph_fscrypt_auth_len(req->r_fscrypt_auth); + + /* fscrypt_file */ + len += sizeof(u32); + msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); @@ -2639,7 +2735,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, } else { struct ceph_mds_request_head *new_head = msg->front.iov_base; - msg->hdr.version = cpu_to_le16(4); + msg->hdr.version = cpu_to_le16(6); new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; p = msg->front.iov_base + sizeof(*new_head); @@ -2690,7 +2786,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, head->num_releases = cpu_to_le16(releases); - encode_timestamp_and_gids(&p, req); + encode_mclientrequest_tail(&p, req); if (WARN_ON_ONCE(p > end)) { ceph_msg_put(msg); @@ -2820,7 +2916,7 @@ static int __prepare_send_request(struct ceph_mds_session *session, rhead->num_releases = 0; p = msg->front.iov_base + req->r_request_release_offset; - encode_timestamp_and_gids(&p, req); + encode_mclientrequest_tail(&p, req); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3c1aa99c1876..a2e85fb5aab1 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -86,6 +86,10 @@ struct ceph_mds_reply_info_in { s32 dir_pin; struct ceph_timespec btime; struct ceph_timespec snap_btime; + u8 *fscrypt_auth; + u8 *fscrypt_file; + u32 fscrypt_auth_len; + u32 fscrypt_file_len; u64 rsnaps; u64 change_attr; }; @@ -278,6 +282,9 @@ struct ceph_mds_request { struct mutex r_fill_mutex; union ceph_mds_request_args r_args; + + struct ceph_fscrypt_auth *r_fscrypt_auth; + int r_fmode; /* file mode, if expecting cap */ int r_request_release_offset; const struct cred *r_cred; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index d6a1790f6923..c4ab2db85ef0 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -20,6 +20,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include #include @@ -1135,6 +1136,8 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc) s->s_time_max = U32_MAX; s->s_flags |= SB_NODIRATIME | SB_NOATIME; + ceph_fscrypt_set_ops(s); + ret = set_anon_super_fc(s, fc); if (ret != 0) fsc->sb = NULL; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index eb621bb276bd..3a39a9b3bc33 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -450,6 +450,13 @@ struct ceph_inode_info { struct work_struct i_work; unsigned long i_work_mask; + +#ifdef CONFIG_FS_ENCRYPTION + u32 fscrypt_auth_len; + u32 fscrypt_file_len; + u8 *fscrypt_auth; + u8 *fscrypt_file; +#endif }; struct ceph_netfs_request_data { @@ -1073,7 +1080,13 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) } extern int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); -extern int __ceph_setattr(struct inode *inode, struct iattr *attr); + +struct ceph_iattr { + struct ceph_fscrypt_auth *fscrypt_auth; +}; + +extern int __ceph_setattr(struct inode *inode, struct iattr *attr, + struct ceph_iattr *cia); extern int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct mnt_idmap *idmap, diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 49586ff26152..45f8ce61e103 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -359,14 +359,19 @@ enum { extern const char *ceph_mds_op_name(int op); - -#define CEPH_SETATTR_MODE 1 -#define CEPH_SETATTR_UID 2 -#define CEPH_SETATTR_GID 4 -#define CEPH_SETATTR_MTIME 8 -#define CEPH_SETATTR_ATIME 16 -#define CEPH_SETATTR_SIZE 32 -#define CEPH_SETATTR_CTIME 64 +#define CEPH_SETATTR_MODE (1 << 0) +#define CEPH_SETATTR_UID (1 << 1) +#define CEPH_SETATTR_GID (1 << 2) +#define CEPH_SETATTR_MTIME (1 << 3) +#define CEPH_SETATTR_ATIME (1 << 4) +#define CEPH_SETATTR_SIZE (1 << 5) +#define CEPH_SETATTR_CTIME (1 << 6) +#define CEPH_SETATTR_MTIME_NOW (1 << 7) +#define CEPH_SETATTR_ATIME_NOW (1 << 8) +#define CEPH_SETATTR_BTIME (1 << 9) +#define CEPH_SETATTR_KILL_SGUID (1 << 10) +#define CEPH_SETATTR_FSCRYPT_AUTH (1 << 11) +#define CEPH_SETATTR_FSCRYPT_FILE (1 << 12) /* * Ceph setxattr request flags. -- cgit v1.2.3 From 6b5717bd30ab7f35792d20b71211055bdb43e6de Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 8 Sep 2020 09:47:40 -0400 Subject: ceph: implement -o test_dummy_encryption mount option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the test_dummy_encryption mount option. This allows us to test the encrypted codepaths in ceph without having to manually set keys, etc. [ lhenriques: fix potential fsc->fsc_dummy_enc_policy memory leak in ceph_real_mount() ] Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Reviewed-and-tested-by: Luís Henriques Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/crypto.c | 56 ++++++++++++++++++++++++++++++++++++ fs/ceph/crypto.h | 28 ++++++++++++++++++ fs/ceph/inode.c | 10 +++++-- fs/ceph/super.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/ceph/super.h | 9 +++++- fs/ceph/xattr.c | 3 ++ 6 files changed, 186 insertions(+), 6 deletions(-) (limited to 'fs/ceph/super.c') diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index b17a6ee16cf0..b65e31c5c503 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -4,6 +4,7 @@ #include #include "super.h" +#include "mds_client.h" #include "crypto.h" static int ceph_crypt_get_context(struct inode *inode, void *ctx, size_t len) @@ -65,9 +66,15 @@ static bool ceph_crypt_empty_dir(struct inode *inode) return ci->i_rsubdirs + ci->i_rfiles == 1; } +static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb) +{ + return ceph_sb_to_client(sb)->fsc_dummy_enc_policy.policy; +} + static struct fscrypt_operations ceph_fscrypt_ops = { .get_context = ceph_crypt_get_context, .set_context = ceph_crypt_set_context, + .get_dummy_policy = ceph_get_dummy_policy, .empty_dir = ceph_crypt_empty_dir, }; @@ -75,3 +82,52 @@ void ceph_fscrypt_set_ops(struct super_block *sb) { fscrypt_set_ops(sb, &ceph_fscrypt_ops); } + +void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc) +{ + fscrypt_free_dummy_policy(&fsc->fsc_dummy_enc_policy); +} + +int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode, + struct ceph_acl_sec_ctx *as) +{ + int ret, ctxsize; + bool encrypted = false; + struct ceph_inode_info *ci = ceph_inode(inode); + + ret = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (ret) + return ret; + if (!encrypted) + return 0; + + as->fscrypt_auth = kzalloc(sizeof(*as->fscrypt_auth), GFP_KERNEL); + if (!as->fscrypt_auth) + return -ENOMEM; + + ctxsize = fscrypt_context_for_new_inode(as->fscrypt_auth->cfa_blob, + inode); + if (ctxsize < 0) + return ctxsize; + + as->fscrypt_auth->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION); + as->fscrypt_auth->cfa_blob_len = cpu_to_le32(ctxsize); + + WARN_ON_ONCE(ci->fscrypt_auth); + kfree(ci->fscrypt_auth); + ci->fscrypt_auth_len = ceph_fscrypt_auth_len(as->fscrypt_auth); + ci->fscrypt_auth = kmemdup(as->fscrypt_auth, ci->fscrypt_auth_len, + GFP_KERNEL); + if (!ci->fscrypt_auth) + return -ENOMEM; + + inode->i_flags |= S_ENCRYPTED; + + return 0; +} + +void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as) +{ + swap(req->r_fscrypt_auth, as->fscrypt_auth); +} diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h index 6dca674f79b8..642963f5442e 100644 --- a/fs/ceph/crypto.h +++ b/fs/ceph/crypto.h @@ -8,6 +8,10 @@ #include +struct ceph_fs_client; +struct ceph_acl_sec_ctx; +struct ceph_mds_request; + struct ceph_fscrypt_auth { __le32 cfa_version; __le32 cfa_blob_len; @@ -25,12 +29,36 @@ static inline u32 ceph_fscrypt_auth_len(struct ceph_fscrypt_auth *fa) #ifdef CONFIG_FS_ENCRYPTION void ceph_fscrypt_set_ops(struct super_block *sb); +void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc); + +int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode, + struct ceph_acl_sec_ctx *as); +void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as); + #else /* CONFIG_FS_ENCRYPTION */ static inline void ceph_fscrypt_set_ops(struct super_block *sb) { } +static inline void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc) +{ +} + +static inline int ceph_fscrypt_prepare_context(struct inode *dir, + struct inode *inode, + struct ceph_acl_sec_ctx *as) +{ + if (IS_ENCRYPTED(dir)) + return -EOPNOTSUPP; + return 0; +} + +static inline void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx) +{ +} #endif /* CONFIG_FS_ENCRYPTION */ #endif diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a3aa7870a6a2..50ff641865e0 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -83,12 +83,17 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, goto out_err; } + inode->i_state = 0; + inode->i_mode = *mode; + err = ceph_security_init_secctx(dentry, *mode, as_ctx); if (err < 0) goto out_err; - inode->i_state = 0; - inode->i_mode = *mode; + err = ceph_fscrypt_prepare_context(dir, inode, as_ctx); + if (err) + goto out_err; + return inode; out_err: iput(inode); @@ -102,6 +107,7 @@ void ceph_as_ctx_to_req(struct ceph_mds_request *req, req->r_pagelist = as_ctx->pagelist; as_ctx->pagelist = NULL; } + ceph_fscrypt_as_ctx_to_req(req, as_ctx); } /** diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c4ab2db85ef0..75dd1b6b3d01 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -47,6 +47,7 @@ static void ceph_put_super(struct super_block *s) struct ceph_fs_client *fsc = ceph_sb_to_client(s); dout("put_super\n"); + ceph_fscrypt_free_dummy_policy(fsc); ceph_mdsc_close_sessions(fsc->mdsc); } @@ -152,6 +153,7 @@ enum { Opt_recover_session, Opt_source, Opt_mon_addr, + Opt_test_dummy_encryption, /* string args above */ Opt_dirstat, Opt_rbytes, @@ -194,6 +196,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_string ("fsc", Opt_fscache), // fsc=... fsparam_flag_no ("ino32", Opt_ino32), fsparam_string ("mds_namespace", Opt_mds_namespace), + fsparam_string ("mon_addr", Opt_mon_addr), fsparam_flag_no ("poolperm", Opt_poolperm), fsparam_flag_no ("quotadf", Opt_quotadf), fsparam_u32 ("rasize", Opt_rasize), @@ -205,7 +208,8 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_u32 ("rsize", Opt_rsize), fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), - fsparam_string ("mon_addr", Opt_mon_addr), + fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), fsparam_flag_no ("pagecache", Opt_pagecache), @@ -585,6 +589,23 @@ static int ceph_parse_mount_param(struct fs_context *fc, else fsopt->flags |= CEPH_MOUNT_OPT_SPARSEREAD; break; + case Opt_test_dummy_encryption: +#ifdef CONFIG_FS_ENCRYPTION + fscrypt_free_dummy_policy(&fsopt->dummy_enc_policy); + ret = fscrypt_parse_test_dummy_encryption(param, + &fsopt->dummy_enc_policy); + if (ret == -EINVAL) { + warnfc(fc, "Value of option \"%s\" is unrecognized", + param->key); + } else if (ret == -EEXIST) { + warnfc(fc, "Conflicting test_dummy_encryption options"); + ret = -EINVAL; + } +#else + warnfc(fc, + "FS encryption not supported: test_dummy_encryption mount option ignored"); +#endif + break; default: BUG(); } @@ -605,6 +626,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) kfree(args->server_path); kfree(args->fscache_uniq); kfree(args->mon_addr); + fscrypt_free_dummy_policy(&args->dummy_enc_policy); kfree(args); } @@ -724,6 +746,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD) seq_puts(m, ",sparseread"); + fscrypt_show_test_dummy_encryption(m, ',', root->d_sb); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) @@ -1062,6 +1086,50 @@ out: return root; } +#ifdef CONFIG_FS_ENCRYPTION +static int ceph_apply_test_dummy_encryption(struct super_block *sb, + struct fs_context *fc, + struct ceph_mount_options *fsopt) +{ + struct ceph_fs_client *fsc = sb->s_fs_info; + + if (!fscrypt_is_dummy_policy_set(&fsopt->dummy_enc_policy)) + return 0; + + /* No changing encryption context on remount. */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + !fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) { + if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy, + &fsc->fsc_dummy_enc_policy)) + return 0; + errorfc(fc, "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } + + /* Also make sure fsopt doesn't contain a conflicting value. */ + if (fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) { + if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy, + &fsc->fsc_dummy_enc_policy)) + return 0; + errorfc(fc, "Conflicting test_dummy_encryption options"); + return -EINVAL; + } + + fsc->fsc_dummy_enc_policy = fsopt->dummy_enc_policy; + memset(&fsopt->dummy_enc_policy, 0, sizeof(fsopt->dummy_enc_policy)); + + warnfc(fc, "test_dummy_encryption mode enabled"); + return 0; +} +#else +static int ceph_apply_test_dummy_encryption(struct super_block *sb, + struct fs_context *fc, + struct ceph_mount_options *fsopt) +{ + return 0; +} +#endif + /* * mount: join the ceph cluster, and open root directory. */ @@ -1090,6 +1158,11 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto out; } + err = ceph_apply_test_dummy_encryption(fsc->sb, fc, + fsc->mount_options); + if (err) + goto out; + dout("mount opening path '%s'\n", path); ceph_fs_debugfs_init(fsc); @@ -1111,6 +1184,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, out: mutex_unlock(&fsc->client->mount_mutex); + ceph_fscrypt_free_dummy_policy(fsc); return ERR_PTR(err); } @@ -1299,9 +1373,15 @@ static void ceph_free_fc(struct fs_context *fc) static int ceph_reconfigure_fc(struct fs_context *fc) { + int err; struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct ceph_mount_options *fsopt = pctx->opts; - struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb); + struct super_block *sb = fc->root->d_sb; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + err = ceph_apply_test_dummy_encryption(sb, fc, fsopt); + if (err) + return err; if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) ceph_set_mount_opt(fsc, ASYNC_DIROPS); @@ -1320,7 +1400,7 @@ static int ceph_reconfigure_fc(struct fs_context *fc) pr_notice("ceph: monitor addresses recorded, but not used for reconnection"); } - sync_filesystem(fc->root->d_sb); + sync_filesystem(sb); return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3a39a9b3bc33..11483855897e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -22,6 +22,7 @@ #include #include +#include "crypto.h" /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ @@ -99,6 +100,7 @@ struct ceph_mount_options { char *server_path; /* default NULL (means "/") */ char *fscache_uniq; /* default NULL */ char *mon_addr; + struct fscrypt_dummy_policy dummy_enc_policy; }; /* mount state */ @@ -155,9 +157,11 @@ struct ceph_fs_client { #ifdef CONFIG_CEPH_FSCACHE struct fscache_volume *fscache; #endif +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_dummy_policy fsc_dummy_enc_policy; +#endif }; - /* * File i/o capability. This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read @@ -1120,6 +1124,9 @@ struct ceph_acl_sec_ctx { #ifdef CONFIG_CEPH_FS_SECURITY_LABEL void *sec_ctx; u32 sec_ctxlen; +#endif +#ifdef CONFIG_FS_ENCRYPTION + struct ceph_fscrypt_auth *fscrypt_auth; #endif struct ceph_pagelist *pagelist; }; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 806183959c47..ddc9090a1465 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1407,6 +1407,9 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) #endif #ifdef CONFIG_CEPH_FS_SECURITY_LABEL security_release_secctx(as_ctx->sec_ctx, as_ctx->sec_ctxlen); +#endif +#ifdef CONFIG_FS_ENCRYPTION + kfree(as_ctx->fscrypt_auth); #endif if (as_ctx->pagelist) ceph_pagelist_release(as_ctx->pagelist); -- cgit v1.2.3 From e3dfcab2080dc1f9a4b09cc1327361bc2845bfcd Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 21 Dec 2022 14:13:51 +0800 Subject: ceph: drop messages from MDS when unmounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When unmounting all the dirty buffers will be flushed and after the last osd request is finished the last reference of the i_count will be released. Then it will flush the dirty cap/snap to MDSs, and the unmounting won't wait the possible acks, which will ihold the inodes when updating the metadata locally but makes no sense any more, of this. This will make the evict_inodes() to skip these inodes. If encrypt is enabled the kernel generate a warning when removing the encrypt keys when the skipped inodes still hold the keyring: WARNING: CPU: 4 PID: 168846 at fs/crypto/keyring.c:242 fscrypt_destroy_keyring+0x7e/0xd0 CPU: 4 PID: 168846 Comm: umount Tainted: G S 6.1.0-rc5-ceph-g72ead199864c #1 Hardware name: Supermicro SYS-5018R-WR/X10SRW-F, BIOS 2.0 12/17/2015 RIP: 0010:fscrypt_destroy_keyring+0x7e/0xd0 RSP: 0018:ffffc9000b277e28 EFLAGS: 00010202 RAX: 0000000000000002 RBX: ffff88810d52ac00 RCX: ffff88810b56aa00 RDX: 0000000080000000 RSI: ffffffff822f3a09 RDI: ffff888108f59000 RBP: ffff8881d394fb88 R08: 0000000000000028 R09: 0000000000000000 R10: 0000000000000001 R11: 11ff4fe6834fcd91 R12: ffff8881d394fc40 R13: ffff888108f59000 R14: ffff8881d394f800 R15: 0000000000000000 FS: 00007fd83f6f1080(0000) GS:ffff88885fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f918d417000 CR3: 000000017f89a005 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: generic_shutdown_super+0x47/0x120 kill_anon_super+0x14/0x30 ceph_kill_sb+0x36/0x90 [ceph] deactivate_locked_super+0x29/0x60 cleanup_mnt+0xb8/0x140 task_work_run+0x67/0xb0 exit_to_user_mode_prepare+0x23d/0x240 syscall_exit_to_user_mode+0x25/0x60 do_syscall_64+0x40/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fd83dc39e9b Later the kernel will crash when iput() the inodes and dereferencing the "sb->s_master_keys", which has been released by the generic_shutdown_super(). Link: https://tracker.ceph.com/issues/59162 Signed-off-by: Xiubo Li Reviewed-and-tested-by: Luís Henriques Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 6 ++++- fs/ceph/mds_client.c | 12 +++++++-- fs/ceph/mds_client.h | 11 +++++--- fs/ceph/quota.c | 14 +++++----- fs/ceph/snap.c | 10 ++++--- fs/ceph/super.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++---- fs/ceph/super.h | 3 +++ 7 files changed, 109 insertions(+), 22 deletions(-) (limited to 'fs/ceph/super.c') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5c2b28ac9410..54041d9c1e25 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4247,6 +4247,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout("handle_caps from mds%d\n", session->s_mds); + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + /* decode */ end = msg->front.iov_base + msg->front.iov_len; if (msg->front.iov_len < sizeof(*h)) @@ -4348,7 +4351,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, vino.snap, inode); mutex_lock(&session->s_mutex); - inc_session_sequence(session); dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); @@ -4457,6 +4459,8 @@ done: done_unlocked: iput(inode); out: + ceph_dec_mds_stopping_blocker(mdsc); + ceph_put_string(extra_info.pool_ns); /* Defer closing the sessions after s_mutex lock being released */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c257d75c5757..04a881343e43 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4889,6 +4889,9 @@ static void handle_lease(struct ceph_mds_client *mdsc, dout("handle_lease from mds%d\n", mds); + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + /* decode */ if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) goto bad; @@ -4907,8 +4910,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, dname.len, dname.name); mutex_lock(&session->s_mutex); - inc_session_sequence(session); - if (!inode) { dout("handle_lease no inode %llx\n", vino.ino); goto release; @@ -4970,9 +4971,13 @@ release: out: mutex_unlock(&session->s_mutex); iput(inode); + + ceph_dec_mds_stopping_blocker(mdsc); return; bad: + ceph_dec_mds_stopping_blocker(mdsc); + pr_err("corrupt lease message\n"); ceph_msg_dump(msg); } @@ -5168,6 +5173,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) } init_completion(&mdsc->safe_umount_waiters); + spin_lock_init(&mdsc->stopping_lock); + atomic_set(&mdsc->stopping_blockers, 0); + init_completion(&mdsc->stopping_waiter); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->quotarealms_inodes = RB_ROOT; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 0477388a0d1c..1fa0f78b7b79 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -399,8 +399,9 @@ struct cap_wait { }; enum { - CEPH_MDSC_STOPPING_BEGIN = 1, - CEPH_MDSC_STOPPING_FLUSHED = 2, + CEPH_MDSC_STOPPING_BEGIN = 1, + CEPH_MDSC_STOPPING_FLUSHING = 2, + CEPH_MDSC_STOPPING_FLUSHED = 3, }; /* @@ -419,7 +420,11 @@ struct ceph_mds_client { struct ceph_mds_session **sessions; /* NULL for mds if no session */ atomic_t num_sessions; int max_sessions; /* len of sessions array */ - int stopping; /* true if shutting down */ + + spinlock_t stopping_lock; /* protect snap_empty */ + int stopping; /* the stage of shutting down */ + atomic_t stopping_blockers; + struct completion stopping_waiter; atomic64_t quotarealms_count; /* # realms with quota */ /* diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 64592adfe48f..f7fcf7f08ec6 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -47,25 +47,23 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, struct inode *inode; struct ceph_inode_info *ci; + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + if (msg->front.iov_len < sizeof(*h)) { pr_err("%s corrupt message mds%d len %d\n", __func__, session->s_mds, (int)msg->front.iov_len); ceph_msg_dump(msg); - return; + goto out; } - /* increment msg sequence number */ - mutex_lock(&session->s_mutex); - inc_session_sequence(session); - mutex_unlock(&session->s_mutex); - /* lookup inode */ vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); if (!inode) { pr_warn("Failed to find inode %llu\n", vino.ino); - return; + goto out; } ci = ceph_inode(inode); @@ -78,6 +76,8 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, spin_unlock(&ci->i_ceph_lock); iput(inode); +out: + ceph_dec_mds_stopping_blocker(mdsc); } static struct ceph_quotarealm_inode * diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 343d738448dc..7ddc6bad77ef 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1015,6 +1015,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, int locked_rwsem = 0; bool close_sessions = false; + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + /* decode */ if (msg->front.iov_len < sizeof(*h)) goto bad; @@ -1030,10 +1033,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, dout("%s from mds%d op %s split %llx tracelen %d\n", __func__, mds, ceph_snap_op_name(op), split, trace_len); - mutex_lock(&session->s_mutex); - inc_session_sequence(session); - mutex_unlock(&session->s_mutex); - down_write(&mdsc->snap_rwsem); locked_rwsem = 1; @@ -1151,6 +1150,7 @@ skip_inode: up_write(&mdsc->snap_rwsem); flush_snaps(mdsc); + ceph_dec_mds_stopping_blocker(mdsc); return; bad: @@ -1160,6 +1160,8 @@ out: if (locked_rwsem) up_write(&mdsc->snap_rwsem); + ceph_dec_mds_stopping_blocker(mdsc); + if (close_sessions) ceph_mdsc_close_sessions(mdsc); return; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 75dd1b6b3d01..1c14d87ed871 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1462,25 +1462,90 @@ nomem: return -ENOMEM; } +/* + * Return true if it successfully increases the blocker counter, + * or false if the mdsc is in stopping and flushed state. + */ +static bool __inc_stopping_blocker(struct ceph_mds_client *mdsc) +{ + spin_lock(&mdsc->stopping_lock); + if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) { + spin_unlock(&mdsc->stopping_lock); + return false; + } + atomic_inc(&mdsc->stopping_blockers); + spin_unlock(&mdsc->stopping_lock); + return true; +} + +static void __dec_stopping_blocker(struct ceph_mds_client *mdsc) +{ + spin_lock(&mdsc->stopping_lock); + if (!atomic_dec_return(&mdsc->stopping_blockers) && + mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) + complete_all(&mdsc->stopping_waiter); + spin_unlock(&mdsc->stopping_lock); +} + +/* For metadata IO requests */ +bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + mutex_lock(&session->s_mutex); + inc_session_sequence(session); + mutex_unlock(&session->s_mutex); + + return __inc_stopping_blocker(mdsc); +} + +void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc) +{ + __dec_stopping_blocker(mdsc); +} + static void ceph_kill_sb(struct super_block *s) { struct ceph_fs_client *fsc = ceph_sb_to_client(s); + struct ceph_mds_client *mdsc = fsc->mdsc; + bool wait; dout("kill_sb %p\n", s); - ceph_mdsc_pre_umount(fsc->mdsc); + ceph_mdsc_pre_umount(mdsc); flush_fs_workqueues(fsc); /* * Though the kill_anon_super() will finally trigger the - * sync_filesystem() anyway, we still need to do it here - * and then bump the stage of shutdown to stop the work - * queue as earlier as possible. + * sync_filesystem() anyway, we still need to do it here and + * then bump the stage of shutdown. This will allow us to + * drop any further message, which will increase the inodes' + * i_count reference counters but makes no sense any more, + * from MDSs. + * + * Without this when evicting the inodes it may fail in the + * kill_anon_super(), which will trigger a warning when + * destroying the fscrypt keyring and then possibly trigger + * a further crash in ceph module when the iput() tries to + * evict the inodes later. */ sync_filesystem(s); - fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; + spin_lock(&mdsc->stopping_lock); + mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING; + wait = !!atomic_read(&mdsc->stopping_blockers); + spin_unlock(&mdsc->stopping_lock); + + if (wait && atomic_read(&mdsc->stopping_blockers)) { + long timeleft = wait_for_completion_killable_timeout( + &mdsc->stopping_waiter, + fsc->client->options->mount_timeout); + if (!timeleft) /* timed out */ + pr_warn("umount timed out, %ld\n", timeleft); + else if (timeleft < 0) /* killed */ + pr_warn("umount was killed, %ld\n", timeleft); + } + mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; kill_anon_super(s); fsc->client->extra_mon_dispatch = NULL; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b5e54c8f010b..d8eb35d73a23 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1413,4 +1413,7 @@ extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf); extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); +bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc); #endif /* _FS_CEPH_SUPER_H */ -- cgit v1.2.3 From 1464de9f813e35559ff049f1f1f20f1e2a31d6b8 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 9 May 2023 16:38:49 +0800 Subject: ceph: wait for OSD requests' callbacks to finish when unmounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sync_filesystem() will flush all the dirty buffer and submit the osd reqs to the osdc and then is blocked to wait for all the reqs to finish. But the when the reqs' replies come, the reqs will be removed from osdc just before the req->r_callback()s are called. Which means the sync_filesystem() will be woke up by leaving the req->r_callback()s are still running. This will be buggy when the waiter require the req->r_callback()s to release some resources before continuing. So we need to make sure the req->r_callback()s are called before removing the reqs from the osdc. WARNING: CPU: 4 PID: 168846 at fs/crypto/keyring.c:242 fscrypt_destroy_keyring+0x7e/0xd0 CPU: 4 PID: 168846 Comm: umount Tainted: G S 6.1.0-rc5-ceph-g72ead199864c #1 Hardware name: Supermicro SYS-5018R-WR/X10SRW-F, BIOS 2.0 12/17/2015 RIP: 0010:fscrypt_destroy_keyring+0x7e/0xd0 RSP: 0018:ffffc9000b277e28 EFLAGS: 00010202 RAX: 0000000000000002 RBX: ffff88810d52ac00 RCX: ffff88810b56aa00 RDX: 0000000080000000 RSI: ffffffff822f3a09 RDI: ffff888108f59000 RBP: ffff8881d394fb88 R08: 0000000000000028 R09: 0000000000000000 R10: 0000000000000001 R11: 11ff4fe6834fcd91 R12: ffff8881d394fc40 R13: ffff888108f59000 R14: ffff8881d394f800 R15: 0000000000000000 FS: 00007fd83f6f1080(0000) GS:ffff88885fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f918d417000 CR3: 000000017f89a005 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: generic_shutdown_super+0x47/0x120 kill_anon_super+0x14/0x30 ceph_kill_sb+0x36/0x90 [ceph] deactivate_locked_super+0x29/0x60 cleanup_mnt+0xb8/0x140 task_work_run+0x67/0xb0 exit_to_user_mode_prepare+0x23d/0x240 syscall_exit_to_user_mode+0x25/0x60 do_syscall_64+0x40/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fd83dc39e9b We need to increase the blocker counter to make sure all the osd requests' callbacks have been finished just before calling the kill_anon_super() when unmounting. Link: https://tracker.ceph.com/issues/58126 Signed-off-by: Xiubo Li Reviewed-and-tested-by: Luís Henriques Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 10 ++++++++++ fs/ceph/super.c | 11 +++++++++++ fs/ceph/super.h | 2 ++ 3 files changed, 23 insertions(+) (limited to 'fs/ceph/super.c') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 95ff84930dcf..f4863078f7fe 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -285,6 +285,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) } netfs_subreq_terminated(subreq, err, false); iput(req->r_inode); + ceph_dec_osd_stopping_blocker(fsc->mdsc); } static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) @@ -414,6 +415,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) } else { osd_req_op_extent_osd_iter(req, 0, &iter); } + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + err = -EIO; + goto out; + } req->r_callback = finish_netfs_read; req->r_priv = subreq; req->r_inode = inode; @@ -910,6 +915,7 @@ static void writepages_finish(struct ceph_osd_request *req) else kfree(osd_data->pages); ceph_osdc_put_request(req); + ceph_dec_osd_stopping_blocker(fsc->mdsc); } /* @@ -1218,6 +1224,10 @@ new_request: BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) + thp_size(pages[locked_pages - 1]) - offset); + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + rc = -EIO; + goto release_folios; + } req->r_callback = writepages_finish; req->r_inode = inode; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 1c14d87ed871..2d7f5a8d4a92 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1503,6 +1503,17 @@ void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc) __dec_stopping_blocker(mdsc); } +/* For data IO requests */ +bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc) +{ + return __inc_stopping_blocker(mdsc); +} + +void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc) +{ + __dec_stopping_blocker(mdsc); +} + static void ceph_kill_sb(struct super_block *s) { struct ceph_fs_client *fsc = ceph_sb_to_client(s); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index d8eb35d73a23..51c7f2b14f6f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1416,4 +1416,6 @@ extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc); +bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc); +void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc); #endif /* _FS_CEPH_SUPER_H */ -- cgit v1.2.3