From 2d332d5bc424404911540006a8bb450fbb96b178 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 27 Jul 2020 10:16:09 -0400 Subject: ceph: fscrypt_auth handling for ceph MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most fscrypt-enabled filesystems store the crypto context in an xattr, but that's problematic for ceph as xattrs are governed by the XATTR cap, but we really want the crypto context as part of the AUTH cap. Because of this, the MDS has added two new inode metadata fields: fscrypt_auth and fscrypt_file. The former is used to hold the crypto context, and the latter is used to track the real file size. Parse new fscrypt_auth and fscrypt_file fields in inode traces. For now, we don't use fscrypt_file, but fscrypt_auth is used to hold the fscrypt context. Allow the client to use a setattr request for setting the fscrypt_auth field. Since this is not a standard setattr request from the VFS, we add a new field to __ceph_setattr that carries ceph-specific inode attrs. Have the set_context op do a setattr that sets the fscrypt_auth value, and get_context just return the contents of that field (since it should always be available). 
Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Reviewed-and-tested-by: Luís Henriques Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 13 deletions(-) (limited to 'fs/ceph/caps.c') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index e2bb0d0072da..1c62ef339bc6 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -14,6 +14,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include #include @@ -1216,15 +1217,12 @@ struct cap_msg_args { umode_t mode; bool inline_data; bool wake; + u32 fscrypt_auth_len; + u32 fscrypt_file_len; + u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context + u8 fscrypt_file[sizeof(u64)]; // for size }; -/* - * cap struct size + flock buffer size + inline version + inline data size + - * osd_epoch_barrier + oldest_flush_tid - */ -#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \ - 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4) - /* Marshal up the cap msg to the MDS */ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) { @@ -1240,7 +1238,7 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) arg->size, arg->max_size, arg->xattr_version, arg->xattr_buf ? 
(int)arg->xattr_buf->vec.iov_len : 0); - msg->hdr.version = cpu_to_le16(10); + msg->hdr.version = cpu_to_le16(12); msg->hdr.tid = cpu_to_le64(arg->flush_tid); fc = msg->front.iov_base; @@ -1311,6 +1309,21 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) /* Advisory flags (version 10) */ ceph_encode_32(&p, arg->flags); + + /* dirstats (version 11) - these are r/o on the client */ + ceph_encode_64(&p, 0); + ceph_encode_64(&p, 0); + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + /* fscrypt_auth and fscrypt_file (version 12) */ + ceph_encode_32(&p, arg->fscrypt_auth_len); + ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len); + ceph_encode_32(&p, arg->fscrypt_file_len); + ceph_encode_copy(&p, arg->fscrypt_file, arg->fscrypt_file_len); +#else /* CONFIG_FS_ENCRYPTION */ + ceph_encode_32(&p, 0); + ceph_encode_32(&p, 0); +#endif /* CONFIG_FS_ENCRYPTION */ } /* @@ -1432,7 +1445,37 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, } } arg->flags = flags; +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (ci->fscrypt_auth_len && + WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) { + /* Don't set this if it's too big */ + arg->fscrypt_auth_len = 0; + } else { + arg->fscrypt_auth_len = ci->fscrypt_auth_len; + memcpy(arg->fscrypt_auth, ci->fscrypt_auth, + min_t(size_t, ci->fscrypt_auth_len, + sizeof(arg->fscrypt_auth))); + } + /* FIXME: use this to track "real" size */ + arg->fscrypt_file_len = 0; +#endif /* CONFIG_FS_ENCRYPTION */ +} + +#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4) + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static inline int cap_msg_size(struct cap_msg_args *arg) +{ + return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len + + arg->fscrypt_file_len; } +#else +static inline int cap_msg_size(struct cap_msg_args *arg) +{ + return CAP_MSG_FIXED_FIELDS; +} +#endif /* CONFIG_FS_ENCRYPTION */ /* * Send a cap msg on the given 
inode. @@ -1444,7 +1487,8 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) struct ceph_msg *msg; struct inode *inode = &ci->netfs.inode; - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS, + false); if (!msg) { pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n", ceph_vinop(inode), ceph_cap_string(arg->dirty), @@ -1470,10 +1514,6 @@ static inline int __send_flush_snap(struct inode *inode, struct cap_msg_args arg; struct ceph_msg *msg; - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); - if (!msg) - return -ENOMEM; - arg.session = session; arg.ino = ceph_vino(inode).ino; arg.cid = 0; @@ -1511,6 +1551,18 @@ static inline int __send_flush_snap(struct inode *inode, arg.flags = 0; arg.wake = false; + /* + * No fscrypt_auth changes from a capsnap. It will need + * to update fscrypt_file on size changes (TODO). + */ + arg.fscrypt_auth_len = 0; + arg.fscrypt_file_len = 0; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg), + GFP_NOFS, false); + if (!msg) + return -ENOMEM; + encode_cap_msg(msg, &arg); ceph_con_send(&arg.session->s_con, msg); return 0; -- cgit v1.2.3 From 3fd945a79e147ee10f84213976889b29049c3519 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 7 Aug 2020 09:28:31 -0400 Subject: ceph: encode encrypted name in ceph_mdsc_build_path and dentry release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow ceph_mdsc_build_path to encrypt and base64 encode the filename when the parent is encrypted and we're sending the path to the MDS. In a similar fashion, encode encrypted dentry names if including a dentry release in a request. 
In most cases, we just encrypt the filenames and base64 encode them, but when the name is longer than CEPH_NOHASH_NAME_MAX, we use a similar scheme to fscrypt proper, and hash the remaining bits with sha256. When doing this, we then send along the full crypttext of the name in the new alternate_name field of the MClientRequest. The MDS can then send that along in readdir responses and traces. [ idryomov: drop duplicate include reported by Abaci Robot ] Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Reviewed-and-tested-by: Luís Henriques Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 32 ++++++++++++++-- fs/ceph/crypto.c | 53 +++++++++++++++++++++++++++ fs/ceph/crypto.h | 9 +++++ fs/ceph/mds_client.c | 101 +++++++++++++++++++++++++++++++++++++++++---------- fs/ceph/mds_client.h | 2 +- 5 files changed, 172 insertions(+), 25 deletions(-) (limited to 'fs/ceph/caps.c') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 1c62ef339bc6..4738be54dc73 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4663,6 +4663,18 @@ int ceph_encode_inode_release(void **p, struct inode *inode, return ret; } +/** + * ceph_encode_dentry_release - encode a dentry release into an outgoing request + * @p: outgoing request buffer + * @dentry: dentry to release + * @dir: dir to release it from + * @mds: mds that we're speaking to + * @drop: caps being dropped + * @unless: unless we have these caps + * + * Encode a dentry release into an outgoing request buffer. Returns 1 if the + * thing was released, or a negative error code otherwise. 
+ */ int ceph_encode_dentry_release(void **p, struct dentry *dentry, struct inode *dir, int mds, int drop, int unless) @@ -4695,13 +4707,25 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, if (ret && di->lease_session && di->lease_session->s_mds == mds) { dout("encode_dentry_release %p mds%d seq %d\n", dentry, mds, (int)di->lease_seq); - rel->dname_len = cpu_to_le32(dentry->d_name.len); - memcpy(*p, dentry->d_name.name, dentry->d_name.len); - *p += dentry->d_name.len; rel->dname_seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); + spin_unlock(&dentry->d_lock); + if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) { + int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p); + + if (ret2 < 0) + return ret2; + + rel->dname_len = cpu_to_le32(ret2); + *p += ret2; + } else { + rel->dname_len = cpu_to_le32(dentry->d_name.len); + memcpy(*p, dentry->d_name.name, dentry->d_name.len); + *p += dentry->d_name.len; + } + } else { + spin_unlock(&dentry->d_lock); } - spin_unlock(&dentry->d_lock); return ret; } diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index 0bb4d8e9f3b0..6dc723e8d395 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -191,3 +191,56 @@ void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, { swap(req->r_fscrypt_auth, as->fscrypt_auth); } + +int ceph_encode_encrypted_fname(const struct inode *parent, + struct dentry *dentry, char *buf) +{ + u32 len; + int elen; + int ret; + u8 *cryptbuf; + + WARN_ON_ONCE(!fscrypt_has_encryption_key(parent)); + + /* + * Convert cleartext d_name to ciphertext. If result is longer than + * CEPH_NOHASH_NAME_MAX, sha256 the remaining bytes + * + * See: fscrypt_setup_filename + */ + if (!fscrypt_fname_encrypted_size(parent, dentry->d_name.len, NAME_MAX, + &len)) + return -ENAMETOOLONG; + + /* Allocate a buffer appropriate to hold the result */ + cryptbuf = kmalloc(len > CEPH_NOHASH_NAME_MAX ? 
NAME_MAX : len, + GFP_KERNEL); + if (!cryptbuf) + return -ENOMEM; + + ret = fscrypt_fname_encrypt(parent, &dentry->d_name, cryptbuf, len); + if (ret) { + kfree(cryptbuf); + return ret; + } + + /* hash the end if the name is long enough */ + if (len > CEPH_NOHASH_NAME_MAX) { + u8 hash[SHA256_DIGEST_SIZE]; + u8 *extra = cryptbuf + CEPH_NOHASH_NAME_MAX; + + /* + * hash the extra bytes and overwrite crypttext beyond that + * point with it + */ + sha256(extra, len - CEPH_NOHASH_NAME_MAX, hash); + memcpy(extra, hash, SHA256_DIGEST_SIZE); + len = CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE; + } + + /* base64 encode the encrypted name */ + elen = ceph_base64_encode(cryptbuf, len, buf); + kfree(cryptbuf); + dout("base64-encoded ciphertext name = %.*s\n", elen, buf); + return elen; +} diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h index 44eb3f9f8287..176731f64997 100644 --- a/fs/ceph/crypto.h +++ b/fs/ceph/crypto.h @@ -6,6 +6,7 @@ #ifndef _CEPH_CRYPTO_H #define _CEPH_CRYPTO_H +#include #include struct ceph_fs_client; @@ -67,6 +68,8 @@ int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode, struct ceph_acl_sec_ctx *as); void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, struct ceph_acl_sec_ctx *as); +int ceph_encode_encrypted_fname(const struct inode *parent, + struct dentry *dentry, char *buf); #else /* CONFIG_FS_ENCRYPTION */ @@ -91,6 +94,12 @@ static inline void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, struct ceph_acl_sec_ctx *as_ctx) { } + +static inline int ceph_encode_encrypted_fname(const struct inode *parent, + struct dentry *dentry, char *buf) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_FS_ENCRYPTION */ #endif diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c3927dab5a3b..882c01a64498 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2435,18 +2435,29 @@ static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) return mdsc->oldest_tid; } -/* - * Build a dentry's path. 
Allocate on heap; caller must kfree. Based - * on build_path_from_dentry in fs/cifs/dir.c. +/** + * ceph_mdsc_build_path - build a path string to a given dentry + * @dentry: dentry to which path should be built + * @plen: returned length of string + * @pbase: returned base inode number + * @for_wire: is this path going to be sent to the MDS? + * + * Build a string that represents the path to the dentry. This is mostly called + * for two different purposes: * - * If @stop_on_nosnap, generate path relative to the first non-snapped - * inode. + * 1) we need to build a path string to send to the MDS (for_wire == true) + * 2) we need a path string for local presentation (e.g. debugfs) + * (for_wire == false) + * + * The path is built in reverse, starting with the dentry. Walk back up toward + * the root, building the path until the first non-snapped inode is reached + * (for_wire) or the root inode is reached (!for_wire). * * Encode hidden .snap dirs as a double /, i.e. * foo/.snap/bar -> foo//bar */ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase, - int stop_on_nosnap) + int for_wire) { struct dentry *cur; struct inode *inode; @@ -2468,30 +2479,67 @@ retry: seq = read_seqbegin(&rename_lock); cur = dget(dentry); for (;;) { - struct dentry *temp; + struct dentry *parent; spin_lock(&cur->d_lock); inode = d_inode(cur); if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { dout("build_path path+%d: %p SNAPDIR\n", pos, cur); - } else if (stop_on_nosnap && inode && dentry != cur && + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); + } else if (for_wire && inode && dentry != cur && ceph_snap(inode) == CEPH_NOSNAP) { spin_unlock(&cur->d_lock); pos++; /* get rid of any prepended '/' */ break; - } else { + } else if (!for_wire || !IS_ENCRYPTED(d_inode(cur->d_parent))) { pos -= cur->d_name.len; if (pos < 0) { spin_unlock(&cur->d_lock); break; } memcpy(path + pos, cur->d_name.name, cur->d_name.len); + spin_unlock(&cur->d_lock); + parent = 
dget_parent(cur); + } else { + int len, ret; + char buf[NAME_MAX]; + + /* + * Proactively copy name into buf, in case we need to + * present it as-is. + */ + memcpy(buf, cur->d_name.name, cur->d_name.len); + len = cur->d_name.len; + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); + + ret = __fscrypt_prepare_readdir(d_inode(parent)); + if (ret < 0) { + dput(parent); + dput(cur); + return ERR_PTR(ret); + } + + if (fscrypt_has_encryption_key(d_inode(parent))) { + len = ceph_encode_encrypted_fname(d_inode(parent), + cur, buf); + if (len < 0) { + dput(parent); + dput(cur); + return ERR_PTR(len); + } + } + pos -= len; + if (pos < 0) { + dput(parent); + break; + } + memcpy(path + pos, buf, len); } - temp = cur; - spin_unlock(&temp->d_lock); - cur = dget_parent(temp); - dput(temp); + dput(cur); + cur = parent; /* Are we at the root? */ if (IS_ROOT(cur)) @@ -2515,8 +2563,8 @@ retry: * A rename didn't occur, but somehow we didn't end up where * we thought we would. Throw a warning and try again. */ - pr_warn("build_path did not end path lookup where " - "expected, pos is %d\n", pos); + pr_warn("build_path did not end path lookup where expected (pos = %d)\n", + pos); goto retry; } @@ -2536,7 +2584,8 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, rcu_read_lock(); if (!dir) dir = d_inode_rcu(dentry->d_parent); - if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP) { + if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP && + !IS_ENCRYPTED(dir)) { *pino = ceph_ino(dir); rcu_read_unlock(); *ppath = dentry->d_name.name; @@ -2765,15 +2814,23 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, req->r_inode ? 
req->r_inode : d_inode(req->r_dentry), mds, req->r_inode_drop, req->r_inode_unless, req->r_op == CEPH_MDS_OP_READDIR); - if (req->r_dentry_drop) - releases += ceph_encode_dentry_release(&p, req->r_dentry, + if (req->r_dentry_drop) { + ret = ceph_encode_dentry_release(&p, req->r_dentry, req->r_parent, mds, req->r_dentry_drop, req->r_dentry_unless); - if (req->r_old_dentry_drop) - releases += ceph_encode_dentry_release(&p, req->r_old_dentry, + if (ret < 0) + goto out_err; + releases += ret; + } + if (req->r_old_dentry_drop) { + ret = ceph_encode_dentry_release(&p, req->r_old_dentry, req->r_old_dentry_dir, mds, req->r_old_dentry_drop, req->r_old_dentry_unless); + if (ret < 0) + goto out_err; + releases += ret; + } if (req->r_old_inode_drop) releases += ceph_encode_inode_release(&p, d_inode(req->r_old_dentry), @@ -2815,6 +2872,10 @@ out_free1: ceph_mdsc_free_path((char *)path1, pathlen1); out: return msg; +out_err: + ceph_msg_put(msg); + msg = ERR_PTR(ret); + goto out_free2; } /* diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index a2e85fb5aab1..1105baa97d49 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -565,7 +565,7 @@ static inline void ceph_mdsc_free_path(char *path, int len) } extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, - int stop_on_nosnap); + int for_wire); extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, -- cgit v1.2.3 From 16be62fc8a53482529201b4be6bbcd0de3a058cb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 25 Aug 2022 09:31:06 -0400 Subject: ceph: size handling in MClientRequest, cap updates and inode traces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For encrypted inodes, transmit a rounded-up size to the MDS as the normal file size and send the real inode size in fscrypt_file field. Also, fix up creates and truncates to also transmit fscrypt_file. 
When we get an inode trace from the MDS, grab the fscrypt_file field if the inode is encrypted, and use it to populate the i_size field instead of the regular inode size field. Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Reviewed-and-tested-by: Luís Henriques Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 44 ++++++++++++++++++++++++++------------------ fs/ceph/dir.c | 3 +++ fs/ceph/file.c | 1 + fs/ceph/inode.c | 33 ++++++++++++++++++++++++++++++--- fs/ceph/mds_client.c | 9 ++++++++- fs/ceph/mds_client.h | 2 ++ 6 files changed, 70 insertions(+), 22 deletions(-) (limited to 'fs/ceph/caps.c') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 4738be54dc73..aff256274415 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1217,10 +1217,9 @@ struct cap_msg_args { umode_t mode; bool inline_data; bool wake; + bool encrypted; u32 fscrypt_auth_len; - u32 fscrypt_file_len; u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context - u8 fscrypt_file[sizeof(u64)]; // for size }; /* Marshal up the cap msg to the MDS */ @@ -1255,7 +1254,13 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) fc->ino = cpu_to_le64(arg->ino); fc->snap_follows = cpu_to_le64(arg->follows); - fc->size = cpu_to_le64(arg->size); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (arg->encrypted) + fc->size = cpu_to_le64(round_up(arg->size, + CEPH_FSCRYPT_BLOCK_SIZE)); + else +#endif + fc->size = cpu_to_le64(arg->size); fc->max_size = cpu_to_le64(arg->max_size); ceph_encode_timespec64(&fc->mtime, &arg->mtime); ceph_encode_timespec64(&fc->atime, &arg->atime); @@ -1315,11 +1320,17 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) ceph_encode_64(&p, 0); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) - /* fscrypt_auth and fscrypt_file (version 12) */ + /* + * fscrypt_auth and fscrypt_file (version 12) + * + * fscrypt_auth holds the crypto context (if any). 
fscrypt_file + * tracks the real i_size as an __le64 field (and we use a rounded-up + * i_size in the traditional size field). + */ ceph_encode_32(&p, arg->fscrypt_auth_len); ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len); - ceph_encode_32(&p, arg->fscrypt_file_len); - ceph_encode_copy(&p, arg->fscrypt_file, arg->fscrypt_file_len); + ceph_encode_32(&p, sizeof(__le64)); + ceph_encode_64(&p, arg->size); #else /* CONFIG_FS_ENCRYPTION */ ceph_encode_32(&p, 0); ceph_encode_32(&p, 0); @@ -1391,7 +1402,6 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, arg->follows = flushing ? ci->i_head_snapc->seq : 0; arg->flush_tid = flush_tid; arg->oldest_flush_tid = oldest_flush_tid; - arg->size = i_size_read(inode); ci->i_reported_size = arg->size; arg->max_size = ci->i_wanted_max_size; @@ -1445,6 +1455,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, } } arg->flags = flags; + arg->encrypted = IS_ENCRYPTED(inode); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) if (ci->fscrypt_auth_len && WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) { @@ -1456,21 +1467,21 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, min_t(size_t, ci->fscrypt_auth_len, sizeof(arg->fscrypt_auth))); } - /* FIXME: use this to track "real" size */ - arg->fscrypt_file_len = 0; #endif /* CONFIG_FS_ENCRYPTION */ } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ - 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4) + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8) -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) static inline int cap_msg_size(struct cap_msg_args *arg) { - return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len + - arg->fscrypt_file_len; + return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len; } #else +#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4) + static inline 
int cap_msg_size(struct cap_msg_args *arg) { return CAP_MSG_FIXED_FIELDS; @@ -1550,13 +1561,10 @@ static inline int __send_flush_snap(struct inode *inode, arg.inline_data = capsnap->inline_data; arg.flags = 0; arg.wake = false; + arg.encrypted = IS_ENCRYPTED(inode); - /* - * No fscrypt_auth changes from a capsnap. It will need - * to update fscrypt_file on size changes (TODO). - */ + /* No fscrypt_auth changes from a capsnap.*/ arg.fscrypt_auth_len = 0; - arg.fscrypt_file_len = 0; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg), GFP_NOFS, false); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 08504afbe242..4990886a366c 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -915,6 +915,9 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, goto out_req; } + if (S_ISREG(mode) && IS_ENCRYPTED(dir)) + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6e9ae398ef2c..c3ce224db032 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -790,6 +790,7 @@ retry: req->r_parent = dir; ihold(dir); if (IS_ENCRYPTED(dir)) { + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); if (!fscrypt_has_encryption_key(dir)) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 9ec4ebe5ccaf..841f60e8de6a 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1028,6 +1028,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + u64 size = le64_to_cpu(info->size); s64 old_pool = ci->i_layout.pool_id; struct ceph_string *old_ns; @@ -1041,10 +1042,22 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, pool_ns = old_ns; + if (IS_ENCRYPTED(inode) && size && + iinfo->fscrypt_file_len == sizeof(__le64)) { + u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file); + + 
if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) { + size = fsize; + } else { + pr_warn("fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n", + info->size, size); + } + } + queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), le64_to_cpu(info->truncate_size), - le64_to_cpu(info->size)); + size); /* only update max_size on auth cap */ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && ci->i_max_size != le64_to_cpu(info->max_size)) { @@ -2388,11 +2401,25 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr, } } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || attr->ia_size != isize) { - req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = cpu_to_le64(isize); mask |= CEPH_SETATTR_SIZE; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + if (IS_ENCRYPTED(inode) && attr->ia_size) { + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + mask |= CEPH_SETATTR_FSCRYPT_FILE; + req->r_args.setattr.size = + cpu_to_le64(round_up(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_args.setattr.old_size = + cpu_to_le64(round_up(isize, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_fscrypt_file = attr->ia_size; + /* FIXME: client must zero out any partial blocks! 
*/ + } else { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = cpu_to_le64(isize); + req->r_fscrypt_file = 0; + } } } if (ia_valid & ATTR_MTIME) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 7de22052ee22..c257d75c5757 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2832,7 +2832,12 @@ static void encode_mclientrequest_tail(void **p, } else { ceph_encode_32(p, 0); } - ceph_encode_32(p, 0); // fscrypt_file for now + if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) { + ceph_encode_32(p, sizeof(__le64)); + ceph_encode_64(p, req->r_fscrypt_file); + } else { + ceph_encode_32(p, 0); + } } /* @@ -2922,6 +2927,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, /* fscrypt_file */ len += sizeof(u32); + if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) + len += sizeof(__le64); msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); if (!msg) { diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index a8f622439f38..0477388a0d1c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -282,6 +282,7 @@ struct ceph_mds_request { #define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ #define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? 
*/ #define CEPH_MDS_R_ASYNC (8) /* async request */ +#define CEPH_MDS_R_FSCRYPT_FILE (9) /* must marshal fscrypt_file field */ unsigned long r_req_flags; struct mutex r_fill_mutex; @@ -289,6 +290,7 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; struct ceph_fscrypt_auth *r_fscrypt_auth; + u64 r_fscrypt_file; u8 *r_altname; /* fscrypt binary crypttext for long filenames */ u32 r_altname_len; /* length of r_altname */ -- cgit v1.2.3 From 0d91f0ad6a01c8c64a84c5255c5ab95133d0fed5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 25 Aug 2022 09:31:09 -0400 Subject: ceph: handle fscrypt fields in cap messages from MDS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Handle the new fscrypt_file and fscrypt_auth fields in cap messages. Use them to populate new fields in cap_extra_info and update the inode with those values. Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Reviewed-and-tested-by: Luís Henriques Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 83 insertions(+), 2 deletions(-) (limited to 'fs/ceph/caps.c') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index aff256274415..ee61b539a8ec 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3383,6 +3383,9 @@ struct cap_extra_info { /* currently issued */ int issued; struct timespec64 btime; + u8 *fscrypt_auth; + u32 fscrypt_auth_len; + u64 fscrypt_file_size; }; /* @@ -3415,6 +3418,14 @@ static void handle_cap_grant(struct inode *inode, bool deleted_inode = false; bool fill_inline = false; + /* + * If there is at least one crypto block then we'll trust + * fscrypt_file_size. If the real length of the file is 0, then + * ignore it (it has probably been truncated down to 0 by the MDS). 
+ */ + if (IS_ENCRYPTED(inode) && size) + size = extra_info->fscrypt_file_size; + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, session->s_mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, @@ -3481,6 +3492,14 @@ static void handle_cap_grant(struct inode *inode, dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, from_kuid(&init_user_ns, inode->i_uid), from_kgid(&init_user_ns, inode->i_gid)); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len || + memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth, + ci->fscrypt_auth_len)) + pr_warn_ratelimited("%s: cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n", + __func__, ci->fscrypt_auth_len, + extra_info->fscrypt_auth_len); +#endif } if ((newcaps & CEPH_CAP_LINK_SHARED) && @@ -3897,7 +3916,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, */ static bool handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, - struct ceph_mds_session *session) + struct ceph_mds_session *session, + struct cap_extra_info *extra_info) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -3914,6 +3934,14 @@ static bool handle_cap_trunc(struct inode *inode, issued |= implemented | dirty; + /* + * If there is at least one crypto block then we'll trust + * fscrypt_file_size. If the real length of the file is 0, then + * ignore it (it has probably been truncated down to 0 by the MDS). 
+ */ + if (IS_ENCRYPTED(inode) && size) + size = extra_info->fscrypt_file_size; + dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", inode, mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, @@ -4135,6 +4163,52 @@ retry: *target_cap = cap; } +#ifdef CONFIG_FS_ENCRYPTION +static int parse_fscrypt_fields(void **p, void *end, + struct cap_extra_info *extra) +{ + u32 len; + + ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad); + if (extra->fscrypt_auth_len) { + ceph_decode_need(p, end, extra->fscrypt_auth_len, bad); + extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len, + GFP_KERNEL); + if (!extra->fscrypt_auth) + return -ENOMEM; + ceph_decode_copy_safe(p, end, extra->fscrypt_auth, + extra->fscrypt_auth_len, bad); + } + + ceph_decode_32_safe(p, end, len, bad); + if (len >= sizeof(u64)) { + ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad); + len -= sizeof(u64); + } + ceph_decode_skip_n(p, end, len, bad); + return 0; +bad: + return -EIO; +} +#else +static int parse_fscrypt_fields(void **p, void *end, + struct cap_extra_info *extra) +{ + u32 len; + + /* Don't care about these fields unless we're encryption-capable */ + ceph_decode_32_safe(p, end, len, bad); + if (len) + ceph_decode_skip_n(p, end, len, bad); + ceph_decode_32_safe(p, end, len, bad); + if (len) + ceph_decode_skip_n(p, end, len, bad); + return 0; +bad: + return -EIO; +} +#endif + /* * Handle a caps message from the MDS. 
* @@ -4255,6 +4329,11 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad); } + if (msg_version >= 12) { + if (parse_fscrypt_fields(&p, end, &extra_info)) + goto bad; + } + /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, @@ -4352,7 +4431,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, break; case CEPH_CAP_OP_TRUNC: - queue_trunc = handle_cap_trunc(inode, h, session); + queue_trunc = handle_cap_trunc(inode, h, session, + &extra_info); spin_unlock(&ci->i_ceph_lock); if (queue_trunc) ceph_queue_vmtruncate(inode); @@ -4375,6 +4455,7 @@ out: if (close_sessions) ceph_mdsc_close_sessions(mdsc); + kfree(extra_info.fscrypt_auth); return; flush_cap_releases: -- cgit v1.2.3 From 5c64737d253683b7d138dde0da513a9ade16a170 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 25 Aug 2022 09:31:11 -0400 Subject: ceph: add truncate size handling support for fscrypt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will transfer the encrypted last block contents to the MDS along with the truncate request only when the new size is smaller and not aligned to the fscrypt BLOCK size. When the last block is located in the file hole, the truncate request will only contain the header. The MDS could fail to do the truncate if another client or process has already updated the RADOS object which contains the last block; it will then return -EAGAIN, and the kclient needs to retry it. The RMW will take around 50ms, and we will let it retry 20 times for now. 
Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Reviewed-and-tested-by: Luís Henriques Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 18 +++-- fs/ceph/crypto.h | 21 ++++++ fs/ceph/inode.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/ceph/super.h | 7 ++ 4 files changed, 234 insertions(+), 12 deletions(-) (limited to 'fs/ceph/caps.c') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ee61b539a8ec..5c2b28ac9410 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2960,10 +2960,9 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got) +int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, + int want, loff_t endoff, int *got) { - struct ceph_file_info *fi = filp->private_data; - struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int ret, _got, flags; @@ -2972,7 +2971,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got if (ret < 0) return ret; - if ((fi->fmode & CEPH_FILE_MODE_WR) && + if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) return -EBADF; @@ -3025,7 +3024,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got continue; } - if ((fi->fmode & CEPH_FILE_MODE_WR) && + if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) { if (ret >= 0 && _got) ceph_put_cap_refs(ci, _got); @@ -3088,6 +3087,15 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got return 0; } +int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, + int *got) +{ + struct ceph_file_info *fi = filp->private_data; + struct inode *inode = 
file_inode(filp); + + return __ceph_get_caps(inode, fi, need, want, endoff, got); +} + /* * Take cap refs. Caller must already know we hold at least one ref * on the caps in question or we don't know this is safe. diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h index 7a03275f3ad5..b735b3f144a7 100644 --- a/fs/ceph/crypto.h +++ b/fs/ceph/crypto.h @@ -26,6 +26,27 @@ struct ceph_fname { bool no_copy; }; +/* + * Header for the crypted file when truncating the size, this + * will be sent to MDS, and the MDS will update the encrypted + * last block and then truncate the size. + */ +struct ceph_fscrypt_truncate_size_header { + __u8 ver; + __u8 compat; + + /* + * It will be sizeof(assert_ver + file_offset + block_size) + * if the last block is empty when it's located in a file + * hole. Or the data_len will plus CEPH_FSCRYPT_BLOCK_SIZE. + */ + __le32 data_len; + + __le64 change_attr; + __le64 file_offset; + __le32 block_size; +} __packed; + struct ceph_fscrypt_auth { __le32 cfa_version; __le32 cfa_blob_len; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 841f60e8de6a..054fd66609b7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -596,6 +596,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_truncate_seq = 0; ci->i_truncate_size = 0; ci->i_truncate_pending = 0; + ci->i_truncate_pagecache_size = 0; ci->i_max_size = 0; ci->i_reported_size = 0; @@ -767,6 +768,10 @@ int ceph_fill_file_size(struct inode *inode, int issued, dout("truncate_size %lld -> %llu\n", ci->i_truncate_size, truncate_size); ci->i_truncate_size = truncate_size; + if (IS_ENCRYPTED(inode)) + ci->i_truncate_pagecache_size = size; + else + ci->i_truncate_pagecache_size = truncate_size; } return queue_trunc; } @@ -2147,7 +2152,7 @@ retry: /* there should be no reader or writer */ WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); - to = ci->i_truncate_size; + to = ci->i_truncate_pagecache_size; wrbuffer_refs = ci->i_wrbuffer_ref; dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, 
ci->i_truncate_pending, to); @@ -2157,7 +2162,7 @@ retry: truncate_pagecache(inode, to); spin_lock(&ci->i_ceph_lock); - if (to == ci->i_truncate_size) { + if (to == ci->i_truncate_pagecache_size) { ci->i_truncate_pending = 0; finish = 1; } @@ -2241,6 +2246,144 @@ static const struct inode_operations ceph_encrypted_symlink_iops = { .listxattr = ceph_listxattr, }; +/* + * Transfer the encrypted last block to the MDS and the MDS + * will help update it when truncating a smaller size. + * + * We don't support a PAGE_SIZE that is smaller than the + * CEPH_FSCRYPT_BLOCK_SIZE. + */ +static int fill_fscrypt_truncate(struct inode *inode, + struct ceph_mds_request *req, + struct iattr *attr) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE; + loff_t pos, orig_pos = round_down(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE); + u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT; + struct ceph_pagelist *pagelist = NULL; + struct kvec iov = {0}; + struct iov_iter iter; + struct page *page = NULL; + struct ceph_fscrypt_truncate_size_header header; + int retry_op = 0; + int len = CEPH_FSCRYPT_BLOCK_SIZE; + loff_t i_size = i_size_read(inode); + int got, ret, issued; + u64 objver; + + ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got); + if (ret < 0) + return ret; + + issued = __ceph_caps_issued(ci, NULL); + + dout("%s size %lld -> %lld got cap refs on %s, issued %s\n", __func__, + i_size, attr->ia_size, ceph_cap_string(got), + ceph_cap_string(issued)); + + /* Try to writeback the dirty pagecaches */ + if (issued & (CEPH_CAP_FILE_BUFFER)) { + loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1; + + ret = filemap_write_and_wait_range(inode->i_mapping, + orig_pos, lend); + if (ret < 0) + goto out; + } + + page = __page_cache_alloc(GFP_KERNEL); + if (page == NULL) { + ret = -ENOMEM; + goto out; + } + + pagelist = ceph_pagelist_alloc(GFP_KERNEL); + if (!pagelist) { + ret = -ENOMEM; + goto out; + } + + iov.iov_base = 
kmap_local_page(page); + iov.iov_len = len; + iov_iter_kvec(&iter, READ, &iov, 1, len); + + pos = orig_pos; + ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver); + if (ret < 0) + goto out; + + /* Insert the header first */ + header.ver = 1; + header.compat = 1; + header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode)); + + /* + * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE, + * because in MDS it may need this to do the truncate. + */ + header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE); + + /* + * If we hit a hole here, we should just skip filling + * the fscrypt for the request, because once the fscrypt + * is enabled, the file will be split into many blocks + * with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there + * has a hole, the hole size should be multiple of block + * size. + * + * If the Rados object doesn't exist, it will be set to 0. + */ + if (!objver) { + dout("%s hit hole, ppos %lld < size %lld\n", __func__, + pos, i_size); + + header.data_len = cpu_to_le32(8 + 8 + 4); + header.file_offset = 0; + ret = 0; + } else { + header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE); + header.file_offset = cpu_to_le64(orig_pos); + + /* truncate and zero out the extra contents for the last block */ + memset(iov.iov_base + boff, 0, PAGE_SIZE - boff); + + /* encrypt the last block */ + ret = ceph_fscrypt_encrypt_block_inplace(inode, page, + CEPH_FSCRYPT_BLOCK_SIZE, + 0, block, + GFP_KERNEL); + if (ret) + goto out; + } + + /* Insert the header */ + ret = ceph_pagelist_append(pagelist, &header, sizeof(header)); + if (ret) + goto out; + + if (header.block_size) { + /* Append the last block contents to pagelist */ + ret = ceph_pagelist_append(pagelist, iov.iov_base, + CEPH_FSCRYPT_BLOCK_SIZE); + if (ret) + goto out; + } + req->r_pagelist = pagelist; +out: + dout("%s %p size dropping cap refs on %s\n", __func__, + inode, ceph_cap_string(got)); + ceph_put_cap_refs(ci, got); + if (iov.iov_base) + kunmap_local(iov.iov_base); + 
if (page) + __free_pages(page, 0); + if (ret && pagelist) + ceph_pagelist_release(pagelist); + return ret; +} + int __ceph_setattr(struct inode *inode, struct iattr *attr, struct ceph_iattr *cia) { @@ -2249,13 +2392,17 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr, struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_cap_flush *prealloc_cf; + loff_t isize = i_size_read(inode); int issued; int release = 0, dirtied = 0; int mask = 0; int err = 0; int inode_dirty_flags = 0; bool lock_snap_rwsem = false; + bool fill_fscrypt; + int truncate_retry = 20; /* The RMW will take around 50ms */ +retry: prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; @@ -2267,6 +2414,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr, return PTR_ERR(req); } + fill_fscrypt = false; spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); @@ -2388,10 +2536,27 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr, } } if (ia_valid & ATTR_SIZE) { - loff_t isize = i_size_read(inode); - dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { + /* + * Only when the new size is smaller and not aligned to + * CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed. 
+ */ + if (IS_ENCRYPTED(inode) && attr->ia_size < isize && + (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) { + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + mask |= CEPH_SETATTR_FSCRYPT_FILE; + req->r_args.setattr.size = + cpu_to_le64(round_up(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_args.setattr.old_size = + cpu_to_le64(round_up(isize, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_fscrypt_file = attr->ia_size; + fill_fscrypt = true; + } else if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { if (attr->ia_size > isize) { i_size_write(inode, attr->ia_size); inode->i_blocks = calc_inode_blocks(attr->ia_size); @@ -2414,7 +2579,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr, cpu_to_le64(round_up(isize, CEPH_FSCRYPT_BLOCK_SIZE)); req->r_fscrypt_file = attr->ia_size; - /* FIXME: client must zero out any partial blocks! */ } else { req->r_args.setattr.size = cpu_to_le64(attr->ia_size); req->r_args.setattr.old_size = cpu_to_le64(isize); @@ -2481,8 +2645,10 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr, release &= issued; spin_unlock(&ci->i_ceph_lock); - if (lock_snap_rwsem) + if (lock_snap_rwsem) { up_read(&mdsc->snap_rwsem); + lock_snap_rwsem = false; + } if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); @@ -2494,7 +2660,27 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr, req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; req->r_stamp = attr->ia_ctime; + if (fill_fscrypt) { + err = fill_fscrypt_truncate(inode, req, attr); + if (err) + goto out; + } + + /* + * The truncate request will return -EAGAIN when the + * last block has been updated just before the MDS + * successfully gets the xlock for the FILE lock. To + * avoid corrupting the file contents we need to retry + * it. 
+ */ err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err == -EAGAIN && truncate_retry--) { + dout("setattr %p result=%d (%s locally, %d remote), retry it!\n", + inode, err, ceph_cap_string(dirtied), mask); + ceph_mdsc_put_request(req); + ceph_free_cap_flush(prealloc_cf); + goto retry; + } } out: dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index d60342cc6f33..b5e54c8f010b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -424,6 +424,11 @@ struct ceph_inode_info { u32 i_truncate_seq; /* last truncate to smaller size */ u64 i_truncate_size; /* and the size we last truncated down to */ int i_truncate_pending; /* still need to call vmtruncate */ + /* + * For none fscrypt case it equals to i_truncate_size or it will + * equals to fscrypt_file_size + */ + u64 i_truncate_pagecache_size; u64 i_max_size; /* max file size authorized by mds */ u64 i_reported_size; /* (max_)size reported to or requested of mds */ @@ -1265,6 +1270,8 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, struct inode *dir, int mds, int drop, int unless); +extern int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, + int need, int want, loff_t endoff, int *got); extern int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got); extern int ceph_try_get_caps(struct inode *inode, -- cgit v1.2.3 From e3dfcab2080dc1f9a4b09cc1327361bc2845bfcd Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 21 Dec 2022 14:13:51 +0800 Subject: ceph: drop messages from MDS when unmounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When unmounting all the dirty buffers will be flushed and after the last osd request is finished the last reference of the i_count will be released. 
Then it will flush the dirty caps/snaps to the MDSs, but the unmount won't wait for the possible acks. Handling those acks will ihold() the inodes when updating the metadata locally, which makes no sense any more at this stage of unmounting. This will make evict_inodes() skip these inodes. If encryption is enabled, the kernel generates a warning when removing the encryption keys while the skipped inodes still hold the keyring: WARNING: CPU: 4 PID: 168846 at fs/crypto/keyring.c:242 fscrypt_destroy_keyring+0x7e/0xd0 CPU: 4 PID: 168846 Comm: umount Tainted: G S 6.1.0-rc5-ceph-g72ead199864c #1 Hardware name: Supermicro SYS-5018R-WR/X10SRW-F, BIOS 2.0 12/17/2015 RIP: 0010:fscrypt_destroy_keyring+0x7e/0xd0 RSP: 0018:ffffc9000b277e28 EFLAGS: 00010202 RAX: 0000000000000002 RBX: ffff88810d52ac00 RCX: ffff88810b56aa00 RDX: 0000000080000000 RSI: ffffffff822f3a09 RDI: ffff888108f59000 RBP: ffff8881d394fb88 R08: 0000000000000028 R09: 0000000000000000 R10: 0000000000000001 R11: 11ff4fe6834fcd91 R12: ffff8881d394fc40 R13: ffff888108f59000 R14: ffff8881d394f800 R15: 0000000000000000 FS: 00007fd83f6f1080(0000) GS:ffff88885fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f918d417000 CR3: 000000017f89a005 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: generic_shutdown_super+0x47/0x120 kill_anon_super+0x14/0x30 ceph_kill_sb+0x36/0x90 [ceph] deactivate_locked_super+0x29/0x60 cleanup_mnt+0xb8/0x140 task_work_run+0x67/0xb0 exit_to_user_mode_prepare+0x23d/0x240 syscall_exit_to_user_mode+0x25/0x60 do_syscall_64+0x40/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fd83dc39e9b Later the kernel will crash when iput() is called on the inodes and dereferences "sb->s_master_keys", which has already been released by generic_shutdown_super().
Link: https://tracker.ceph.com/issues/59162 Signed-off-by: Xiubo Li Reviewed-and-tested-by: Luís Henriques Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 6 ++++- fs/ceph/mds_client.c | 12 +++++++-- fs/ceph/mds_client.h | 11 +++++--- fs/ceph/quota.c | 14 +++++----- fs/ceph/snap.c | 10 ++++--- fs/ceph/super.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++---- fs/ceph/super.h | 3 +++ 7 files changed, 109 insertions(+), 22 deletions(-) (limited to 'fs/ceph/caps.c') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5c2b28ac9410..54041d9c1e25 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4247,6 +4247,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout("handle_caps from mds%d\n", session->s_mds); + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + /* decode */ end = msg->front.iov_base + msg->front.iov_len; if (msg->front.iov_len < sizeof(*h)) @@ -4348,7 +4351,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, vino.snap, inode); mutex_lock(&session->s_mutex); - inc_session_sequence(session); dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); @@ -4457,6 +4459,8 @@ done: done_unlocked: iput(inode); out: + ceph_dec_mds_stopping_blocker(mdsc); + ceph_put_string(extra_info.pool_ns); /* Defer closing the sessions after s_mutex lock being released */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c257d75c5757..04a881343e43 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4889,6 +4889,9 @@ static void handle_lease(struct ceph_mds_client *mdsc, dout("handle_lease from mds%d\n", mds); + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + /* decode */ if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) goto bad; @@ -4907,8 +4910,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, dname.len, dname.name); mutex_lock(&session->s_mutex); - inc_session_sequence(session); - if (!inode) { dout("handle_lease no inode %llx\n", 
vino.ino); goto release; @@ -4970,9 +4971,13 @@ release: out: mutex_unlock(&session->s_mutex); iput(inode); + + ceph_dec_mds_stopping_blocker(mdsc); return; bad: + ceph_dec_mds_stopping_blocker(mdsc); + pr_err("corrupt lease message\n"); ceph_msg_dump(msg); } @@ -5168,6 +5173,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) } init_completion(&mdsc->safe_umount_waiters); + spin_lock_init(&mdsc->stopping_lock); + atomic_set(&mdsc->stopping_blockers, 0); + init_completion(&mdsc->stopping_waiter); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->quotarealms_inodes = RB_ROOT; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 0477388a0d1c..1fa0f78b7b79 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -399,8 +399,9 @@ struct cap_wait { }; enum { - CEPH_MDSC_STOPPING_BEGIN = 1, - CEPH_MDSC_STOPPING_FLUSHED = 2, + CEPH_MDSC_STOPPING_BEGIN = 1, + CEPH_MDSC_STOPPING_FLUSHING = 2, + CEPH_MDSC_STOPPING_FLUSHED = 3, }; /* @@ -419,7 +420,11 @@ struct ceph_mds_client { struct ceph_mds_session **sessions; /* NULL for mds if no session */ atomic_t num_sessions; int max_sessions; /* len of sessions array */ - int stopping; /* true if shutting down */ + + spinlock_t stopping_lock; /* protect snap_empty */ + int stopping; /* the stage of shutting down */ + atomic_t stopping_blockers; + struct completion stopping_waiter; atomic64_t quotarealms_count; /* # realms with quota */ /* diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 64592adfe48f..f7fcf7f08ec6 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -47,25 +47,23 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, struct inode *inode; struct ceph_inode_info *ci; + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + if (msg->front.iov_len < sizeof(*h)) { pr_err("%s corrupt message mds%d len %d\n", __func__, session->s_mds, (int)msg->front.iov_len); ceph_msg_dump(msg); - return; + goto out; } - /* increment msg sequence number */ - 
mutex_lock(&session->s_mutex); - inc_session_sequence(session); - mutex_unlock(&session->s_mutex); - /* lookup inode */ vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); if (!inode) { pr_warn("Failed to find inode %llu\n", vino.ino); - return; + goto out; } ci = ceph_inode(inode); @@ -78,6 +76,8 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, spin_unlock(&ci->i_ceph_lock); iput(inode); +out: + ceph_dec_mds_stopping_blocker(mdsc); } static struct ceph_quotarealm_inode * diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 343d738448dc..7ddc6bad77ef 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1015,6 +1015,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, int locked_rwsem = 0; bool close_sessions = false; + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + /* decode */ if (msg->front.iov_len < sizeof(*h)) goto bad; @@ -1030,10 +1033,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, dout("%s from mds%d op %s split %llx tracelen %d\n", __func__, mds, ceph_snap_op_name(op), split, trace_len); - mutex_lock(&session->s_mutex); - inc_session_sequence(session); - mutex_unlock(&session->s_mutex); - down_write(&mdsc->snap_rwsem); locked_rwsem = 1; @@ -1151,6 +1150,7 @@ skip_inode: up_write(&mdsc->snap_rwsem); flush_snaps(mdsc); + ceph_dec_mds_stopping_blocker(mdsc); return; bad: @@ -1160,6 +1160,8 @@ out: if (locked_rwsem) up_write(&mdsc->snap_rwsem); + ceph_dec_mds_stopping_blocker(mdsc); + if (close_sessions) ceph_mdsc_close_sessions(mdsc); return; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 75dd1b6b3d01..1c14d87ed871 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1462,25 +1462,90 @@ nomem: return -ENOMEM; } +/* + * Return true if it successfully increases the blocker counter, + * or false if the mdsc is in stopping and flushed state. 
+ */ +static bool __inc_stopping_blocker(struct ceph_mds_client *mdsc) +{ + spin_lock(&mdsc->stopping_lock); + if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) { + spin_unlock(&mdsc->stopping_lock); + return false; + } + atomic_inc(&mdsc->stopping_blockers); + spin_unlock(&mdsc->stopping_lock); + return true; +} + +static void __dec_stopping_blocker(struct ceph_mds_client *mdsc) +{ + spin_lock(&mdsc->stopping_lock); + if (!atomic_dec_return(&mdsc->stopping_blockers) && + mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) + complete_all(&mdsc->stopping_waiter); + spin_unlock(&mdsc->stopping_lock); +} + +/* For metadata IO requests */ +bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + mutex_lock(&session->s_mutex); + inc_session_sequence(session); + mutex_unlock(&session->s_mutex); + + return __inc_stopping_blocker(mdsc); +} + +void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc) +{ + __dec_stopping_blocker(mdsc); +} + static void ceph_kill_sb(struct super_block *s) { struct ceph_fs_client *fsc = ceph_sb_to_client(s); + struct ceph_mds_client *mdsc = fsc->mdsc; + bool wait; dout("kill_sb %p\n", s); - ceph_mdsc_pre_umount(fsc->mdsc); + ceph_mdsc_pre_umount(mdsc); flush_fs_workqueues(fsc); /* * Though the kill_anon_super() will finally trigger the - * sync_filesystem() anyway, we still need to do it here - * and then bump the stage of shutdown to stop the work - * queue as earlier as possible. + * sync_filesystem() anyway, we still need to do it here and + * then bump the stage of shutdown. This will allow us to + * drop any further message, which will increase the inodes' + * i_count reference counters but makes no sense any more, + * from MDSs. 
+ * + * Without this when evicting the inodes it may fail in the + * kill_anon_super(), which will trigger a warning when + * destroying the fscrypt keyring and then possibly trigger + * a further crash in ceph module when the iput() tries to + * evict the inodes later. */ sync_filesystem(s); - fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; + spin_lock(&mdsc->stopping_lock); + mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING; + wait = !!atomic_read(&mdsc->stopping_blockers); + spin_unlock(&mdsc->stopping_lock); + + if (wait && atomic_read(&mdsc->stopping_blockers)) { + long timeleft = wait_for_completion_killable_timeout( + &mdsc->stopping_waiter, + fsc->client->options->mount_timeout); + if (!timeleft) /* timed out */ + pr_warn("umount timed out, %ld\n", timeleft); + else if (timeleft < 0) /* killed */ + pr_warn("umount was killed, %ld\n", timeleft); + } + mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; kill_anon_super(s); fsc->client->extra_mon_dispatch = NULL; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b5e54c8f010b..d8eb35d73a23 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1413,4 +1413,7 @@ extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf); extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); +bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc); #endif /* _FS_CEPH_SUPER_H */ -- cgit v1.2.3 From 295fc4aa7de4b72cfd764b75a238f79b9433e3ec Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 6 Mar 2023 15:01:15 +0800 Subject: ceph: fix updating i_truncate_pagecache_size for fscrypt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When fscrypt is enabled we will align the truncate size up to the CEPH_FSCRYPT_BLOCK_SIZE always, so if we truncate the size in the same block more than once, the latter ones will be skipped being invalidated from 
the page caches. This will force invalidating the page caches by using the smaller size than the real file size. At the same time add more debug log and fix the debug log for truncate code. Link: https://tracker.ceph.com/issues/58834 Signed-off-by: Xiubo Li Reviewed-and-tested-by: Luís Henriques Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 4 ++-- fs/ceph/inode.c | 35 ++++++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 13 deletions(-) (limited to 'fs/ceph/caps.c') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 54041d9c1e25..028b5140a85d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3950,8 +3950,8 @@ static bool handle_cap_trunc(struct inode *inode, if (IS_ENCRYPTED(inode) && size) size = extra_info->fscrypt_file_size; - dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", - inode, mds, seq, truncate_size, truncate_seq); + dout("%s inode %p mds%d seq %d to %lld truncate seq %d\n", + __func__, inode, mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); return queue_trunc; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 83c420646f90..ea6f966dacd5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -764,7 +764,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, ceph_fscache_update(inode); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { - dout("truncate_seq %u -> %u\n", + dout("%s truncate_seq %u -> %u\n", __func__, ci->i_truncate_seq, truncate_seq); ci->i_truncate_seq = truncate_seq; @@ -788,15 +788,26 @@ int ceph_fill_file_size(struct inode *inode, int issued, } } } - if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 && - ci->i_truncate_size != truncate_size) { - dout("truncate_size %lld -> %llu\n", ci->i_truncate_size, - truncate_size); + + /* + * It's possible that the new sizes of the two consecutive + * size truncations will be in the same fscrypt last block, + * and we need to 
truncate the corresponding page caches + * anyway. + */ + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) { + dout("%s truncate_size %lld -> %llu, encrypted %d\n", __func__, + ci->i_truncate_size, truncate_size, !!IS_ENCRYPTED(inode)); + ci->i_truncate_size = truncate_size; - if (IS_ENCRYPTED(inode)) + + if (IS_ENCRYPTED(inode)) { + dout("%s truncate_pagecache_size %lld -> %llu\n", + __func__, ci->i_truncate_pagecache_size, size); ci->i_truncate_pagecache_size = size; - else + } else { ci->i_truncate_pagecache_size = truncate_size; + } } return queue_trunc; } @@ -2155,7 +2166,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) retry: spin_lock(&ci->i_ceph_lock); if (ci->i_truncate_pending == 0) { - dout("__do_pending_vmtruncate %p none pending\n", inode); + dout("%s %p none pending\n", __func__, inode); spin_unlock(&ci->i_ceph_lock); mutex_unlock(&ci->i_truncate_mutex); return; @@ -2167,8 +2178,7 @@ retry: */ if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { spin_unlock(&ci->i_ceph_lock); - dout("__do_pending_vmtruncate %p flushing snaps first\n", - inode); + dout("%s %p flushing snaps first\n", __func__, inode); filemap_write_and_wait_range(&inode->i_data, 0, inode->i_sb->s_maxbytes); goto retry; @@ -2179,7 +2189,7 @@ retry: to = ci->i_truncate_pagecache_size; wrbuffer_refs = ci->i_wrbuffer_ref; - dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, + dout("%s %p (%d) to %lld\n", __func__, inode, ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); @@ -2371,6 +2381,9 @@ static int fill_fscrypt_truncate(struct inode *inode, header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE); header.file_offset = cpu_to_le64(orig_pos); + dout("%s encrypt block boff/bsize %d/%lu\n", __func__, + boff, CEPH_FSCRYPT_BLOCK_SIZE); + /* truncate and zero out the extra contents for the last block */ memset(iov.iov_base + boff, 0, PAGE_SIZE - boff); -- cgit v1.2.3