summaryrefslogtreecommitdiff
path: root/fs/ceph/inode.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-09-06 22:10:15 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2023-09-06 22:10:15 +0300
commit7ba2090ca64ea1aa435744884124387db1fac70f (patch)
treeed4ea24f4cfed5f28b9c8cdf99dbdf7df6a221ae /fs/ceph/inode.c
parent744a759492b5c57ff24a6e8aabe47b17ad8ee964 (diff)
parentce0d5bd3a6c176f9a3bf867624a07119dd4d0878 (diff)
downloadlinux-7ba2090ca64ea1aa435744884124387db1fac70f.tar.xz
Merge tag 'ceph-for-6.6-rc1' of https://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov: "Mixed with some fixes and cleanups, this brings in reasonably complete fscrypt support to CephFS! The list of things which don't work with encryption should be fairly short, mostly around the edges: fallocate (not supported well in CephFS to begin with), copy_file_range (requires re-encryption), non-default striping patterns. This was a multi-year effort principally by Jeff Layton with assistance from Xiubo Li, Luís Henriques and others, including several dependant changes in the MDS, netfs helper library and fscrypt framework itself" * tag 'ceph-for-6.6-rc1' of https://github.com/ceph/ceph-client: (53 commits) ceph: make num_fwd and num_retry to __u32 ceph: make members in struct ceph_mds_request_args_ext a union rbd: use list_for_each_entry() helper libceph: do not include crypto/algapi.h ceph: switch ceph_lookup/atomic_open() to use new fscrypt helper ceph: fix updating i_truncate_pagecache_size for fscrypt ceph: wait for OSD requests' callbacks to finish when unmounting ceph: drop messages from MDS when unmounting ceph: update documentation regarding snapshot naming limitations ceph: prevent snapshot creation in encrypted locked directories ceph: add support for encrypted snapshot names ceph: invalidate pages when doing direct/sync writes ceph: plumb in decryption during reads ceph: add encryption support to writepage and writepages ceph: add read/modify/write to ceph_sync_write ceph: align data in pages in ceph_sync_write ceph: don't use special DIO path for encrypted inodes ceph: add truncate size handling support for fscrypt ceph: add object version support for sync read libceph: allow ceph_osdc_new_request to accept a multi-op read ...
Diffstat (limited to 'fs/ceph/inode.c')
-rw-r--r--fs/ceph/inode.c625
1 files changed, 571 insertions, 54 deletions
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index fd05d68e2990..800ab7920513 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -14,10 +14,12 @@
#include <linux/random.h>
#include <linux/sort.h>
#include <linux/iversion.h>
+#include <linux/fscrypt.h>
#include "super.h"
#include "mds_client.h"
#include "cache.h"
+#include "crypto.h"
#include <linux/ceph/decode.h>
/*
@@ -33,6 +35,7 @@
*/
static const struct inode_operations ceph_symlink_iops;
+static const struct inode_operations ceph_encrypted_symlink_iops;
static void ceph_inode_work(struct work_struct *work);
@@ -52,17 +55,99 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
return 0;
}
-struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
+/**
+ * ceph_new_inode - allocate a new inode in advance of an expected create
+ * @dir: parent directory for new inode
+ * @dentry: dentry that may eventually point to new inode
+ * @mode: mode of new inode
+ * @as_ctx: pointer to inherited security context
+ *
+ * Allocate a new inode in advance of an operation to create a new inode.
+ * This allocates the inode and sets up the acl_sec_ctx with appropriate
+ * info for the new inode.
+ *
+ * Returns a pointer to the new inode or an ERR_PTR.
+ */
+struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
+ umode_t *mode, struct ceph_acl_sec_ctx *as_ctx)
+{
+ int err;
+ struct inode *inode;
+
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (!S_ISLNK(*mode)) {
+ err = ceph_pre_init_acls(dir, mode, as_ctx);
+ if (err < 0)
+ goto out_err;
+ }
+
+ inode->i_state = 0;
+ inode->i_mode = *mode;
+
+ err = ceph_security_init_secctx(dentry, *mode, as_ctx);
+ if (err < 0)
+ goto out_err;
+
+ /*
+ * We'll skip setting fscrypt context for snapshots, leaving that for
+ * the handle_reply().
+ */
+ if (ceph_snap(dir) != CEPH_SNAPDIR) {
+ err = ceph_fscrypt_prepare_context(dir, inode, as_ctx);
+ if (err)
+ goto out_err;
+ }
+
+ return inode;
+out_err:
+ iput(inode);
+ return ERR_PTR(err);
+}
+
+void ceph_as_ctx_to_req(struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as_ctx)
+{
+ if (as_ctx->pagelist) {
+ req->r_pagelist = as_ctx->pagelist;
+ as_ctx->pagelist = NULL;
+ }
+ ceph_fscrypt_as_ctx_to_req(req, as_ctx);
+}
+
+/**
+ * ceph_get_inode - find or create/hash a new inode
+ * @sb: superblock to search and allocate in
+ * @vino: vino to search for
+ * @newino: optional new inode to insert if one isn't found (may be NULL)
+ *
+ * Search for or insert a new inode into the hash for the given vino, and
+ * return a reference to it. If new is non-NULL, its reference is consumed.
+ */
+struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
+ struct inode *newino)
{
struct inode *inode;
if (ceph_vino_is_reserved(vino))
return ERR_PTR(-EREMOTEIO);
- inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
- ceph_set_ino_cb, &vino);
- if (!inode)
+ if (newino) {
+ inode = inode_insert5(newino, (unsigned long)vino.ino,
+ ceph_ino_compare, ceph_set_ino_cb, &vino);
+ if (inode != newino)
+ iput(newino);
+ } else {
+ inode = iget5_locked(sb, (unsigned long)vino.ino,
+ ceph_ino_compare, ceph_set_ino_cb, &vino);
+ }
+
+ if (!inode) {
+ dout("No inode found for %llx.%llx\n", vino.ino, vino.snap);
return ERR_PTR(-ENOMEM);
+ }
dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
@@ -78,8 +163,9 @@ struct inode *ceph_get_snapdir(struct inode *parent)
.ino = ceph_ino(parent),
.snap = CEPH_SNAPDIR,
};
- struct inode *inode = ceph_get_inode(parent->i_sb, vino);
+ struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL);
struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret = -ENOTDIR;
if (IS_ERR(inode))
return inode;
@@ -105,6 +191,24 @@ struct inode *ceph_get_snapdir(struct inode *parent)
ci->i_rbytes = 0;
ci->i_btime = ceph_inode(parent)->i_btime;
+#ifdef CONFIG_FS_ENCRYPTION
+ /* if encrypted, just borrow fscrypt_auth from parent */
+ if (IS_ENCRYPTED(parent)) {
+ struct ceph_inode_info *pci = ceph_inode(parent);
+
+ ci->fscrypt_auth = kmemdup(pci->fscrypt_auth,
+ pci->fscrypt_auth_len,
+ GFP_KERNEL);
+ if (ci->fscrypt_auth) {
+ inode->i_flags |= S_ENCRYPTED;
+ ci->fscrypt_auth_len = pci->fscrypt_auth_len;
+ } else {
+ dout("Failed to alloc snapdir fscrypt_auth\n");
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+#endif
if (inode->i_state & I_NEW) {
inode->i_op = &ceph_snapdir_iops;
inode->i_fop = &ceph_snapdir_fops;
@@ -118,7 +222,7 @@ err:
discard_new_inode(inode);
else
iput(inode);
- return ERR_PTR(-ENOTDIR);
+ return ERR_PTR(ret);
}
const struct inode_operations ceph_file_iops = {
@@ -517,6 +621,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_truncate_seq = 0;
ci->i_truncate_size = 0;
ci->i_truncate_pending = 0;
+ ci->i_truncate_pagecache_size = 0;
ci->i_max_size = 0;
ci->i_reported_size = 0;
@@ -547,6 +652,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_work, ceph_inode_work);
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
+#ifdef CONFIG_FS_ENCRYPTION
+ ci->fscrypt_auth = NULL;
+ ci->fscrypt_auth_len = 0;
+#endif
return &ci->netfs.inode;
}
@@ -555,6 +664,10 @@ void ceph_free_inode(struct inode *inode)
struct ceph_inode_info *ci = ceph_inode(inode);
kfree(ci->i_symlink);
+#ifdef CONFIG_FS_ENCRYPTION
+ kfree(ci->fscrypt_auth);
+#endif
+ fscrypt_free_inode(inode);
kmem_cache_free(ceph_inode_cachep, ci);
}
@@ -575,6 +688,7 @@ void ceph_evict_inode(struct inode *inode)
clear_inode(inode);
ceph_fscache_unregister_inode_cookie(ci);
+ fscrypt_put_encryption_info(inode);
__ceph_remove_caps(ci);
@@ -650,7 +764,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
ceph_fscache_update(inode);
ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) {
- dout("truncate_seq %u -> %u\n",
+ dout("%s truncate_seq %u -> %u\n", __func__,
ci->i_truncate_seq, truncate_seq);
ci->i_truncate_seq = truncate_seq;
@@ -674,11 +788,26 @@ int ceph_fill_file_size(struct inode *inode, int issued,
}
}
}
- if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
- ci->i_truncate_size != truncate_size) {
- dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
- truncate_size);
+
+ /*
+ * It's possible that the new sizes of the two consecutive
+ * size truncations will be in the same fscrypt last block,
+ * and we need to truncate the corresponding page caches
+ * anyway.
+ */
+ if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) {
+ dout("%s truncate_size %lld -> %llu, encrypted %d\n", __func__,
+ ci->i_truncate_size, truncate_size, !!IS_ENCRYPTED(inode));
+
ci->i_truncate_size = truncate_size;
+
+ if (IS_ENCRYPTED(inode)) {
+ dout("%s truncate_pagecache_size %lld -> %llu\n",
+ __func__, ci->i_truncate_pagecache_size, size);
+ ci->i_truncate_pagecache_size = size;
+ } else {
+ ci->i_truncate_pagecache_size = truncate_size;
+ }
}
return queue_trunc;
}
@@ -752,6 +881,34 @@ void ceph_fill_file_time(struct inode *inode, int issued,
inode, time_warp_seq, ci->i_time_warp_seq);
}
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym)
+{
+ int declen;
+ u8 *sym;
+
+ sym = kmalloc(enclen + 1, GFP_NOFS);
+ if (!sym)
+ return -ENOMEM;
+
+ declen = ceph_base64_decode(encsym, enclen, sym);
+ if (declen < 0) {
+ pr_err("%s: can't decode symlink (%d). Content: %.*s\n",
+ __func__, declen, enclen, encsym);
+ kfree(sym);
+ return -EIO;
+ }
+ sym[declen + 1] = '\0';
+ *decsym = sym;
+ return declen;
+}
+#else
+static int decode_encrypted_symlink(const char *encsym, int symlen, u8 **decsym)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
/*
* Populate an inode based on info from mds. May be called on new or
* existing inodes.
@@ -857,15 +1014,20 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
issued |= __ceph_caps_dirty(ci);
new_issued = ~issued & info_caps;
- /* directories have fl_stripe_unit set to zero */
- if (le32_to_cpu(info->layout.fl_stripe_unit))
- inode->i_blkbits =
- fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
- else
- inode->i_blkbits = CEPH_BLOCK_SHIFT;
-
__ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
+#ifdef CONFIG_FS_ENCRYPTION
+ if (iinfo->fscrypt_auth_len &&
+ ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) {
+ kfree(ci->fscrypt_auth);
+ ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
+ ci->fscrypt_auth = iinfo->fscrypt_auth;
+ iinfo->fscrypt_auth = NULL;
+ iinfo->fscrypt_auth_len = 0;
+ inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
+ }
+#endif
+
if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
(issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = mode;
@@ -878,6 +1040,15 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
}
+ /* directories have fl_stripe_unit set to zero */
+ if (IS_ENCRYPTED(inode))
+ inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
+ else if (le32_to_cpu(info->layout.fl_stripe_unit))
+ inode->i_blkbits =
+ fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+ else
+ inode->i_blkbits = CEPH_BLOCK_SHIFT;
+
if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
(issued & CEPH_CAP_LINK_EXCL) == 0)
set_nlink(inode, le32_to_cpu(info->nlink));
@@ -899,6 +1070,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (new_version ||
(new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ u64 size = le64_to_cpu(info->size);
s64 old_pool = ci->i_layout.pool_id;
struct ceph_string *old_ns;
@@ -912,10 +1084,22 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
pool_ns = old_ns;
+ if (IS_ENCRYPTED(inode) && size &&
+ iinfo->fscrypt_file_len == sizeof(__le64)) {
+ u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file);
+
+ if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) {
+ size = fsize;
+ } else {
+ pr_warn("fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
+ info->size, size);
+ }
+ }
+
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq),
le64_to_cpu(info->truncate_size),
- le64_to_cpu(info->size));
+ size);
/* only update max_size on auth cap */
if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
ci->i_max_size != le64_to_cpu(info->max_size)) {
@@ -975,26 +1159,42 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
inode->i_fop = &ceph_file_fops;
break;
case S_IFLNK:
- inode->i_op = &ceph_symlink_iops;
if (!ci->i_symlink) {
u32 symlen = iinfo->symlink_len;
char *sym;
spin_unlock(&ci->i_ceph_lock);
- if (symlen != i_size_read(inode)) {
- pr_err("%s %llx.%llx BAD symlink "
- "size %lld\n", __func__,
- ceph_vinop(inode),
- i_size_read(inode));
+ if (IS_ENCRYPTED(inode)) {
+ if (symlen != i_size_read(inode))
+ pr_err("%s %llx.%llx BAD symlink size %lld\n",
+ __func__, ceph_vinop(inode),
+ i_size_read(inode));
+
+ err = decode_encrypted_symlink(iinfo->symlink,
+ symlen, (u8 **)&sym);
+ if (err < 0) {
+ pr_err("%s decoding encrypted symlink failed: %d\n",
+ __func__, err);
+ goto out;
+ }
+ symlen = err;
i_size_write(inode, symlen);
inode->i_blocks = calc_inode_blocks(symlen);
- }
+ } else {
+ if (symlen != i_size_read(inode)) {
+ pr_err("%s %llx.%llx BAD symlink size %lld\n",
+ __func__, ceph_vinop(inode),
+ i_size_read(inode));
+ i_size_write(inode, symlen);
+ inode->i_blocks = calc_inode_blocks(symlen);
+ }
- err = -ENOMEM;
- sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
- if (!sym)
- goto out;
+ err = -ENOMEM;
+ sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
+ if (!sym)
+ goto out;
+ }
spin_lock(&ci->i_ceph_lock);
if (!ci->i_symlink)
@@ -1002,7 +1202,17 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
else
kfree(sym); /* lost a race */
}
- inode->i_link = ci->i_symlink;
+
+ if (IS_ENCRYPTED(inode)) {
+ /*
+ * Encrypted symlinks need to be decrypted before we can
+ * cache their targets in i_link. Don't touch it here.
+ */
+ inode->i_op = &ceph_encrypted_symlink_iops;
+ } else {
+ inode->i_link = ci->i_symlink;
+ inode->i_op = &ceph_symlink_iops;
+ }
break;
case S_IFDIR:
inode->i_op = &ceph_dir_iops;
@@ -1310,8 +1520,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
+ bool is_nokey = false;
struct qstr dname;
struct dentry *dn, *parent;
+ struct fscrypt_str oname = FSTR_INIT(NULL, 0);
+ struct ceph_fname fname = { .dir = dir,
+ .name = rinfo->dname,
+ .ctext = rinfo->altname,
+ .name_len = rinfo->dname_len,
+ .ctext_len = rinfo->altname_len };
BUG_ON(!rinfo->head->is_target);
BUG_ON(req->r_dentry);
@@ -1319,8 +1536,20 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
parent = d_find_any_alias(dir);
BUG_ON(!parent);
- dname.name = rinfo->dname;
- dname.len = rinfo->dname_len;
+ err = ceph_fname_alloc_buffer(dir, &oname);
+ if (err < 0) {
+ dput(parent);
+ goto done;
+ }
+
+ err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
+ if (err < 0) {
+ dput(parent);
+ ceph_fname_free_buffer(dir, &oname);
+ goto done;
+ }
+ dname.name = oname.name;
+ dname.len = oname.len;
dname.hash = full_name_hash(parent, dname.name, dname.len);
tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
@@ -1335,9 +1564,15 @@ retry_lookup:
dname.len, dname.name, dn);
if (!dn) {
dput(parent);
+ ceph_fname_free_buffer(dir, &oname);
err = -ENOMEM;
goto done;
}
+ if (is_nokey) {
+ spin_lock(&dn->d_lock);
+ dn->d_flags |= DCACHE_NOKEY_NAME;
+ spin_unlock(&dn->d_lock);
+ }
err = 0;
} else if (d_really_is_positive(dn) &&
(ceph_ino(d_inode(dn)) != tvino.ino ||
@@ -1349,6 +1584,7 @@ retry_lookup:
dput(dn);
goto retry_lookup;
}
+ ceph_fname_free_buffer(dir, &oname);
req->r_dentry = dn;
dput(parent);
@@ -1552,7 +1788,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
vino.ino = le64_to_cpu(rde->inode.in->ino);
vino.snap = le64_to_cpu(rde->inode.in->snapid);
- in = ceph_get_inode(req->r_dentry->d_sb, vino);
+ in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL);
if (IS_ERR(in)) {
err = PTR_ERR(in);
dout("new_inode badness got %d\n", err);
@@ -1630,7 +1866,8 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
struct dentry *parent = req->r_dentry;
- struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
+ struct inode *inode = d_inode(parent);
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct qstr dname;
struct dentry *dn;
@@ -1704,9 +1941,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
tvino.snap = le64_to_cpu(rde->inode.in->snapid);
if (rinfo->hash_order) {
- u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
- rde->name, rde->name_len);
- hash = ceph_frag_value(hash);
+ u32 hash = ceph_frag_value(rde->raw_hash);
if (hash != last_hash)
fpos_offset = 2;
last_hash = hash;
@@ -1729,6 +1964,11 @@ retry_lookup:
err = -ENOMEM;
goto out;
}
+ if (rde->is_nokey) {
+ spin_lock(&dn->d_lock);
+ dn->d_flags |= DCACHE_NOKEY_NAME;
+ spin_unlock(&dn->d_lock);
+ }
} else if (d_really_is_positive(dn) &&
(ceph_ino(d_inode(dn)) != tvino.ino ||
ceph_snap(d_inode(dn)) != tvino.snap)) {
@@ -1754,7 +1994,7 @@ retry_lookup:
if (d_really_is_positive(dn)) {
in = d_inode(dn);
} else {
- in = ceph_get_inode(parent->d_sb, tvino);
+ in = ceph_get_inode(parent->d_sb, tvino, NULL);
if (IS_ERR(in)) {
dout("new_inode badness\n");
d_drop(dn);
@@ -1927,7 +2167,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
retry:
spin_lock(&ci->i_ceph_lock);
if (ci->i_truncate_pending == 0) {
- dout("__do_pending_vmtruncate %p none pending\n", inode);
+ dout("%s %p none pending\n", __func__, inode);
spin_unlock(&ci->i_ceph_lock);
mutex_unlock(&ci->i_truncate_mutex);
return;
@@ -1939,8 +2179,7 @@ retry:
*/
if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
spin_unlock(&ci->i_ceph_lock);
- dout("__do_pending_vmtruncate %p flushing snaps first\n",
- inode);
+ dout("%s %p flushing snaps first\n", __func__, inode);
filemap_write_and_wait_range(&inode->i_data, 0,
inode->i_sb->s_maxbytes);
goto retry;
@@ -1949,9 +2188,9 @@ retry:
/* there should be no reader or writer */
WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
- to = ci->i_truncate_size;
+ to = ci->i_truncate_pagecache_size;
wrbuffer_refs = ci->i_wrbuffer_ref;
- dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
+ dout("%s %p (%d) to %lld\n", __func__, inode,
ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
@@ -1959,7 +2198,7 @@ retry:
truncate_pagecache(inode, to);
spin_lock(&ci->i_ceph_lock);
- if (to == ci->i_truncate_size) {
+ if (to == ci->i_truncate_pagecache_size) {
ci->i_truncate_pending = 0;
finish = 1;
}
@@ -2000,6 +2239,32 @@ static void ceph_inode_work(struct work_struct *work)
iput(inode);
}
+static const char *ceph_encrypted_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ return fscrypt_get_symlink(inode, ci->i_symlink, i_size_read(inode),
+ done);
+}
+
+static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap,
+ const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ int ret;
+
+ ret = ceph_getattr(idmap, path, stat, request_mask, query_flags);
+ if (ret)
+ return ret;
+ return fscrypt_symlink_getattr(path, stat);
+}
+
/*
* symlinks
*/
@@ -2010,20 +2275,173 @@ static const struct inode_operations ceph_symlink_iops = {
.listxattr = ceph_listxattr,
};
-int __ceph_setattr(struct inode *inode, struct iattr *attr)
+static const struct inode_operations ceph_encrypted_symlink_iops = {
+ .get_link = ceph_encrypted_get_link,
+ .setattr = ceph_setattr,
+ .getattr = ceph_encrypted_symlink_getattr,
+ .listxattr = ceph_listxattr,
+};
+
+/*
+ * Transfer the encrypted last block to the MDS and the MDS
+ * will help update it when truncating a smaller size.
+ *
+ * We don't support a PAGE_SIZE that is smaller than the
+ * CEPH_FSCRYPT_BLOCK_SIZE.
+ */
+static int fill_fscrypt_truncate(struct inode *inode,
+ struct ceph_mds_request *req,
+ struct iattr *attr)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
+ loff_t pos, orig_pos = round_down(attr->ia_size,
+ CEPH_FSCRYPT_BLOCK_SIZE);
+ u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT;
+ struct ceph_pagelist *pagelist = NULL;
+ struct kvec iov = {0};
+ struct iov_iter iter;
+ struct page *page = NULL;
+ struct ceph_fscrypt_truncate_size_header header;
+ int retry_op = 0;
+ int len = CEPH_FSCRYPT_BLOCK_SIZE;
+ loff_t i_size = i_size_read(inode);
+ int got, ret, issued;
+ u64 objver;
+
+ ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got);
+ if (ret < 0)
+ return ret;
+
+ issued = __ceph_caps_issued(ci, NULL);
+
+ dout("%s size %lld -> %lld got cap refs on %s, issued %s\n", __func__,
+ i_size, attr->ia_size, ceph_cap_string(got),
+ ceph_cap_string(issued));
+
+ /* Try to writeback the dirty pagecaches */
+ if (issued & (CEPH_CAP_FILE_BUFFER)) {
+ loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1;
+
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ orig_pos, lend);
+ if (ret < 0)
+ goto out;
+ }
+
+ page = __page_cache_alloc(GFP_KERNEL);
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pagelist = ceph_pagelist_alloc(GFP_KERNEL);
+ if (!pagelist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ iov.iov_base = kmap_local_page(page);
+ iov.iov_len = len;
+ iov_iter_kvec(&iter, READ, &iov, 1, len);
+
+ pos = orig_pos;
+ ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver);
+ if (ret < 0)
+ goto out;
+
+ /* Insert the header first */
+ header.ver = 1;
+ header.compat = 1;
+ header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode));
+
+ /*
+ * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE,
+ * because in MDS it may need this to do the truncate.
+ */
+ header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);
+
+ /*
+ * If we hit a hole here, we should just skip filling
+ * the fscrypt for the request, because once the fscrypt
+ * is enabled, the file will be split into many blocks
+ * with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there
+ * has a hole, the hole size should be multiple of block
+ * size.
+ *
+ * If the Rados object doesn't exist, it will be set to 0.
+ */
+ if (!objver) {
+ dout("%s hit hole, ppos %lld < size %lld\n", __func__,
+ pos, i_size);
+
+ header.data_len = cpu_to_le32(8 + 8 + 4);
+ header.file_offset = 0;
+ ret = 0;
+ } else {
+ header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE);
+ header.file_offset = cpu_to_le64(orig_pos);
+
+ dout("%s encrypt block boff/bsize %d/%lu\n", __func__,
+ boff, CEPH_FSCRYPT_BLOCK_SIZE);
+
+ /* truncate and zero out the extra contents for the last block */
+ memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);
+
+ /* encrypt the last block */
+ ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
+ CEPH_FSCRYPT_BLOCK_SIZE,
+ 0, block,
+ GFP_KERNEL);
+ if (ret)
+ goto out;
+ }
+
+ /* Insert the header */
+ ret = ceph_pagelist_append(pagelist, &header, sizeof(header));
+ if (ret)
+ goto out;
+
+ if (header.block_size) {
+ /* Append the last block contents to pagelist */
+ ret = ceph_pagelist_append(pagelist, iov.iov_base,
+ CEPH_FSCRYPT_BLOCK_SIZE);
+ if (ret)
+ goto out;
+ }
+ req->r_pagelist = pagelist;
+out:
+ dout("%s %p size dropping cap refs on %s\n", __func__,
+ inode, ceph_cap_string(got));
+ ceph_put_cap_refs(ci, got);
+ if (iov.iov_base)
+ kunmap_local(iov.iov_base);
+ if (page)
+ __free_pages(page, 0);
+ if (ret && pagelist)
+ ceph_pagelist_release(pagelist);
+ return ret;
+}
+
+int __ceph_setattr(struct inode *inode, struct iattr *attr,
+ struct ceph_iattr *cia)
{
struct ceph_inode_info *ci = ceph_inode(inode);
unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf;
+ loff_t isize = i_size_read(inode);
int issued;
int release = 0, dirtied = 0;
int mask = 0;
int err = 0;
int inode_dirty_flags = 0;
bool lock_snap_rwsem = false;
+ bool fill_fscrypt;
+ int truncate_retry = 20; /* The RMW will take around 50ms */
+retry:
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
@@ -2035,6 +2453,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
return PTR_ERR(req);
}
+ fill_fscrypt = false;
spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
@@ -2050,6 +2469,43 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
}
dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ if (cia && cia->fscrypt_auth) {
+ u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);
+
+ if (len > sizeof(*cia->fscrypt_auth)) {
+ err = -EINVAL;
+ spin_unlock(&ci->i_ceph_lock);
+ goto out;
+ }
+
+ dout("setattr %llx:%llx fscrypt_auth len %u to %u)\n",
+ ceph_vinop(inode), ci->fscrypt_auth_len, len);
+
+ /* It should never be re-set once set */
+ WARN_ON_ONCE(ci->fscrypt_auth);
+
+ if (issued & CEPH_CAP_AUTH_EXCL) {
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ kfree(ci->fscrypt_auth);
+ ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
+ ci->fscrypt_auth_len = len;
+ } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+ ci->fscrypt_auth_len != len ||
+ memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) {
+ req->r_fscrypt_auth = cia->fscrypt_auth;
+ mask |= CEPH_SETATTR_FSCRYPT_AUTH;
+ release |= CEPH_CAP_AUTH_SHARED;
+ }
+ cia->fscrypt_auth = NULL;
+ }
+#else
+ if (cia && cia->fscrypt_auth) {
+ err = -EINVAL;
+ spin_unlock(&ci->i_ceph_lock);
+ goto out;
+ }
+#endif /* CONFIG_FS_ENCRYPTION */
if (ia_valid & ATTR_UID) {
dout("setattr %p uid %d -> %d\n", inode,
@@ -2119,10 +2575,27 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
}
}
if (ia_valid & ATTR_SIZE) {
- loff_t isize = i_size_read(inode);
-
dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
- if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
+ /*
+ * Only when the new size is smaller and not aligned to
+ * CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed.
+ */
+ if (IS_ENCRYPTED(inode) && attr->ia_size < isize &&
+ (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) {
+ mask |= CEPH_SETATTR_SIZE;
+ release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+ set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+ mask |= CEPH_SETATTR_FSCRYPT_FILE;
+ req->r_args.setattr.size =
+ cpu_to_le64(round_up(attr->ia_size,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ req->r_args.setattr.old_size =
+ cpu_to_le64(round_up(isize,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ req->r_fscrypt_file = attr->ia_size;
+ fill_fscrypt = true;
+ } else if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
if (attr->ia_size > isize) {
i_size_write(inode, attr->ia_size);
inode->i_blocks = calc_inode_blocks(attr->ia_size);
@@ -2132,11 +2605,24 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
}
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
attr->ia_size != isize) {
- req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
- req->r_args.setattr.old_size = cpu_to_le64(isize);
mask |= CEPH_SETATTR_SIZE;
release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+ if (IS_ENCRYPTED(inode) && attr->ia_size) {
+ set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+ mask |= CEPH_SETATTR_FSCRYPT_FILE;
+ req->r_args.setattr.size =
+ cpu_to_le64(round_up(attr->ia_size,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ req->r_args.setattr.old_size =
+ cpu_to_le64(round_up(isize,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ req->r_fscrypt_file = attr->ia_size;
+ } else {
+ req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+ req->r_args.setattr.old_size = cpu_to_le64(isize);
+ req->r_fscrypt_file = 0;
+ }
}
}
if (ia_valid & ATTR_MTIME) {
@@ -2199,8 +2685,10 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
release &= issued;
spin_unlock(&ci->i_ceph_lock);
- if (lock_snap_rwsem)
+ if (lock_snap_rwsem) {
up_read(&mdsc->snap_rwsem);
+ lock_snap_rwsem = false;
+ }
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
@@ -2212,8 +2700,29 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
req->r_args.setattr.mask = cpu_to_le32(mask);
req->r_num_caps = 1;
req->r_stamp = attr->ia_ctime;
+ if (fill_fscrypt) {
+ err = fill_fscrypt_truncate(inode, req, attr);
+ if (err)
+ goto out;
+ }
+
+ /*
+ * The truncate request will return -EAGAIN when the
+ * last block has been updated just before the MDS
+ * successfully gets the xlock for the FILE lock. To
+ * avoid corrupting the file contents we need to retry
+ * it.
+ */
err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err == -EAGAIN && truncate_retry--) {
+ dout("setattr %p result=%d (%s locally, %d remote), retry it!\n",
+ inode, err, ceph_cap_string(dirtied), mask);
+ ceph_mdsc_put_request(req);
+ ceph_free_cap_flush(prealloc_cf);
+ goto retry;
+ }
}
+out:
dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
ceph_cap_string(dirtied), mask);
@@ -2242,6 +2751,10 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
+ err = fscrypt_prepare_setattr(dentry, attr);
+ if (err)
+ return err;
+
err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
if (err != 0)
return err;
@@ -2254,7 +2767,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
return -EDQUOT;
- err = __ceph_setattr(inode, attr);
+ err = __ceph_setattr(inode, attr, NULL);
if (err >= 0 && (attr->ia_valid & ATTR_MODE))
err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode);
@@ -2525,8 +3038,12 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
stat->nlink = 1 + 1 + ci->i_subdirs;
}
- stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC;
stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
+ if (IS_ENCRYPTED(inode))
+ stat->attributes |= STATX_ATTR_ENCRYPTED;
+ stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC |
+ STATX_ATTR_ENCRYPTED);
+
stat->result_mask = request_mask & valid_mask;
return err;
}