summary | refs | log | tree | commit | diff
path: root/fs/ceph/addr.c
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2023-09-06 22:10:15 +0300
committer Linus Torvalds <torvalds@linux-foundation.org> 2023-09-06 22:10:15 +0300
commit 7ba2090ca64ea1aa435744884124387db1fac70f (patch)
tree ed4ea24f4cfed5f28b9c8cdf99dbdf7df6a221ae /fs/ceph/addr.c
parent 744a759492b5c57ff24a6e8aabe47b17ad8ee964 (diff)
parent ce0d5bd3a6c176f9a3bf867624a07119dd4d0878 (diff)
download linux-7ba2090ca64ea1aa435744884124387db1fac70f.tar.xz
Merge tag 'ceph-for-6.6-rc1' of https://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:

"Mixed with some fixes and cleanups, this brings in reasonably complete fscrypt support to CephFS! The list of things which don't work with encryption should be fairly short, mostly around the edges: fallocate (not supported well in CephFS to begin with), copy_file_range (requires re-encryption), non-default striping patterns.

This was a multi-year effort principally by Jeff Layton with assistance from Xiubo Li, Luís Henriques and others, including several dependent changes in the MDS, netfs helper library and fscrypt framework itself"

* tag 'ceph-for-6.6-rc1' of https://github.com/ceph/ceph-client: (53 commits)
  ceph: make num_fwd and num_retry to __u32
  ceph: make members in struct ceph_mds_request_args_ext a union
  rbd: use list_for_each_entry() helper
  libceph: do not include crypto/algapi.h
  ceph: switch ceph_lookup/atomic_open() to use new fscrypt helper
  ceph: fix updating i_truncate_pagecache_size for fscrypt
  ceph: wait for OSD requests' callbacks to finish when unmounting
  ceph: drop messages from MDS when unmounting
  ceph: update documentation regarding snapshot naming limitations
  ceph: prevent snapshot creation in encrypted locked directories
  ceph: add support for encrypted snapshot names
  ceph: invalidate pages when doing direct/sync writes
  ceph: plumb in decryption during reads
  ceph: add encryption support to writepage and writepages
  ceph: add read/modify/write to ceph_sync_write
  ceph: align data in pages in ceph_sync_write
  ceph: don't use special DIO path for encrypted inodes
  ceph: add truncate size handling support for fscrypt
  ceph: add object version support for sync read
  libceph: allow ceph_osdc_new_request to accept a multi-op read
  ...
Diffstat (limited to 'fs/ceph/addr.c')
-rw-r--r-- fs/ceph/addr.c 196
1 files changed, 155 insertions, 41 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 59cbfb80edbd..f4863078f7fe 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -18,6 +18,7 @@
#include "mds_client.h"
#include "cache.h"
#include "metric.h"
+#include "crypto.h"
#include <linux/ceph/osd_client.h>
#include <linux/ceph/striper.h>
@@ -242,11 +243,13 @@ static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
static void finish_netfs_read(struct ceph_osd_request *req)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
+ struct inode *inode = req->r_inode;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
struct netfs_io_subrequest *subreq = req->r_priv;
- int num_pages;
+ struct ceph_osd_req_op *op = &req->r_ops[0];
int err = req->r_result;
+ bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, osd_data->length, err);
@@ -260,14 +263,29 @@ static void finish_netfs_read(struct ceph_osd_request *req)
else if (err == -EBLOCKLISTED)
fsc->blocklisted = true;
- if (err >= 0 && err < subreq->len)
- __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ if (err >= 0) {
+ if (sparse && err > 0)
+ err = ceph_sparse_ext_map_end(op);
+ if (err < subreq->len)
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ if (IS_ENCRYPTED(inode) && err > 0) {
+ err = ceph_fscrypt_decrypt_extents(inode,
+ osd_data->pages, subreq->start,
+ op->extent.sparse_ext,
+ op->extent.sparse_ext_cnt);
+ if (err > subreq->len)
+ err = subreq->len;
+ }
+ }
+ if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
+ ceph_put_page_vector(osd_data->pages,
+ calc_pages_for(osd_data->alignment,
+ osd_data->length), false);
+ }
netfs_subreq_terminated(subreq, err, false);
-
- num_pages = calc_pages_for(osd_data->alignment, osd_data->length);
- ceph_put_page_vector(osd_data->pages, num_pages, false);
iput(req->r_inode);
+ ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
@@ -334,10 +352,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct ceph_osd_request *req = NULL;
struct ceph_vino vino = ceph_vino(inode);
struct iov_iter iter;
- struct page **pages;
- size_t page_off;
int err = 0;
u64 len = subreq->len;
+ bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
+ u64 off = subreq->start;
if (ceph_inode_is_shutdown(inode)) {
err = -EIO;
@@ -347,8 +365,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
return;
- req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
- 0, 1, CEPH_OSD_OP_READ,
+ ceph_fscrypt_adjust_off_and_len(inode, &off, &len);
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
+ off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
if (IS_ERR(req)) {
@@ -357,20 +377,48 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
goto out;
}
+ if (sparse) {
+ err = ceph_alloc_sparse_ext_map(&req->r_ops[0]);
+ if (err)
+ goto out;
+ }
+
dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
+
iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
- err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
- if (err < 0) {
- dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err);
- goto out;
- }
- /* should always give us a page-aligned read */
- WARN_ON_ONCE(page_off);
- len = err;
- err = 0;
+ /*
+ * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
+ * encrypted inodes. We'd need infrastructure that handles an iov_iter
+ * instead of page arrays, and we don't have that as of yet. Once the
+ * dust settles on the write helpers and encrypt/decrypt routines for
+ * netfs, we should be able to rework this.
+ */
+ if (IS_ENCRYPTED(inode)) {
+ struct page **pages;
+ size_t page_off;
+
+ err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
+ if (err < 0) {
+ dout("%s: iov_ter_get_pages_alloc returned %d\n",
+ __func__, err);
+ goto out;
+ }
+
+ /* should always give us a page-aligned read */
+ WARN_ON_ONCE(page_off);
+ len = err;
+ err = 0;
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,
+ false);
+ } else {
+ osd_req_op_extent_osd_iter(req, 0, &iter);
+ }
+ if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+ err = -EIO;
+ goto out;
+ }
req->r_callback = finish_netfs_read;
req->r_priv = subreq;
req->r_inode = inode;
@@ -571,10 +619,12 @@ static u64 get_writepages_data_length(struct inode *inode,
struct page *page, u64 start)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_snap_context *snapc = page_snap_context(page);
+ struct ceph_snap_context *snapc;
struct ceph_cap_snap *capsnap = NULL;
u64 end = i_size_read(inode);
+ u64 ret;
+ snapc = page_snap_context(ceph_fscrypt_pagecache_page(page));
if (snapc != ci->i_head_snapc) {
bool found = false;
spin_lock(&ci->i_ceph_lock);
@@ -589,9 +639,12 @@ static u64 get_writepages_data_length(struct inode *inode,
spin_unlock(&ci->i_ceph_lock);
WARN_ON(!found);
}
- if (end > page_offset(page) + thp_size(page))
- end = page_offset(page) + thp_size(page);
- return end > start ? end - start : 0;
+ if (end > ceph_fscrypt_page_offset(page) + thp_size(page))
+ end = ceph_fscrypt_page_offset(page) + thp_size(page);
+ ret = end > start ? end - start : 0;
+ if (ret && fscrypt_is_bounce_page(page))
+ ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE);
+ return ret;
}
/*
@@ -610,10 +663,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
loff_t page_off = page_offset(page);
int err;
loff_t len = thp_size(page);
+ loff_t wlen;
struct ceph_writeback_ctl ceph_wbc;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
bool caching = ceph_is_cache_enabled(inode);
+ struct page *bounce_page = NULL;
dout("writepage %p idx %lu\n", page, page->index);
@@ -649,31 +704,51 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (ceph_wbc.i_size < page_off + len)
len = ceph_wbc.i_size - page_off;
+ wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
- inode, page, page->index, page_off, len, snapc, snapc->seq);
+ inode, page, page->index, page_off, wlen, snapc, snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
fsc->write_congested = true;
- req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
- CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
- ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
- true);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
+ page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE, snapc,
+ ceph_wbc.truncate_seq,
+ ceph_wbc.truncate_size, true);
if (IS_ERR(req)) {
redirty_page_for_writepage(wbc, page);
return PTR_ERR(req);
}
+ if (wlen < len)
+ len = wlen;
+
set_page_writeback(page);
if (caching)
ceph_set_page_fscache(page);
ceph_fscache_write_to_cache(inode, page_off, len, caching);
+ if (IS_ENCRYPTED(inode)) {
+ bounce_page = fscrypt_encrypt_pagecache_blocks(page,
+ CEPH_FSCRYPT_BLOCK_SIZE, 0,
+ GFP_NOFS);
+ if (IS_ERR(bounce_page)) {
+ redirty_page_for_writepage(wbc, page);
+ end_page_writeback(page);
+ ceph_osdc_put_request(req);
+ return PTR_ERR(bounce_page);
+ }
+ }
+
/* it may be a short write due to an object boundary */
WARN_ON_ONCE(len > thp_size(page));
- osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
- dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
+ osd_req_op_extent_osd_data_pages(req, 0,
+ bounce_page ? &bounce_page : &page, wlen, 0,
+ false, false);
+ dout("writepage %llu~%llu (%llu bytes, %sencrypted)\n",
+ page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not ");
req->r_mtime = inode->i_mtime;
ceph_osdc_start_request(osdc, req);
@@ -681,7 +756,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
-
+ fscrypt_free_bounce_page(bounce_page);
ceph_osdc_put_request(req);
if (err == 0)
err = len;
@@ -800,6 +875,11 @@ static void writepages_finish(struct ceph_osd_request *req)
total_pages += num_pages;
for (j = 0; j < num_pages; j++) {
page = osd_data->pages[j];
+ if (fscrypt_is_bounce_page(page)) {
+ page = fscrypt_pagecache_page(page);
+ fscrypt_free_bounce_page(osd_data->pages[j]);
+ osd_data->pages[j] = page;
+ }
BUG_ON(!page);
WARN_ON(!PageUptodate(page));
@@ -835,6 +915,7 @@ static void writepages_finish(struct ceph_osd_request *req)
else
kfree(osd_data->pages);
ceph_osdc_put_request(req);
+ ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
/*
@@ -1070,9 +1151,28 @@ get_more_pages:
fsc->mount_options->congestion_kb))
fsc->write_congested = true;
- pages[locked_pages++] = page;
- fbatch.folios[i] = NULL;
+ if (IS_ENCRYPTED(inode)) {
+ pages[locked_pages] =
+ fscrypt_encrypt_pagecache_blocks(page,
+ PAGE_SIZE, 0,
+ locked_pages ? GFP_NOWAIT : GFP_NOFS);
+ if (IS_ERR(pages[locked_pages])) {
+ if (PTR_ERR(pages[locked_pages]) == -EINVAL)
+ pr_err("%s: inode->i_blkbits=%hhu\n",
+ __func__, inode->i_blkbits);
+ /* better not fail on first page! */
+ BUG_ON(locked_pages == 0);
+ pages[locked_pages] = NULL;
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ break;
+ }
+ ++locked_pages;
+ } else {
+ pages[locked_pages++] = page;
+ }
+ fbatch.folios[i] = NULL;
len += thp_size(page);
}
@@ -1100,7 +1200,7 @@ get_more_pages:
}
new_request:
- offset = page_offset(pages[0]);
+ offset = ceph_fscrypt_page_offset(pages[0]);
len = wsize;
req = ceph_osdc_new_request(&fsc->client->osdc,
@@ -1121,9 +1221,13 @@ new_request:
ceph_wbc.truncate_size, true);
BUG_ON(IS_ERR(req));
}
- BUG_ON(len < page_offset(pages[locked_pages - 1]) +
- thp_size(page) - offset);
+ BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) +
+ thp_size(pages[locked_pages - 1]) - offset);
+ if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+ rc = -EIO;
+ goto release_folios;
+ }
req->r_callback = writepages_finish;
req->r_inode = inode;
@@ -1132,7 +1236,9 @@ new_request:
data_pages = pages;
op_idx = 0;
for (i = 0; i < locked_pages; i++) {
- u64 cur_offset = page_offset(pages[i]);
+ struct page *page = ceph_fscrypt_pagecache_page(pages[i]);
+
+ u64 cur_offset = page_offset(page);
/*
* Discontinuity in page range? Ceph can handle that by just passing
* multiple extents in the write op.
@@ -1161,9 +1267,9 @@ new_request:
op_idx++;
}
- set_page_writeback(pages[i]);
+ set_page_writeback(page);
if (caching)
- ceph_set_page_fscache(pages[i]);
+ ceph_set_page_fscache(page);
len += thp_size(page);
}
ceph_fscache_write_to_cache(inode, offset, len, caching);
@@ -1179,8 +1285,16 @@ new_request:
offset);
len = max(len, min_len);
}
+ if (IS_ENCRYPTED(inode))
+ len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
+
dout("writepages got pages at %llu~%llu\n", offset, len);
+ if (IS_ENCRYPTED(inode) &&
+ ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK))
+ pr_warn("%s: bad encrypted write offset=%lld len=%llu\n",
+ __func__, offset, len);
+
osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
0, from_pool, false);
osd_req_op_extent_update(req, op_idx, len);