diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-09-06 22:10:15 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-09-06 22:10:15 +0300 |
commit | 7ba2090ca64ea1aa435744884124387db1fac70f (patch) | |
tree | ed4ea24f4cfed5f28b9c8cdf99dbdf7df6a221ae /fs/ceph/addr.c | |
parent | 744a759492b5c57ff24a6e8aabe47b17ad8ee964 (diff) | |
parent | ce0d5bd3a6c176f9a3bf867624a07119dd4d0878 (diff) | |
download | linux-7ba2090ca64ea1aa435744884124387db1fac70f.tar.xz |
Merge tag 'ceph-for-6.6-rc1' of https://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:
"Mixed with some fixes and cleanups, this brings in reasonably complete
fscrypt support to CephFS! The list of things which don't work with
encryption should be fairly short, mostly around the edges: fallocate
(not supported well in CephFS to begin with), copy_file_range
(requires re-encryption), non-default striping patterns.
This was a multi-year effort principally by Jeff Layton with
assistance from Xiubo Li, Luís Henriques and others, including several
dependant changes in the MDS, netfs helper library and fscrypt
framework itself"
* tag 'ceph-for-6.6-rc1' of https://github.com/ceph/ceph-client: (53 commits)
ceph: make num_fwd and num_retry to __u32
ceph: make members in struct ceph_mds_request_args_ext a union
rbd: use list_for_each_entry() helper
libceph: do not include crypto/algapi.h
ceph: switch ceph_lookup/atomic_open() to use new fscrypt helper
ceph: fix updating i_truncate_pagecache_size for fscrypt
ceph: wait for OSD requests' callbacks to finish when unmounting
ceph: drop messages from MDS when unmounting
ceph: update documentation regarding snapshot naming limitations
ceph: prevent snapshot creation in encrypted locked directories
ceph: add support for encrypted snapshot names
ceph: invalidate pages when doing direct/sync writes
ceph: plumb in decryption during reads
ceph: add encryption support to writepage and writepages
ceph: add read/modify/write to ceph_sync_write
ceph: align data in pages in ceph_sync_write
ceph: don't use special DIO path for encrypted inodes
ceph: add truncate size handling support for fscrypt
ceph: add object version support for sync read
libceph: allow ceph_osdc_new_request to accept a multi-op read
...
Diffstat (limited to 'fs/ceph/addr.c')
-rw-r--r-- | fs/ceph/addr.c | 196 |
1 files changed, 155 insertions, 41 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 59cbfb80edbd..f4863078f7fe 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -18,6 +18,7 @@ #include "mds_client.h" #include "cache.h" #include "metric.h" +#include "crypto.h" #include <linux/ceph/osd_client.h> #include <linux/ceph/striper.h> @@ -242,11 +243,13 @@ static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) static void finish_netfs_read(struct ceph_osd_request *req) { - struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode); + struct inode *inode = req->r_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); struct netfs_io_subrequest *subreq = req->r_priv; - int num_pages; + struct ceph_osd_req_op *op = &req->r_ops[0]; int err = req->r_result; + bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ); ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, osd_data->length, err); @@ -260,14 +263,29 @@ static void finish_netfs_read(struct ceph_osd_request *req) else if (err == -EBLOCKLISTED) fsc->blocklisted = true; - if (err >= 0 && err < subreq->len) - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (err >= 0) { + if (sparse && err > 0) + err = ceph_sparse_ext_map_end(op); + if (err < subreq->len) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (IS_ENCRYPTED(inode) && err > 0) { + err = ceph_fscrypt_decrypt_extents(inode, + osd_data->pages, subreq->start, + op->extent.sparse_ext, + op->extent.sparse_ext_cnt); + if (err > subreq->len) + err = subreq->len; + } + } + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + ceph_put_page_vector(osd_data->pages, + calc_pages_for(osd_data->alignment, + osd_data->length), false); + } netfs_subreq_terminated(subreq, err, false); - - num_pages = calc_pages_for(osd_data->alignment, osd_data->length); - ceph_put_page_vector(osd_data->pages, num_pages, false); iput(req->r_inode); + ceph_dec_osd_stopping_blocker(fsc->mdsc); } static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) @@ -334,10 +352,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct ceph_osd_request *req = NULL; struct ceph_vino vino = ceph_vino(inode); struct iov_iter iter; - struct page **pages; - size_t page_off; int err = 0; u64 len = subreq->len; + bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); + u64 off = subreq->start; if (ceph_inode_is_shutdown(inode)) { err = -EIO; @@ -347,8 +365,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, - 0, 1, CEPH_OSD_OP_READ, + ceph_fscrypt_adjust_off_and_len(inode, &off, &len); + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, + off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -357,20 +377,48 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) goto out; } + if (sparse) { + err = ceph_alloc_sparse_ext_map(&req->r_ops[0]); + if (err) + goto out; + } + dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); - err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); - if (err < 0) { - dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err); - goto out; - } - /* should always give us a page-aligned read */ - WARN_ON_ONCE(page_off); - len = err; - err = 0; + /* + * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for + * encrypted inodes. We'd need infrastructure that handles an iov_iter + * instead of page arrays, and we don't have that as of yet. Once the + * dust settles on the write helpers and encrypt/decrypt routines for + * netfs, we should be able to rework this. + */ + if (IS_ENCRYPTED(inode)) { + struct page **pages; + size_t page_off; + + err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); + if (err < 0) { + dout("%s: iov_ter_get_pages_alloc returned %d\n", + __func__, err); + goto out; + } + + /* should always give us a page-aligned read */ + WARN_ON_ONCE(page_off); + len = err; + err = 0; - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, + false); + } else { + osd_req_op_extent_osd_iter(req, 0, &iter); + } + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + err = -EIO; + goto out; + } req->r_callback = finish_netfs_read; req->r_priv = subreq; req->r_inode = inode; @@ -571,10 +619,12 @@ static u64 get_writepages_data_length(struct inode *inode, struct page *page, u64 start) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_snap_context *snapc = page_snap_context(page); + struct ceph_snap_context *snapc; struct ceph_cap_snap *capsnap = NULL; u64 end = i_size_read(inode); + u64 ret; + snapc = page_snap_context(ceph_fscrypt_pagecache_page(page)); if (snapc != ci->i_head_snapc) { bool found = false; spin_lock(&ci->i_ceph_lock); @@ -589,9 +639,12 @@ static u64 get_writepages_data_length(struct inode *inode, spin_unlock(&ci->i_ceph_lock); WARN_ON(!found); } - if (end > page_offset(page) + thp_size(page)) - end = page_offset(page) + thp_size(page); - return end > start ? end - start : 0; + if (end > ceph_fscrypt_page_offset(page) + thp_size(page)) + end = ceph_fscrypt_page_offset(page) + thp_size(page); + ret = end > start ? end - start : 0; + if (ret && fscrypt_is_bounce_page(page)) + ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE); + return ret; } /* @@ -610,10 +663,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) loff_t page_off = page_offset(page); int err; loff_t len = thp_size(page); + loff_t wlen; struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; bool caching = ceph_is_cache_enabled(inode); + struct page *bounce_page = NULL; dout("writepage %p idx %lu\n", page, page->index); @@ -649,31 +704,51 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (ceph_wbc.i_size < page_off + len) len = ceph_wbc.i_size - page_off; + wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len; dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n", - inode, page, page->index, page_off, len, snapc, snapc->seq); + inode, page, page->index, page_off, wlen, snapc, snapc->seq); if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) fsc->write_congested = true; - req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, - CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, - ceph_wbc.truncate_seq, ceph_wbc.truncate_size, - true); + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), + page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, snapc, + ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, true); if (IS_ERR(req)) { redirty_page_for_writepage(wbc, page); return PTR_ERR(req); } + if (wlen < len) + len = wlen; + set_page_writeback(page); if (caching) ceph_set_page_fscache(page); ceph_fscache_write_to_cache(inode, page_off, len, caching); + if (IS_ENCRYPTED(inode)) { + bounce_page = fscrypt_encrypt_pagecache_blocks(page, + CEPH_FSCRYPT_BLOCK_SIZE, 0, + GFP_NOFS); + if (IS_ERR(bounce_page)) { + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + ceph_osdc_put_request(req); + return PTR_ERR(bounce_page); + } + } + /* it may be a short write due to an object boundary */ WARN_ON_ONCE(len > thp_size(page)); - osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); - dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); + osd_req_op_extent_osd_data_pages(req, 0, + bounce_page ? &bounce_page : &page, wlen, 0, + false, false); + dout("writepage %llu~%llu (%llu bytes, %sencrypted)\n", + page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not "); req->r_mtime = inode->i_mtime; ceph_osdc_start_request(osdc, req); @@ -681,7 +756,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); - + fscrypt_free_bounce_page(bounce_page); ceph_osdc_put_request(req); if (err == 0) err = len; @@ -800,6 +875,11 @@ static void writepages_finish(struct ceph_osd_request *req) total_pages += num_pages; for (j = 0; j < num_pages; j++) { page = osd_data->pages[j]; + if (fscrypt_is_bounce_page(page)) { + page = fscrypt_pagecache_page(page); + fscrypt_free_bounce_page(osd_data->pages[j]); + osd_data->pages[j] = page; + } BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -835,6 +915,7 @@ static void writepages_finish(struct ceph_osd_request *req) else kfree(osd_data->pages); ceph_osdc_put_request(req); + ceph_dec_osd_stopping_blocker(fsc->mdsc); } /* @@ -1070,9 +1151,28 @@ get_more_pages: fsc->mount_options->congestion_kb)) fsc->write_congested = true; - pages[locked_pages++] = page; - fbatch.folios[i] = NULL; + if (IS_ENCRYPTED(inode)) { + pages[locked_pages] = + fscrypt_encrypt_pagecache_blocks(page, + PAGE_SIZE, 0, + locked_pages ? GFP_NOWAIT : GFP_NOFS); + if (IS_ERR(pages[locked_pages])) { + if (PTR_ERR(pages[locked_pages]) == -EINVAL) + pr_err("%s: inode->i_blkbits=%hhu\n", + __func__, inode->i_blkbits); + /* better not fail on first page! */ + BUG_ON(locked_pages == 0); + pages[locked_pages] = NULL; + redirty_page_for_writepage(wbc, page); + unlock_page(page); + break; + } + ++locked_pages; + } else { + pages[locked_pages++] = page; + } + fbatch.folios[i] = NULL; len += thp_size(page); } @@ -1100,7 +1200,7 @@ get_more_pages: } new_request: - offset = page_offset(pages[0]); + offset = ceph_fscrypt_page_offset(pages[0]); len = wsize; req = ceph_osdc_new_request(&fsc->client->osdc, @@ -1121,9 +1221,13 @@ new_request: ceph_wbc.truncate_size, true); BUG_ON(IS_ERR(req)); } - BUG_ON(len < page_offset(pages[locked_pages - 1]) + - thp_size(page) - offset); + BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) + + thp_size(pages[locked_pages - 1]) - offset); + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + rc = -EIO; + goto release_folios; + } req->r_callback = writepages_finish; req->r_inode = inode; @@ -1132,7 +1236,9 @@ new_request: data_pages = pages; op_idx = 0; for (i = 0; i < locked_pages; i++) { - u64 cur_offset = page_offset(pages[i]); + struct page *page = ceph_fscrypt_pagecache_page(pages[i]); + + u64 cur_offset = page_offset(page); /* * Discontinuity in page range? Ceph can handle that by just passing * multiple extents in the write op. @@ -1161,9 +1267,9 @@ new_request: op_idx++; } - set_page_writeback(pages[i]); + set_page_writeback(page); if (caching) - ceph_set_page_fscache(pages[i]); + ceph_set_page_fscache(page); len += thp_size(page); } ceph_fscache_write_to_cache(inode, offset, len, caching); @@ -1179,8 +1285,16 @@ new_request: offset); len = max(len, min_len); } + if (IS_ENCRYPTED(inode)) + len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); + dout("writepages got pages at %llu~%llu\n", offset, len); + if (IS_ENCRYPTED(inode) && + ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) + pr_warn("%s: bad encrypted write offset=%lld len=%llu\n", + __func__, offset, len); + osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 0, from_pool, false); osd_req_op_extent_update(req, op_idx, len); |