diff options
author | Jakub Kicinski <kuba@kernel.org> | 2024-01-26 01:00:54 +0300 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2024-01-26 01:20:08 +0300 |
commit | 06f609b3119876b42f19fdb690b10896d3c648e6 (patch) | |
tree | 2f42b892152af80d299133ef6c902261a1fd0b98 /fs | |
parent | 91374ba537bd60caa9ae052c9f1c0fe055b39149 (diff) | |
parent | ecb1b8288dc7ccbdcb3b9df005fa1c0e0c0388a7 (diff) | |
download | linux-06f609b3119876b42f19fdb690b10896d3c648e6.tar.xz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR.
No conflicts or adjacent changes.
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'fs')
194 files changed, 6181 insertions, 4060 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 731e3d14b67d..0e8418066a48 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -42,6 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_free_inode(struct inode *inode); struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev); +void v9fs_set_netfs_context(struct inode *inode); int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t rdev); void v9fs_evict_inode(struct inode *inode); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 8a635999a7d6..047855033d32 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -19,12 +19,45 @@ #include <linux/netfs.h> #include <net/9p/9p.h> #include <net/9p/client.h> +#include <trace/events/netfs.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "cache.h" #include "fid.h" +static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq) +{ + struct p9_fid *fid = subreq->rreq->netfs_priv; + int err, len; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err); + netfs_write_subrequest_terminated(subreq, len ?: err, false); +} + +static void v9fs_upload_to_server_worker(struct work_struct *work) +{ + struct netfs_io_subrequest *subreq = + container_of(work, struct netfs_io_subrequest, work); + + v9fs_upload_to_server(subreq); +} + +/* + * Set up write requests for a writeback slice. We need to add a write request + * for each write we want to make. + */ +static void v9fs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len) +{ + struct netfs_io_subrequest *subreq; + + subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER, + start, len, v9fs_upload_to_server_worker); + if (subreq) + netfs_queue_write_request(subreq); +} + /** * v9fs_issue_read - Issue a read from 9P * @subreq: The read to make @@ -33,14 +66,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct p9_fid *fid = rreq->netfs_priv; - struct iov_iter to; - loff_t pos = subreq->start + subreq->transferred; - size_t len = subreq->len - subreq->transferred; int total, err; - iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len); - - total = p9_client_read(fid, pos, &to, &err); + total = p9_client_read(fid, subreq->start + subreq->transferred, + &subreq->io_iter, &err); /* if we just extended the file size, any portion not in * cache won't be on server and is zeroes */ @@ -50,25 +79,42 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) } /** - * v9fs_init_request - Initialise a read request + * v9fs_init_request - Initialise a request * @rreq: The read request * @file: The file being read from */ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { - struct p9_fid *fid = file->private_data; - - BUG_ON(!fid); + struct p9_fid *fid; + bool writing = (rreq->origin == NETFS_READ_FOR_WRITE || + rreq->origin == NETFS_WRITEBACK || + rreq->origin == NETFS_WRITETHROUGH || + rreq->origin == NETFS_LAUNDER_WRITE || + rreq->origin == NETFS_UNBUFFERED_WRITE || + rreq->origin == NETFS_DIO_WRITE); + + if (file) { + fid = file->private_data; + if (!fid) + goto no_fid; + p9_fid_get(fid); + } else { + fid = v9fs_fid_find_inode(rreq->inode, writing, INVALID_UID, true); + if (!fid) + goto no_fid; + } /* we might need to read from a fid that was opened write-only * for read-modify-write of page cache, use the writeback fid * for that */ - WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && - !(fid->mode & P9_ORDWR)); - - p9_fid_get(fid); + WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && !(fid->mode & P9_ORDWR)); rreq->netfs_priv = fid; return 0; + +no_fid: + WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n", + rreq->inode->i_ino); + return -EINVAL; } /** @@ -82,281 +128,20 @@ static void v9fs_free_request(struct netfs_io_request *rreq) p9_fid_put(fid); } -/** - * v9fs_begin_cache_operation - Begin a cache operation for a read - * @rreq: The read request - */ -static int v9fs_begin_cache_operation(struct netfs_io_request *rreq) -{ -#ifdef CONFIG_9P_FSCACHE - struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode)); - - return fscache_begin_read_operation(&rreq->cache_resources, cookie); -#else - return -ENOBUFS; -#endif -} - const struct netfs_request_ops v9fs_req_ops = { .init_request = v9fs_init_request, .free_request = v9fs_free_request, - .begin_cache_operation = v9fs_begin_cache_operation, .issue_read = v9fs_issue_read, + .create_write_requests = v9fs_create_write_requests, }; -/** - * v9fs_release_folio - release the private state associated with a folio - * @folio: The folio to be released - * @gfp: The caller's allocation restrictions - * - * Returns true if the page can be released, false otherwise. - */ - -static bool v9fs_release_folio(struct folio *folio, gfp_t gfp) -{ - if (folio_test_private(folio)) - return false; -#ifdef CONFIG_9P_FSCACHE - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - fscache_note_page_release(v9fs_inode_cookie(V9FS_I(folio_inode(folio)))); -#endif - return true; -} - -static void v9fs_invalidate_folio(struct folio *folio, size_t offset, - size_t length) -{ - folio_wait_fscache(folio); -} - -#ifdef CONFIG_9P_FSCACHE -static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct v9fs_inode *v9inode = priv; - __le32 version; - - if (IS_ERR_VALUE(transferred_or_error) && - transferred_or_error != -ENOBUFS) { - version = cpu_to_le32(v9inode->qid.version); - fscache_invalidate(v9fs_inode_cookie(v9inode), &version, - i_size_read(&v9inode->netfs.inode), 0); - } -} -#endif - -static int v9fs_vfs_write_folio_locked(struct folio *folio) -{ - struct inode *inode = folio_inode(folio); - loff_t start = folio_pos(folio); - loff_t i_size = i_size_read(inode); - struct iov_iter from; - size_t len = folio_size(folio); - struct p9_fid *writeback_fid; - int err; - struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode); - struct fscache_cookie __maybe_unused *cookie = v9fs_inode_cookie(v9inode); - - if (start >= i_size) - return 0; /* Simultaneous truncation occurred */ - - len = min_t(loff_t, i_size - start, len); - - iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len); - - writeback_fid = v9fs_fid_find_inode(inode, true, INVALID_UID, true); - if (!writeback_fid) { - WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n", - inode->i_private); - return -EINVAL; - } - - folio_wait_fscache(folio); - folio_start_writeback(folio); - - p9_client_write(writeback_fid, start, &from, &err); - -#ifdef CONFIG_9P_FSCACHE - if (err == 0 && - fscache_cookie_enabled(cookie) && - test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) { - folio_start_fscache(folio); - fscache_write_to_cache(v9fs_inode_cookie(v9inode), - folio_mapping(folio), start, len, i_size, - v9fs_write_to_cache_done, v9inode, - true); - } -#endif - - folio_end_writeback(folio); - p9_fid_put(writeback_fid); - - return err; -} - -static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - int retval; - - p9_debug(P9_DEBUG_VFS, "folio %p\n", folio); - - retval = v9fs_vfs_write_folio_locked(folio); - if (retval < 0) { - if (retval == -EAGAIN) { - folio_redirty_for_writepage(wbc, folio); - retval = 0; - } else { - mapping_set_error(folio_mapping(folio), retval); - } - } else - retval = 0; - - folio_unlock(folio); - return retval; -} - -static int v9fs_launder_folio(struct folio *folio) -{ - int retval; - - if (folio_clear_dirty_for_io(folio)) { - retval = v9fs_vfs_write_folio_locked(folio); - if (retval) - return retval; - } - folio_wait_fscache(folio); - return 0; -} - -/** - * v9fs_direct_IO - 9P address space operation for direct I/O - * @iocb: target I/O control block - * @iter: The data/buffer to use - * - * The presence of v9fs_direct_IO() in the address space ops vector - * allowes open() O_DIRECT flags which would have failed otherwise. - * - * In the non-cached mode, we shunt off direct read and write requests before - * the VFS gets them, so this method should never be called. - * - * Direct IO is not 'yet' supported in the cached mode. Hence when - * this routine is called through generic_file_aio_read(), the read/write fails - * with an error. - * - */ -static ssize_t -v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - loff_t pos = iocb->ki_pos; - ssize_t n; - int err = 0; - - if (iov_iter_rw(iter) == WRITE) { - n = p9_client_write(file->private_data, pos, iter, &err); - if (n) { - struct inode *inode = file_inode(file); - loff_t i_size = i_size_read(inode); - - if (pos + n > i_size) - inode_add_bytes(inode, pos + n - i_size); - } - } else { - n = p9_client_read(file->private_data, pos, iter, &err); - } - return n ? n : err; -} - -static int v9fs_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned int len, - struct page **subpagep, void **fsdata) -{ - int retval; - struct folio *folio; - struct v9fs_inode *v9inode = V9FS_I(mapping->host); - - p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - - /* Prefetch area to be written into the cache if we're caching this - * file. We need to do this before we get a lock on the page in case - * there's more than one writer competing for the same cache block. - */ - retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata); - if (retval < 0) - return retval; - - *subpagep = &folio->page; - return retval; -} - -static int v9fs_write_end(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int copied, - struct page *subpage, void *fsdata) -{ - loff_t last_pos = pos + copied; - struct folio *folio = page_folio(subpage); - struct inode *inode = mapping->host; - - p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - - if (!folio_test_uptodate(folio)) { - if (unlikely(copied < len)) { - copied = 0; - goto out; - } - - folio_mark_uptodate(folio); - } - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold the i_mutex. - */ - if (last_pos > inode->i_size) { - inode_add_bytes(inode, last_pos - inode->i_size); - i_size_write(inode, last_pos); -#ifdef CONFIG_9P_FSCACHE - fscache_update_cookie(v9fs_inode_cookie(V9FS_I(inode)), NULL, - &last_pos); -#endif - } - folio_mark_dirty(folio); -out: - folio_unlock(folio); - folio_put(folio); - - return copied; -} - -#ifdef CONFIG_9P_FSCACHE -/* - * Mark a page as having been made dirty and thus needing writeback. We also - * need to pin the cache object to write back to. - */ -static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - struct v9fs_inode *v9inode = V9FS_I(mapping->host); - - return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode)); -} -#else -#define v9fs_dirty_folio filemap_dirty_folio -#endif - const struct address_space_operations v9fs_addr_operations = { - .read_folio = netfs_read_folio, - .readahead = netfs_readahead, - .dirty_folio = v9fs_dirty_folio, - .writepage = v9fs_vfs_writepage, - .write_begin = v9fs_write_begin, - .write_end = v9fs_write_end, - .release_folio = v9fs_release_folio, - .invalidate_folio = v9fs_invalidate_folio, - .launder_folio = v9fs_launder_folio, - .direct_IO = v9fs_direct_IO, + .read_folio = netfs_read_folio, + .readahead = netfs_readahead, + .dirty_folio = netfs_dirty_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, + .launder_folio = netfs_launder_folio, + .direct_IO = noop_direct_IO, + .writepages = netfs_writepages, }; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 11cd8d23f6f2..bae330c2f0cf 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -353,25 +353,15 @@ static ssize_t v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct p9_fid *fid = iocb->ki_filp->private_data; - int ret, err = 0; p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n", fid->fid, iov_iter_count(to), iocb->ki_pos); - if (!(fid->mode & P9L_DIRECT)) { - p9_debug(P9_DEBUG_VFS, "(cached)\n"); - return generic_file_read_iter(iocb, to); - } - - if (iocb->ki_filp->f_flags & O_NONBLOCK) - ret = p9_client_read_once(fid, iocb->ki_pos, to, &err); - else - ret = p9_client_read(fid, iocb->ki_pos, to, &err); - if (!ret) - return err; + if (fid->mode & P9L_DIRECT) + return netfs_unbuffered_read_iter(iocb, to); - iocb->ki_pos += ret; - return ret; + p9_debug(P9_DEBUG_VFS, "(cached)\n"); + return netfs_file_read_iter(iocb, to); } /* @@ -407,46 +397,14 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct p9_fid *fid = file->private_data; - ssize_t retval; - loff_t origin; - int err = 0; p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid); - if (!(fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))) { - p9_debug(P9_DEBUG_CACHE, "(cached)\n"); - return generic_file_write_iter(iocb, from); - } + if (fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE)) + return netfs_unbuffered_write_iter(iocb, from); - retval = generic_write_checks(iocb, from); - if (retval <= 0) - return retval; - - origin = iocb->ki_pos; - retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err); - if (retval > 0) { - struct inode *inode = file_inode(file); - loff_t i_size; - unsigned long pg_start, pg_end; - - pg_start = origin >> PAGE_SHIFT; - pg_end = (origin + retval - 1) >> PAGE_SHIFT; - if (inode->i_mapping && inode->i_mapping->nrpages) - invalidate_inode_pages2_range(inode->i_mapping, - pg_start, pg_end); - iocb->ki_pos += retval; - i_size = i_size_read(inode); - if (iocb->ki_pos > i_size) { - inode_add_bytes(inode, iocb->ki_pos - i_size); - /* - * Need to serialize against i_size_write() in - * v9fs_stat2inode() - */ - v9fs_i_size_write(inode, iocb->ki_pos); - } - return retval; - } - return err; + p9_debug(P9_DEBUG_CACHE, "(cached)\n"); + return netfs_file_write_iter(iocb, from); } static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, @@ -519,36 +477,7 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) static vm_fault_t v9fs_vm_page_mkwrite(struct vm_fault *vmf) { - struct folio *folio = page_folio(vmf->page); - struct file *filp = vmf->vma->vm_file; - struct inode *inode = file_inode(filp); - - - p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n", - folio, (unsigned long)filp->private_data); - - /* Wait for the page to be written to the cache before we allow it to - * be modified. We then assume the entire page will need writing back. - */ -#ifdef CONFIG_9P_FSCACHE - if (folio_test_fscache(folio) && - folio_wait_fscache_killable(folio) < 0) - return VM_FAULT_NOPAGE; -#endif - - /* Update file times before taking page lock */ - file_update_time(filp); - - if (folio_lock_killable(folio) < 0) - return VM_FAULT_RETRY; - if (folio_mapping(folio) != inode->i_mapping) - goto out_unlock; - folio_wait_stable(folio); - - return VM_FAULT_LOCKED; -out_unlock: - folio_unlock(folio); - return VM_FAULT_NOPAGE; + return netfs_page_mkwrite(vmf, NULL); } static void v9fs_mmap_vm_close(struct vm_area_struct *vma) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b845ee18a80b..32572982f72e 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -246,10 +246,10 @@ void v9fs_free_inode(struct inode *inode) /* * Set parameters for the netfs library */ -static void v9fs_set_netfs_context(struct inode *inode) +void v9fs_set_netfs_context(struct inode *inode) { struct v9fs_inode *v9inode = V9FS_I(inode); - netfs_inode_init(&v9inode->netfs, &v9fs_req_ops); + netfs_inode_init(&v9inode->netfs, &v9fs_req_ops, true); } int v9fs_init_inode(struct v9fs_session_info *v9ses, @@ -326,8 +326,6 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, err = -EINVAL; goto error; } - - v9fs_set_netfs_context(inode); error: return err; @@ -359,6 +357,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) iput(inode); return ERR_PTR(err); } + v9fs_set_netfs_context(inode); return inode; } @@ -374,11 +373,8 @@ void v9fs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); -#ifdef CONFIG_9P_FSCACHE version = cpu_to_le32(v9inode->qid.version); - fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode, - &version); -#endif + netfs_clear_inode_writeback(inode, &version); clear_inode(inode); filemap_fdatawrite(&inode->i_data); @@ -464,6 +460,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, goto error; v9fs_stat2inode(st, inode, sb, 0); + v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); unlock_new_inode(inode); return inode; @@ -1113,7 +1110,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap, if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); - truncate_pagecache(inode, iattr->ia_size); + netfs_resize_file(netfs_inode(inode), iattr->ia_size, true); #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) { @@ -1181,6 +1178,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; + v9inode->netfs.remote_i_size = stat->length; if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) v9fs_i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index c7319af2f471..3505227e1704 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -128,6 +128,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, goto error; v9fs_stat2inode_dotl(st, inode, 0); + v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); if (retval) @@ -598,7 +599,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); - truncate_pagecache(inode, iattr->ia_size); + netfs_resize_file(netfs_inode(inode), iattr->ia_size, true); #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) @@ -655,6 +656,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; + v9inode->netfs.remote_i_size = stat->st_size; if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) v9fs_i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; @@ -683,8 +685,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, inode->i_mode = mode; } if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && - stat->st_result_mask & P9_STATS_SIZE) + stat->st_result_mask & P9_STATS_SIZE) { + v9inode->netfs.remote_i_size = stat->st_size; v9fs_i_size_write(inode, stat->st_size); + } if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 73db55c050bf..941f7d0e0bfa 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -289,31 +289,21 @@ static int v9fs_drop_inode(struct inode *inode) static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct v9fs_inode *v9inode; - /* * send an fsync request to server irrespective of * wbc->sync_mode. */ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); - - v9inode = V9FS_I(inode); - fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); - - return 0; + return netfs_unpin_writeback(inode, wbc); } static int v9fs_write_inode_dotl(struct inode *inode, struct writeback_control *wbc) { - struct v9fs_inode *v9inode; - v9inode = V9FS_I(inode); p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); - fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); - - return 0; + return netfs_unpin_writeback(inode, wbc); } static const struct super_operations v9fs_super_ops = { diff --git a/fs/Kconfig b/fs/Kconfig index a3159831ba98..89fdbefd1075 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -144,7 +144,6 @@ source "fs/overlayfs/Kconfig" menu "Caches" source "fs/netfs/Kconfig" -source "fs/fscache/Kconfig" source "fs/cachefiles/Kconfig" endmenu diff --git a/fs/Makefile b/fs/Makefile index a6962c588962..c09016257f05 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -61,7 +61,6 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_NETFS_SUPPORT) += netfs/ -obj-$(CONFIG_FSCACHE) += fscache/ obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT4_FS) += ext4/ # We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the diff --git a/fs/afs/dir.c b/fs/afs/dir.c index c14533ef108f..b5b8de521f99 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -124,7 +124,7 @@ static void afs_dir_read_cleanup(struct afs_read *req) if (xas_retry(&xas, folio)) continue; BUG_ON(xa_is_value(folio)); - ASSERTCMP(folio_file_mapping(folio), ==, mapping); + ASSERTCMP(folio->mapping, ==, mapping); folio_put(folio); } @@ -202,12 +202,12 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) if (xas_retry(&xas, folio)) continue; - BUG_ON(folio_file_mapping(folio) != mapping); + BUG_ON(folio->mapping != mapping); size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio)); for (offset = 0; offset < size; offset += sizeof(*block)) { block = kmap_local_folio(folio, offset); - pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block); + pr_warn("[%02lx] %32phN\n", folio->index + offset, block); kunmap_local(block); } } @@ -233,7 +233,7 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) if (xas_retry(&xas, folio)) continue; - BUG_ON(folio_file_mapping(folio) != mapping); + BUG_ON(folio->mapping != mapping); if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) { afs_dir_dump(dvnode, req); @@ -474,6 +474,14 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, continue; } + /* Don't expose silly rename entries to userspace. */ + if (nlen > 6 && + dire->u.name[0] == '.' && + ctx->actor != afs_lookup_filldir && + ctx->actor != afs_lookup_one_filldir && + memcmp(dire->u.name, ".__afs", 6) == 0) + continue; + /* found the next entry */ if (!dir_emit(ctx, dire->u.name, nlen, ntohl(dire->u.vnode), @@ -708,6 +716,8 @@ static void afs_do_lookup_success(struct afs_operation *op) break; } + if (vp->scb.status.abort_code) + trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code); if (!vp->scb.have_status && !vp->scb.have_error) continue; @@ -897,12 +907,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, afs_begin_vnode_operation(op); afs_wait_for_operation(op); } - inode = ERR_PTR(afs_op_error(op)); out_op: if (!afs_op_error(op)) { - inode = &op->file[1].vnode->netfs.inode; - op->file[1].vnode = NULL; + if (op->file[1].scb.status.abort_code) { + afs_op_accumulate_error(op, -ECONNABORTED, + op->file[1].scb.status.abort_code); + } else { + inode = &op->file[1].vnode->netfs.inode; + op->file[1].vnode = NULL; + } } if (op->file[0].scb.have_status) @@ -2022,7 +2036,7 @@ static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags) { struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio)); + _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index); folio_detach_private(folio); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 2cd40ba601f1..c4d2711e20ad 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) /* there shouldn't be an existing inode */ BUG_ON(!(inode->i_state & I_NEW)); - netfs_inode_init(&vnode->netfs, NULL); + netfs_inode_init(&vnode->netfs, NULL, false); inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { @@ -258,16 +258,7 @@ const struct inode_operations afs_dynroot_inode_operations = { .lookup = afs_dynroot_lookup, }; -/* - * Dirs in the dynamic root don't need revalidation. - */ -static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags) -{ - return 1; -} - const struct dentry_operations afs_dynroot_dentry_operations = { - .d_revalidate = afs_dynroot_d_revalidate, .d_delete = always_delete_dentry, .d_release = afs_d_release, .d_automount = afs_d_automount, diff --git a/fs/afs/file.c b/fs/afs/file.c index 30914e0d9cb2..3d33b221d9ca 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -20,9 +20,6 @@ static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); static int afs_symlink_read_folio(struct file *file, struct folio *folio); -static void afs_invalidate_folio(struct folio *folio, size_t offset, - size_t length); -static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, @@ -37,7 +34,7 @@ const struct file_operations afs_file_operations = { .release = afs_release, .llseek = generic_file_llseek, .read_iter = afs_file_read_iter, - .write_iter = afs_file_write, + .write_iter = netfs_file_write_iter, .mmap = afs_file_mmap, .splice_read = afs_file_splice_read, .splice_write = iter_file_splice_write, @@ -53,22 +50,21 @@ const struct inode_operations afs_file_inode_operations = { }; const struct address_space_operations afs_file_aops = { + .direct_IO = noop_direct_IO, .read_folio = netfs_read_folio, .readahead = netfs_readahead, - .dirty_folio = afs_dirty_folio, - .launder_folio = afs_launder_folio, - .release_folio = afs_release_folio, - .invalidate_folio = afs_invalidate_folio, - .write_begin = afs_write_begin, - .write_end = afs_write_end, - .writepages = afs_writepages, + .dirty_folio = netfs_dirty_folio, + .launder_folio = netfs_launder_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, .migrate_folio = filemap_migrate_folio, + .writepages = afs_writepages, }; const struct address_space_operations afs_symlink_aops = { .read_folio = afs_symlink_read_folio, - .release_folio = afs_release_folio, - .invalidate_folio = afs_invalidate_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, .migrate_folio = filemap_migrate_folio, }; @@ -323,11 +319,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) fsreq->len = subreq->len - subreq->transferred; fsreq->key = key_get(subreq->rreq->netfs_priv); fsreq->vnode = vnode; - fsreq->iter = &fsreq->def_iter; - - iov_iter_xarray(&fsreq->def_iter, ITER_DEST, - &fsreq->vnode->netfs.inode.i_mapping->i_pages, - fsreq->pos, fsreq->len); + fsreq->iter = &subreq->io_iter; afs_fetch_data(fsreq->vnode, fsreq); afs_put_read(fsreq); @@ -359,22 +351,13 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio) static int afs_init_request(struct netfs_io_request *rreq, struct file *file) { - rreq->netfs_priv = key_get(afs_file_key(file)); + if (file) + rreq->netfs_priv = key_get(afs_file_key(file)); + rreq->rsize = 256 * 1024; + rreq->wsize = 256 * 1024; return 0; } -static int afs_begin_cache_operation(struct netfs_io_request *rreq) -{ -#ifdef CONFIG_AFS_FSCACHE - struct afs_vnode *vnode = AFS_FS_I(rreq->inode); - - return fscache_begin_read_operation(&rreq->cache_resources, - afs_vnode_cache(vnode)); -#else - return -ENOBUFS; -#endif -} - static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, struct folio **foliop, void **_fsdata) { @@ -388,128 +371,37 @@ static void afs_free_request(struct netfs_io_request *rreq) key_put(rreq->netfs_priv); } -const struct netfs_request_ops afs_req_ops = { - .init_request = afs_init_request, - .free_request = afs_free_request, - .begin_cache_operation = afs_begin_cache_operation, - .check_write_begin = afs_check_write_begin, - .issue_read = afs_issue_read, -}; - -int afs_write_inode(struct inode *inode, struct writeback_control *wbc) +static void afs_update_i_size(struct inode *inode, loff_t new_i_size) { - fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode))); - return 0; -} - -/* - * Adjust the dirty region of the page on truncation or full invalidation, - * getting rid of the markers altogether if the region is entirely invalidated. - */ -static void afs_invalidate_dirty(struct folio *folio, size_t offset, - size_t length) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - unsigned long priv; - unsigned int f, t, end = offset + length; - - priv = (unsigned long)folio_get_private(folio); - - /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == folio_size(folio)) - goto full_invalidate; - - /* If the page was dirtied by page_mkwrite(), the PTE stays writable - * and we don't get another notification to tell us to expand it - * again. - */ - if (afs_is_folio_dirty_mmapped(priv)) - return; - - /* We may need to shorten the dirty region */ - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - - if (t <= offset || f >= end) - return; /* Doesn't overlap */ - - if (f < offset && t > end) - return; /* Splits the dirty region - just absorb it */ - - if (f >= offset && t <= end) - goto undirty; + struct afs_vnode *vnode = AFS_FS_I(inode); + loff_t i_size; - if (f < offset) - t = offset; - else - f = end; - if (f == t) - goto undirty; - - priv = afs_folio_dirty(folio, f, t); - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("trunc"), folio); - return; - -undirty: - trace_afs_folio_dirty(vnode, tracepoint_string("undirty"), folio); - folio_clear_dirty_for_io(folio); -full_invalidate: - trace_afs_folio_dirty(vnode, tracepoint_string("inval"), folio); - folio_detach_private(folio); + write_seqlock(&vnode->cb_lock); + i_size = i_size_read(&vnode->netfs.inode); + if (new_i_size > i_size) { + i_size_write(&vnode->netfs.inode, new_i_size); + inode_set_bytes(&vnode->netfs.inode, new_i_size); + } + write_sequnlock(&vnode->cb_lock); + fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size); } -/* - * invalidate part or all of a page - * - release a page and clean up its private data if offset is 0 (indicating - * the entire page) - */ -static void afs_invalidate_folio(struct folio *folio, size_t offset, - size_t length) +static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq) { - _enter("{%lu},%zu,%zu", folio->index, offset, length); - - BUG_ON(!folio_test_locked(folio)); + struct afs_vnode *vnode = AFS_FS_I(wreq->inode); - if (folio_get_private(folio)) - afs_invalidate_dirty(folio, offset, length); - - folio_wait_fscache(folio); - _leave(""); + afs_invalidate_cache(vnode, 0); } -/* - * release a page and clean up its private state if it's not busy - * - return true if the page can now be released, false if not - */ -static bool afs_release_folio(struct folio *folio, gfp_t gfp) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - - _enter("{{%llx:%llu}[%lu],%lx},%x", - vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags, - gfp); - - /* deny if folio is being written to the cache and the caller hasn't - * elected to wait */ -#ifdef CONFIG_AFS_FSCACHE - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - fscache_note_page_release(afs_vnode_cache(vnode)); -#endif - - if (folio_test_private(folio)) { - trace_afs_folio_dirty(vnode, tracepoint_string("rel"), folio); - folio_detach_private(folio); - } - - /* Indicate that the folio can be released */ - _leave(" = T"); - return true; -} +const struct netfs_request_ops afs_req_ops = { + .init_request = afs_init_request, + .free_request = afs_free_request, + .check_write_begin = afs_check_write_begin, + .issue_read = afs_issue_read, + .update_i_size = afs_update_i_size, + .invalidate_cache = afs_netfs_invalidate_cache, + .create_write_requests = afs_create_write_requests, +}; static void afs_add_open_mmap(struct afs_vnode *vnode) { @@ -576,28 +468,39 @@ static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pg static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); + struct inode *inode = file_inode(iocb->ki_filp); + struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = iocb->ki_filp->private_data; - int ret; + ssize_t ret; - ret = afs_validate(vnode, af->key); + if (iocb->ki_flags & IOCB_DIRECT) + return netfs_unbuffered_read_iter(iocb, iter); + + ret = netfs_start_io_read(inode); if (ret < 0) return ret; - - return generic_file_read_iter(iocb, iter); + ret = afs_validate(vnode, af->key); + if (ret == 0) + ret = filemap_read(iocb, iter, 0); + netfs_end_io_read(inode); + return ret; } static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(in)); + struct inode *inode = file_inode(in); + struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = in->private_data; - int ret; + ssize_t ret; - ret = afs_validate(vnode, af->key); + ret = netfs_start_io_read(inode); if (ret < 0) return ret; - - return filemap_splice_read(in, ppos, pipe, len, flags); + ret = afs_validate(vnode, af->key); + if (ret == 0) + ret = filemap_splice_read(in, ppos, pipe, len, flags); + netfs_end_io_read(inode); + return ret; } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 4f04f6f33f46..94fc049aff58 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren */ static void afs_set_netfs_context(struct afs_vnode *vnode) { - netfs_inode_init(&vnode->netfs, &afs_req_ops); + netfs_inode_init(&vnode->netfs, &afs_req_ops, true); } /* @@ -166,6 +166,7 @@ static void afs_apply_status(struct afs_operation *op, struct inode *inode = &vnode->netfs.inode; struct timespec64 t; umode_t mode; + bool unexpected_jump = false; bool data_changed = false; bool change_size = vp->set_size; @@ -230,6 +231,7 @@ static void afs_apply_status(struct afs_operation *op, } change_size = true; data_changed = true; + unexpected_jump = true; } else if (vnode->status.type == AFS_FTYPE_DIR) { /* Expected directory change is handled elsewhere so * that we can locally edit the directory and save on a @@ -249,8 +251,10 @@ static void afs_apply_status(struct afs_operation *op, * what's on the server. */ vnode->netfs.remote_i_size = status->size; - if (change_size) { + if (change_size || status->size > i_size_read(inode)) { afs_set_i_size(vnode, status->size); + if (unexpected_jump) + vnode->netfs.zero_point = status->size; inode_set_ctime_to_ts(inode, t); inode_set_atime_to_ts(inode, t); } @@ -647,7 +651,7 @@ void afs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); afs_set_cache_aux(vnode, &aux); - fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux); + netfs_clear_inode_writeback(inode, &aux); clear_inode(inode); while (!list_empty(&vnode->wb_keys)) { @@ -689,17 +693,17 @@ static void afs_setattr_success(struct afs_operation *op) static void afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; - struct inode *inode = &vp->vnode->netfs.inode; + struct afs_vnode *vnode = vp->vnode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; loff_t i_size = op->setattr.old_i_size; - if (size < i_size) - truncate_pagecache(inode, size); - if (size != i_size) - fscache_resize_cookie(afs_vnode_cache(vp->vnode), - vp->scb.status.size); + if (size != i_size) { + truncate_setsize(&vnode->netfs.inode, size); + netfs_resize_file(&vnode->netfs, size, true); + fscache_resize_cookie(afs_vnode_cache(vnode), size); + } } } @@ -767,11 +771,11 @@ int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) && attr->ia_size < i_size && - attr->ia_size > vnode->status.size) { - truncate_pagecache(inode, attr->ia_size); + attr->ia_size > vnode->netfs.remote_i_size) { + truncate_setsize(inode, attr->ia_size); + netfs_resize_file(&vnode->netfs, size, false); fscache_resize_cookie(afs_vnode_cache(vnode), attr->ia_size); - i_size_write(inode, attr->ia_size); ret = 0; goto out_unlock; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 55aa0679d8ce..9c03fcf7ffaa 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -985,62 +985,6 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl i_size_read(&vnode->netfs.inode), flags); } -/* - * We use folio->private to hold the amount of the folio that we've written to, - * splitting the field into two parts. However, we need to represent a range - * 0...FOLIO_SIZE, so we reduce the resolution if the size of the folio - * exceeds what we can encode. - */ -#ifdef CONFIG_64BIT -#define __AFS_FOLIO_PRIV_MASK 0x7fffffffUL -#define __AFS_FOLIO_PRIV_SHIFT 32 -#define __AFS_FOLIO_PRIV_MMAPPED 0x80000000UL -#else -#define __AFS_FOLIO_PRIV_MASK 0x7fffUL -#define __AFS_FOLIO_PRIV_SHIFT 16 -#define __AFS_FOLIO_PRIV_MMAPPED 0x8000UL -#endif - -static inline unsigned int afs_folio_dirty_resolution(struct folio *folio) -{ - int shift = folio_shift(folio) - (__AFS_FOLIO_PRIV_SHIFT - 1); - return (shift > 0) ? shift : 0; -} - -static inline size_t afs_folio_dirty_from(struct folio *folio, unsigned long priv) -{ - unsigned long x = priv & __AFS_FOLIO_PRIV_MASK; - - /* The lower bound is inclusive */ - return x << afs_folio_dirty_resolution(folio); -} - -static inline size_t afs_folio_dirty_to(struct folio *folio, unsigned long priv) -{ - unsigned long x = (priv >> __AFS_FOLIO_PRIV_SHIFT) & __AFS_FOLIO_PRIV_MASK; - - /* The upper bound is immediately beyond the region */ - return (x + 1) << afs_folio_dirty_resolution(folio); -} - -static inline unsigned long afs_folio_dirty(struct folio *folio, size_t from, size_t to) -{ - unsigned int res = afs_folio_dirty_resolution(folio); - from >>= res; - to = (to - 1) >> res; - return (to << __AFS_FOLIO_PRIV_SHIFT) | from; -} - -static inline unsigned long afs_folio_dirty_mmapped(unsigned long priv) -{ - return priv | __AFS_FOLIO_PRIV_MMAPPED; -} - -static inline bool afs_is_folio_dirty_mmapped(unsigned long priv) -{ - return priv & __AFS_FOLIO_PRIV_MMAPPED; -} - #include <trace/events/afs.h> /*****************************************************************************/ @@ -1167,7 +1111,6 @@ extern int afs_release(struct inode *, struct file *); extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); extern struct afs_read *afs_alloc_read(gfp_t); extern void afs_put_read(struct afs_read *); -extern int afs_write_inode(struct inode *, struct writeback_control *); static inline struct afs_read *afs_get_read(struct afs_read *req) { @@ -1658,24 +1601,11 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ -#ifdef CONFIG_AFS_FSCACHE -bool afs_dirty_folio(struct address_space *, struct folio *); -#else -#define afs_dirty_folio filemap_dirty_folio -#endif -extern int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata); -extern int afs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); -extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); -extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); extern void afs_prune_wb_keys(struct afs_vnode *); -int afs_launder_folio(struct folio *); +void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len); /* * xattr.c diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 3bd02571f30d..15eab053af6d 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -166,7 +166,7 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v) if (!preflist) { seq_puts(m, "NO PREFS\n"); - return 0; + goto out; } seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n", @@ -191,7 +191,8 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v) } } - rcu_read_lock(); +out: + rcu_read_unlock(); return 0; } diff --git a/fs/afs/super.c b/fs/afs/super.c index ae2d66a52add..f3ba1c3e72f5 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -55,7 +55,7 @@ int afs_net_id; static const struct super_operations afs_super_ops = { .statfs = afs_statfs, .alloc_inode = afs_alloc_inode, - .write_inode = afs_write_inode, + .write_inode = netfs_unpin_writeback, .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, .free_inode = afs_free_inode, diff --git a/fs/afs/write.c b/fs/afs/write.c index 61d34ad2ca7d..74402d95a884 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -12,309 +12,17 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/netfs.h> +#include <trace/events/netfs.h> #include "internal.h" -static int afs_writepages_region(struct address_space *mapping, - struct writeback_control *wbc, - loff_t start, loff_t end, loff_t *_next, - bool max_one_loop); - -static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len, - loff_t i_size, bool caching); - -#ifdef CONFIG_AFS_FSCACHE -/* - * Mark a page as having been made dirty and thus needing writeback. We also - * need to pin the cache object to write back to. - */ -bool afs_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - return fscache_dirty_folio(mapping, folio, - afs_vnode_cache(AFS_FS_I(mapping->host))); -} -static void afs_folio_start_fscache(bool caching, struct folio *folio) -{ - if (caching) - folio_start_fscache(folio); -} -#else -static void afs_folio_start_fscache(bool caching, struct folio *folio) -{ -} -#endif - -/* - * Flush out a conflicting write. This may extend the write to the surrounding - * pages if also dirty and contiguous to the conflicting region.. - */ -static int afs_flush_conflicting_write(struct address_space *mapping, - struct folio *folio) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = folio_pos(folio), - .range_end = LLONG_MAX, - }; - loff_t next; - - return afs_writepages_region(mapping, &wbc, folio_pos(folio), LLONG_MAX, - &next, true); -} - -/* - * prepare to perform part of a write to a page - */ -int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **_page, void **fsdata) -{ - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - struct folio *folio; - unsigned long priv; - unsigned f, from; - unsigned t, to; - pgoff_t index; - int ret; - - _enter("{%llx:%llu},%llx,%x", - vnode->fid.vid, vnode->fid.vnode, pos, len); - - /* Prefetch area to be written into the cache if we're caching this - * file. We need to do this before we get a lock on the page in case - * there's more than one writer competing for the same cache block. - */ - ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata); - if (ret < 0) - return ret; - - index = folio_index(folio); - from = pos - index * PAGE_SIZE; - to = from + len; - -try_again: - /* See if this page is already partially written in a way that we can - * merge the new write with. - */ - if (folio_test_private(folio)) { - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - ASSERTCMP(f, <=, t); - - if (folio_test_writeback(folio)) { - trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio); - folio_unlock(folio); - goto wait_for_writeback; - } - /* If the file is being filled locally, allow inter-write - * spaces to be merged into writes. If it's not, only write - * back what the user gives us. - */ - if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) && - (to < f || from > t)) - goto flush_conflicting_write; - } - - *_page = folio_file_page(folio, pos / PAGE_SIZE); - _leave(" = 0"); - return 0; - - /* The previous write and this write aren't adjacent or overlapping, so - * flush the page out. - */ -flush_conflicting_write: - trace_afs_folio_dirty(vnode, tracepoint_string("confl"), folio); - folio_unlock(folio); - - ret = afs_flush_conflicting_write(mapping, folio); - if (ret < 0) - goto error; - -wait_for_writeback: - ret = folio_wait_writeback_killable(folio); - if (ret < 0) - goto error; - - ret = folio_lock_killable(folio); - if (ret < 0) - goto error; - goto try_again; - -error: - folio_put(folio); - _leave(" = %d", ret); - return ret; -} - -/* - * finalise part of a write to a page - */ -int afs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *subpage, void *fsdata) -{ - struct folio *folio = page_folio(subpage); - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - unsigned long priv; - unsigned int f, from = offset_in_folio(folio, pos); - unsigned int t, to = from + copied; - loff_t i_size, write_end_pos; - - _enter("{%llx:%llu},{%lx}", - vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); - - if (!folio_test_uptodate(folio)) { - if (copied < len) { - copied = 0; - goto out; - } - - folio_mark_uptodate(folio); - } - - if (copied == 0) - goto out; - - write_end_pos = pos + copied; - - i_size = i_size_read(&vnode->netfs.inode); - if (write_end_pos > i_size) { - write_seqlock(&vnode->cb_lock); - i_size = i_size_read(&vnode->netfs.inode); - if (write_end_pos > i_size) - afs_set_i_size(vnode, write_end_pos); - write_sequnlock(&vnode->cb_lock); - fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos); - } - - if (folio_test_private(folio)) { - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - if (from < f) - f = from; - if (to > t) - t = to; - priv = afs_folio_dirty(folio, f, t); - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("dirty+"), folio); - } else { - priv = afs_folio_dirty(folio, from, to); - folio_attach_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("dirty"), folio); - } - - if (folio_mark_dirty(folio)) - _debug("dirtied %lx", folio_index(folio)); - -out: - folio_unlock(folio); - folio_put(folio); - return copied; -} - -/* - * kill all the pages in the given range - */ -static void afs_kill_pages(struct address_space *mapping, - loff_t start, loff_t len) -{ - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct folio *folio; - pgoff_t index = start / PAGE_SIZE; - pgoff_t last = (start + len - 1) / PAGE_SIZE, next; - - _enter("{%llx:%llu},%llx @%llx", - vnode->fid.vid, vnode->fid.vnode, len, start); - - do { - _debug("kill %lx (to %lx)", index, last); - - folio = filemap_get_folio(mapping, index); - if (IS_ERR(folio)) { - next = index + 1; - continue; - } - - next = folio_next_index(folio); - - folio_clear_uptodate(folio); - folio_end_writeback(folio); - folio_lock(folio); - generic_error_remove_folio(mapping, folio); - folio_unlock(folio); - folio_put(folio); - - } while (index = next, index <= last); - - _leave(""); -} - -/* - * Redirty all the pages in a given range. - */ -static void afs_redirty_pages(struct writeback_control *wbc, - struct address_space *mapping, - loff_t start, loff_t len) -{ - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct folio *folio; - pgoff_t index = start / PAGE_SIZE; - pgoff_t last = (start + len - 1) / PAGE_SIZE, next; - - _enter("{%llx:%llu},%llx @%llx", - vnode->fid.vid, vnode->fid.vnode, len, start); - - do { - _debug("redirty %llx @%llx", len, start); - - folio = filemap_get_folio(mapping, index); - if (IS_ERR(folio)) { - next = index + 1; - continue; - } - - next = index + folio_nr_pages(folio); - folio_redirty_for_writepage(wbc, folio); - folio_end_writeback(folio); - folio_put(folio); - } while (index = next, index <= last); - - _leave(""); -} - /* * completion of write to server */ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { - struct address_space *mapping = vnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t end; - - XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); - _enter("{%llx:%llu},{%x @%llx}", vnode->fid.vid, vnode->fid.vnode, len, start); - rcu_read_lock(); - - end = (start + len - 1) / PAGE_SIZE; - xas_for_each(&xas, folio, end) { - if (!folio_test_writeback(folio)) { - kdebug("bad %x @%llx page %lx %lx", - len, start, folio_index(folio), end); - ASSERT(folio_test_writeback(folio)); - } - - trace_afs_folio_dirty(vnode, tracepoint_string("clear"), folio); - folio_detach_private(folio); - folio_end_writeback(folio); - } - - rcu_read_unlock(); - afs_prune_wb_keys(vnode); _leave(""); } @@ -451,363 +159,53 @@ try_next_key: return afs_put_operation(op); } -/* - * Extend the region to be written back to include subsequent contiguously - * dirty pages if possible, but don't sleep while doing so. - * - * If this page holds new content, then we can include filler zeros in the - * writeback. - */ -static void afs_extend_writeback(struct address_space *mapping, - struct afs_vnode *vnode, - long *_count, - loff_t start, - loff_t max_len, - bool new_content, - bool caching, - unsigned int *_len) +static void afs_upload_to_server(struct netfs_io_subrequest *subreq) { - struct folio_batch fbatch; - struct folio *folio; - unsigned long priv; - unsigned int psize, filler = 0; - unsigned int f, t; - loff_t len = *_len; - pgoff_t index = (start + len) / PAGE_SIZE; - bool stop = true; - unsigned int i; - - XA_STATE(xas, &mapping->i_pages, index); - folio_batch_init(&fbatch); - - do { - /* Firstly, we gather up a batch of contiguous dirty pages - * under the RCU read lock - but we can't clear the dirty flags - * there if any of those pages are mapped. - */ - rcu_read_lock(); - - xas_for_each(&xas, folio, ULONG_MAX) { - stop = true; - if (xas_retry(&xas, folio)) - continue; - if (xa_is_value(folio)) - break; - if (folio_index(folio) != index) - break; - - if (!folio_try_get_rcu(folio)) { - xas_reset(&xas); - continue; - } - - /* Has the page moved or been split? */ - if (unlikely(folio != xas_reload(&xas))) { - folio_put(folio); - break; - } - - if (!folio_trylock(folio)) { - folio_put(folio); - break; - } - if (!folio_test_dirty(folio) || - folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - folio_put(folio); - break; - } - - psize = folio_size(folio); - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - if (f != 0 && !new_content) { - folio_unlock(folio); - folio_put(folio); - break; - } - - len += filler + t; - filler = psize - t; - if (len >= max_len || *_count <= 0) - stop = true; - else if (t == psize || new_content) - stop = false; - - index += folio_nr_pages(folio); - if (!folio_batch_add(&fbatch, folio)) - break; - if (stop) - break; - } - - if (!stop) - xas_pause(&xas); - rcu_read_unlock(); - - /* Now, if we obtained any folios, we can shift them to being - * writable and mark them for caching. - */ - if (!folio_batch_count(&fbatch)) - break; - - for (i = 0; i < folio_batch_count(&fbatch); i++) { - folio = fbatch.folios[i]; - trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio); - - if (!folio_clear_dirty_for_io(folio)) - BUG(); - folio_start_writeback(folio); - afs_folio_start_fscache(caching, folio); - - *_count -= folio_nr_pages(folio); - folio_unlock(folio); - } + struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); + ssize_t ret; - folio_batch_release(&fbatch); - cond_resched(); - } while (!stop); + _enter("%x[%x],%zx", + subreq->rreq->debug_id, subreq->debug_index, subreq->io_iter.count); - *_len = len; + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + ret = afs_store_data(vnode, &subreq->io_iter, subreq->start, + subreq->rreq->origin == NETFS_LAUNDER_WRITE); + netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, + false); } -/* - * Synchronously write back the locked page and any subsequent non-locked dirty - * pages. - */ -static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, - struct writeback_control *wbc, - struct folio *folio, - loff_t start, loff_t end) +static void afs_upload_to_server_worker(struct work_struct *work) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct iov_iter iter; - unsigned long priv; - unsigned int offset, to, len, max_len; - loff_t i_size = i_size_read(&vnode->netfs.inode); - bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode)); - long count = wbc->nr_to_write; - int ret; - - _enter(",%lx,%llx-%llx", folio_index(folio), start, end); - - folio_start_writeback(folio); - afs_folio_start_fscache(caching, folio); - - count -= folio_nr_pages(folio); - - /* Find all consecutive lockable dirty pages that have contiguous - * written regions, stopping when we find a page that is not - * immediately lockable, is not dirty or is missing, or we reach the - * end of the range. - */ - priv = (unsigned long)folio_get_private(folio); - offset = afs_folio_dirty_from(folio, priv); - to = afs_folio_dirty_to(folio, priv); - trace_afs_folio_dirty(vnode, tracepoint_string("store"), folio); - - len = to - offset; - start += offset; - if (start < i_size) { - /* Trim the write to the EOF; the extra data is ignored. Also - * put an upper limit on the size of a single storedata op. - */ - max_len = 65536 * 4096; - max_len = min_t(unsigned long long, max_len, end - start + 1); - max_len = min_t(unsigned long long, max_len, i_size - start); - - if (len < max_len && - (to == folio_size(folio) || new_content)) - afs_extend_writeback(mapping, vnode, &count, - start, max_len, new_content, - caching, &len); - len = min_t(loff_t, len, max_len); - } - - /* We now have a contiguous set of dirty pages, each with writeback - * set; the first page is still locked at this point, but all the rest - * have been unlocked. - */ - folio_unlock(folio); - - if (start < i_size) { - _debug("write back %x @%llx [%llx]", len, start, i_size); - - /* Speculatively write to the cache. We have to fix this up - * later if the store fails. - */ - afs_write_to_cache(vnode, start, len, i_size, caching); - - iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); - ret = afs_store_data(vnode, &iter, start, false); - } else { - _debug("write discard %x @%llx [%llx]", len, start, i_size); - - /* The dirty region was entirely beyond the EOF. */ - fscache_clear_page_bits(mapping, start, len, caching); - afs_pages_written_back(vnode, start, len); - ret = 0; - } - - switch (ret) { - case 0: - wbc->nr_to_write = count; - ret = len; - break; + struct netfs_io_subrequest *subreq = + container_of(work, struct netfs_io_subrequest, work); - default: - pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret); - fallthrough; - case -EACCES: - case -EPERM: - case -ENOKEY: - case -EKEYEXPIRED: - case -EKEYREJECTED: - case -EKEYREVOKED: - case -ENETRESET: - afs_redirty_pages(wbc, mapping, start, len); - mapping_set_error(mapping, ret); - break; - - case -EDQUOT: - case -ENOSPC: - afs_redirty_pages(wbc, mapping, start, len); - mapping_set_error(mapping, -ENOSPC); - break; - - case -EROFS: - case -EIO: - case -EREMOTEIO: - case -EFBIG: - case -ENOENT: - case -ENOMEDIUM: - case -ENXIO: - trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail); - afs_kill_pages(mapping, start, len); - mapping_set_error(mapping, ret); - break; - } - - _leave(" = %d", ret); - return ret; + afs_upload_to_server(subreq); } /* - * write a region of pages back to the server + * Set up write requests for a writeback slice. We need to add a write request + * for each write we want to make. */ -static int afs_writepages_region(struct address_space *mapping, - struct writeback_control *wbc, - loff_t start, loff_t end, loff_t *_next, - bool max_one_loop) +void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len) { - struct folio *folio; - struct folio_batch fbatch; - ssize_t ret; - unsigned int i; - int n, skips = 0; - - _enter("%llx,%llx,", start, end); - folio_batch_init(&fbatch); - - do { - pgoff_t index = start / PAGE_SIZE; - - n = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE, - PAGECACHE_TAG_DIRTY, &fbatch); - - if (!n) - break; - for (i = 0; i < n; i++) { - folio = fbatch.folios[i]; - start = folio_pos(folio); /* May regress with THPs */ - - _debug("wback %lx", folio_index(folio)); - - /* At this point we hold neither the i_pages lock nor the - * page lock: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled - * back from swapper_space to tmpfs file mapping - */ -try_again: - if (wbc->sync_mode != WB_SYNC_NONE) { - ret = folio_lock_killable(folio); - if (ret < 0) { - folio_batch_release(&fbatch); - return ret; - } - } else { - if (!folio_trylock(folio)) - continue; - } - - if (folio->mapping != mapping || - !folio_test_dirty(folio)) { - start += folio_size(folio); - folio_unlock(folio); - continue; - } - - if (folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - if (wbc->sync_mode != WB_SYNC_NONE) { - folio_wait_writeback(folio); -#ifdef CONFIG_AFS_FSCACHE - folio_wait_fscache(folio); -#endif - goto try_again; - } - - start += folio_size(folio); - if (wbc->sync_mode == WB_SYNC_NONE) { - if (skips >= 5 || need_resched()) { - *_next = start; - folio_batch_release(&fbatch); - _leave(" = 0 [%llx]", *_next); - return 0; - } - skips++; - } - continue; - } - - if (!folio_clear_dirty_for_io(folio)) - BUG(); - ret = afs_write_back_from_locked_folio(mapping, wbc, - folio, start, end); - if (ret < 0) { - _leave(" = %zd", ret); - folio_batch_release(&fbatch); - return ret; - } - - start += ret; - } + struct netfs_io_subrequest *subreq; - folio_batch_release(&fbatch); - cond_resched(); - } while (wbc->nr_to_write > 0); + _enter("%x,%llx-%llx", wreq->debug_id, start, start + len); - *_next = start; - _leave(" = 0 [%llx]", *_next); - return 0; + subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER, + start, len, afs_upload_to_server_worker); + if (subreq) + netfs_queue_write_request(subreq); } /* * write some of the pending data back to the server */ -int afs_writepages(struct address_space *mapping, - struct writeback_control *wbc) +int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); - loff_t start, next; int ret; - _enter(""); - /* We have to be careful as we can end up racing with setattr() * truncating the pagecache since the caller doesn't take a lock here * to prevent it. @@ -817,69 +215,12 @@ int afs_writepages(struct address_space *mapping, else if (!down_read_trylock(&vnode->validate_lock)) return 0; - if (wbc->range_cyclic) { - start = mapping->writeback_index * PAGE_SIZE; - ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, - &next, false); - if (ret == 0) { - mapping->writeback_index = next / PAGE_SIZE; - if (start > 0 && wbc->nr_to_write > 0) { - ret = afs_writepages_region(mapping, wbc, 0, - start, &next, false); - if (ret == 0) - mapping->writeback_index = - next / PAGE_SIZE; - } - } - } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { - ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, - &next, false); - if (wbc->nr_to_write > 0 && ret == 0) - mapping->writeback_index = next / PAGE_SIZE; - } else { - ret = afs_writepages_region(mapping, wbc, - wbc->range_start, wbc->range_end, - &next, false); - } - + ret = netfs_writepages(mapping, wbc); up_read(&vnode->validate_lock); - _leave(" = %d", ret); return ret; } /* - * write to an AFS file - */ -ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) -{ - struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); - struct afs_file *af = iocb->ki_filp->private_data; - ssize_t result; - size_t count = iov_iter_count(from); - - _enter("{%llx:%llu},{%zu},", - vnode->fid.vid, vnode->fid.vnode, count); - - if (IS_SWAPFILE(&vnode->netfs.inode)) { - printk(KERN_INFO - "AFS: Attempt to write to active swap file!\n"); - return -EBUSY; - } - - if (!count) - return 0; - - result = afs_validate(vnode, af->key); - if (result < 0) - return result; - - result = generic_file_write_iter(iocb, from); - - _leave(" = %zd", result); - return result; -} - -/* * flush any dirty pages for this process, and check for write errors. * - the return status from this call provides a reliable indication of * whether any write errors occurred for this process. @@ -907,59 +248,11 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { - struct folio *folio = page_folio(vmf->page); struct file *file = vmf->vma->vm_file; - struct inode *inode = file_inode(file); - struct afs_vnode *vnode = AFS_FS_I(inode); - struct afs_file *af = file->private_data; - unsigned long priv; - vm_fault_t ret = VM_FAULT_RETRY; - - _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); - - afs_validate(vnode, af->key); - sb_start_pagefault(inode->i_sb); - - /* Wait for the page to be written to the cache before we allow it to - * be modified. We then assume the entire page will need writing back. - */ -#ifdef CONFIG_AFS_FSCACHE - if (folio_test_fscache(folio) && - folio_wait_fscache_killable(folio) < 0) - goto out; -#endif - - if (folio_wait_writeback_killable(folio)) - goto out; - - if (folio_lock_killable(folio) < 0) - goto out; - - /* We mustn't change folio->private until writeback is complete as that - * details the portion of the page we need to write back and we might - * need to redirty the page if there's a problem. - */ - if (folio_wait_writeback_killable(folio) < 0) { - folio_unlock(folio); - goto out; - } - - priv = afs_folio_dirty(folio, 0, folio_size(folio)); - priv = afs_folio_dirty_mmapped(priv); - if (folio_test_private(folio)) { - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite+"), folio); - } else { - folio_attach_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite"), folio); - } - file_update_time(file); - - ret = VM_FAULT_LOCKED; -out: - sb_end_pagefault(inode->i_sb); - return ret; + if (afs_validate(AFS_FS_I(file_inode(file)), afs_file_key(file)) < 0) + return VM_FAULT_SIGBUS; + return netfs_page_mkwrite(vmf, NULL); } /* @@ -989,64 +282,3 @@ void afs_prune_wb_keys(struct afs_vnode *vnode) afs_put_wb_key(wbk); } } - -/* - * Clean up a page during invalidation. - */ -int afs_launder_folio(struct folio *folio) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - struct iov_iter iter; - struct bio_vec bv; - unsigned long priv; - unsigned int f, t; - int ret = 0; - - _enter("{%lx}", folio->index); - - priv = (unsigned long)folio_get_private(folio); - if (folio_clear_dirty_for_io(folio)) { - f = 0; - t = folio_size(folio); - if (folio_test_private(folio)) { - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - } - - bvec_set_folio(&bv, folio, t - f, f); - iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, bv.bv_len); - - trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio); - ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true); - } - - trace_afs_folio_dirty(vnode, tracepoint_string("laundered"), folio); - folio_detach_private(folio); - folio_wait_fscache(folio); - return ret; -} - -/* - * Deal with the completion of writing the data to the cache. - */ -static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct afs_vnode *vnode = priv; - - if (IS_ERR_VALUE(transferred_or_error) && - transferred_or_error != -ENOBUFS) - afs_invalidate_cache(vnode, 0); -} - -/* - * Save the write to the cache also. - */ -static void afs_write_to_cache(struct afs_vnode *vnode, - loff_t start, size_t len, loff_t i_size, - bool caching) -{ - fscache_write_to_cache(afs_vnode_cache(vnode), - vnode->netfs.inode.i_mapping, start, len, i_size, - afs_write_to_cache_done, vnode, caching); -} diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 7423a3557c68..1a05cecda7cc 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -27,7 +27,6 @@ bcachefs-y := \ checksum.o \ clock.o \ compress.o \ - counters.o \ darray.o \ debug.o \ dirent.o \ @@ -71,6 +70,7 @@ bcachefs-y := \ reflink.o \ replicas.o \ sb-clean.o \ + sb-counters.o \ sb-downgrade.o \ sb-errors.o \ sb-members.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a09b9d00226a..10704f2d3af5 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -273,7 +273,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v), c, err, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", - bch2_data_types[a.v->data_type]); + bch2_data_type_str(a.v->data_type)); break; case BCH_DATA_cached: bkey_fsck_err_on(!a.v->cached_sectors || @@ -321,16 +321,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c { struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - unsigned i; prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "gen %u oldest_gen %u data_type %s", - a->gen, a->oldest_gen, - a->data_type < BCH_DATA_NR - ? bch2_data_types[a->data_type] - : "(invalid data type)"); + prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); + bch2_prt_data_type(out, a->data_type); prt_newline(out); prt_printf(out, "journal_seq %llu", a->journal_seq); prt_newline(out); @@ -353,23 +349,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "fragmentation %llu", a->fragmentation_lru); prt_newline(out); prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); - prt_newline(out); - - if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) { - struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k); - const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v); - - prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v)); - printbuf_indent_add(out, 2); - - for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) { - prt_newline(out); - bch2_backpointer_to_text(out, &bps[i]); - } - - printbuf_indent_sub(out, 2); - } - printbuf_indent_sub(out, 2); } @@ -839,7 +818,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, } } - if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; u64 journal_seq = trans->journal_res.seq; u64 bucket_journal_seq = new_a->journal_seq; @@ -1625,13 +1604,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) return ret; } +struct discard_buckets_state { + u64 seen; + u64 open; + u64 need_journal_commit; + u64 discarded; + struct bch_dev *ca; + u64 need_journal_commit_this_dev; +}; + +static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) +{ + if (s->ca == ca) + return; + + if (s->ca && s->need_journal_commit_this_dev > + bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) + bch2_journal_flush_async(&c->journal, NULL); + + if (s->ca) + percpu_ref_put(&s->ca->ref); + if (ca) + percpu_ref_get(&ca->ref); + s->ca = ca; + s->need_journal_commit_this_dev = 0; +} + static int bch2_discard_one_bucket(struct btree_trans *trans, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, - u64 *seen, - u64 *open, - u64 *need_journal_commit, - u64 *discarded) + struct discard_buckets_state *s) { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; @@ -1643,20 +1645,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, int ret = 0; ca = bch_dev_bkey_exists(c, pos.inode); + if (!percpu_ref_tryget(&ca->io_ref)) { bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); return 0; } + discard_buckets_next_dev(c, s, ca); + if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { - (*open)++; + s->open++; goto out; } if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, pos.inode, pos.offset)) { - (*need_journal_commit)++; + s->need_journal_commit++; + s->need_journal_commit_this_dev++; goto out; } @@ -1732,9 +1738,9 @@ write: goto out; count_event(c, bucket_discard); - (*discarded)++; + s->discarded++; out: - (*seen)++; + s->seen++; bch2_trans_iter_exit(trans, &iter); percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); @@ -1744,7 +1750,7 @@ out: static void bch2_do_discards_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, discard_work); - u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; + struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; @@ -1756,19 +1762,14 @@ static void bch2_do_discards_work(struct work_struct *work) ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(trans, &iter, &discard_pos_done, - &seen, - &open, - &need_journal_commit, - &discarded))); - - if (need_journal_commit * 2 > seen) - bch2_journal_flush_async(&c->journal, NULL); + bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + discard_buckets_next_dev(c, &s, NULL); - trace_discard_buckets(c, seen, open, need_journal_commit, discarded, + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h new file mode 100644 index 000000000000..b4ec20be93b8 --- /dev/null +++ b/fs/bcachefs/alloc_background_format.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H +#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H + +struct bch_alloc { + struct bch_val v; + __u8 fields; + __u8 gen; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V1() \ + x(read_time, 16) \ + x(write_time, 16) \ + x(data_type, 8) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(oldest_gen, 8) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +enum { +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() +#undef x +}; + +struct bch_alloc_v2 { + struct bch_val v; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ + x(dirty_sectors, 32) \ + x(cached_sectors, 32) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +struct bch_alloc_v3 { + struct bch_val v; + __le64 journal_seq; + __le32 flags; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + +struct bch_alloc_v4 { + struct bch_val v; + __u64 journal_seq; + __u32 flags; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 stripe_redundancy; + __u32 dirty_sectors; + __u32 cached_sectors; + __u64 io_time[2]; + __u32 stripe; + __u32 nr_external_backpointers; + __u64 fragmentation_lru; +} __packed __aligned(8); + +#define BCH_ALLOC_V4_U64s_V0 6 +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) + +BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) +BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) +BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) +BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) + +#define KEY_TYPE_BUCKET_GENS_BITS 8 +#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) +#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) + +struct bch_bucket_gens { + struct bch_val v; + u8 gens[KEY_TYPE_BUCKET_GENS_NR]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index b0ff47998a94..633d3223b353 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1525,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str unsigned data_type = ob->data_type; barrier(); /* READ_ONCE() doesn't work on bitfields */ - prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u", + prt_printf(out, "%zu ref %u ", ob - c->open_buckets, - atomic_read(&ob->pin), - data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type", + atomic_read(&ob->pin)); + bch2_prt_data_type(out, data_type); + prt_printf(out, " %u:%llu gen %u allocated %u/%u", ob->dev, ob->bucket, ob->gen, ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); if (ob->ec) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index e358a2ffffde..b4dc319bcb2b 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -400,13 +400,24 @@ int bch2_check_btree_backpointers(struct bch_fs *c) return ret; } +static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) +{ + return bpos_eq(l.k->p, r.k->p) && + bkey_bytes(l.k) == bkey_bytes(r.k) && + !memcmp(l.v, r.v, bkey_val_bytes(l.k)); +} + +struct extents_to_bp_state { + struct bpos bucket_start; + struct bpos bucket_end; + struct bkey_buf last_flushed; +}; + static int check_bp_exists(struct btree_trans *trans, + struct extents_to_bp_state *s, struct bpos bucket, struct bch_backpointer bp, - struct bkey_s_c orig_k, - struct bpos bucket_start, - struct bpos bucket_end, - struct bkey_buf *last_flushed) + struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; struct btree_iter bp_iter = { NULL }; @@ -417,8 +428,8 @@ static int check_bp_exists(struct btree_trans *trans, bch2_bkey_buf_init(&tmp); - if (bpos_lt(bucket, bucket_start) || - bpos_gt(bucket, bucket_end)) + if (bpos_lt(bucket, s->bucket_start) || + bpos_gt(bucket, s->bucket_end)) return 0; if (!bch2_dev_bucket_exists(c, bucket)) @@ -433,11 +444,9 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - if (!bpos_eq(orig_k.k->p, last_flushed->k->k.p) || - bkey_bytes(orig_k.k) != bkey_bytes(&last_flushed->k->k) || - memcmp(orig_k.v, &last_flushed->k->v, bkey_val_bytes(orig_k.k))) { - bch2_bkey_buf_reassemble(&tmp, c, orig_k); + bch2_bkey_buf_reassemble(&tmp, c, orig_k); + if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) { if (bp.level) { bch2_trans_unlock(trans); bch2_btree_interior_updates_flush(c); @@ -447,7 +456,7 @@ static int check_bp_exists(struct btree_trans *trans, if (ret) goto err; - bch2_bkey_buf_copy(last_flushed, c, tmp.k); + bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k); ret = -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } @@ -475,10 +484,8 @@ missing: } static int check_extent_to_backpointers(struct btree_trans *trans, + struct extents_to_bp_state *s, enum btree_id btree, unsigned level, - struct bpos bucket_start, - struct bpos bucket_end, - struct bkey_buf *last_flushed, struct bkey_s_c k) { struct bch_fs *c = trans->c; @@ -498,9 +505,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, bch2_extent_ptr_to_bp(c, btree, level, k, p, &bucket_pos, &bp); - ret = check_bp_exists(trans, bucket_pos, bp, k, - bucket_start, bucket_end, - last_flushed); + ret = check_bp_exists(trans, s, bucket_pos, bp, k); if (ret) return ret; } @@ -509,10 +514,8 @@ static int check_extent_to_backpointers(struct btree_trans *trans, } static int check_btree_root_to_backpointers(struct btree_trans *trans, + struct extents_to_bp_state *s, enum btree_id btree_id, - struct bpos bucket_start, - struct bpos bucket_end, - struct bkey_buf *last_flushed, int *level) { struct bch_fs *c = trans->c; @@ -536,9 +539,7 @@ retry: *level = b->c.level; k = bkey_i_to_s_c(&b->key); - ret = check_extent_to_backpointers(trans, btree_id, b->c.level + 1, - bucket_start, bucket_end, - last_flushed, k); + ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -559,7 +560,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c) si_meminfo(&i); mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes >> 1, btree_bytes(c)); + return div_u64(mem_bytes >> 1, c->opts.btree_node_size); } static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, @@ -610,43 +611,35 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, } static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - struct bpos bucket_start, - struct bpos bucket_end) + struct extents_to_bp_state *s) { struct bch_fs *c = trans->c; - struct btree_iter iter; - enum btree_id btree_id; - struct bkey_s_c k; - struct bkey_buf last_flushed; int ret = 0; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { + for (enum btree_id btree_id = 0; + btree_id < btree_id_nr_alive(c); + btree_id++) { int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1; ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_btree_root_to_backpointers(trans, btree_id, - bucket_start, bucket_end, - &last_flushed, &level)); + check_btree_root_to_backpointers(trans, s, btree_id, &level)); if (ret) return ret; while (level >= depth) { + struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, BTREE_ITER_PREFETCH); while (1) { bch2_trans_begin(trans); - k = bch2_btree_iter_peek(&iter); + + struct bkey_s_c k = bch2_btree_iter_peek(&iter); if (!k.k) break; ret = bkey_err(k) ?: - check_extent_to_backpointers(trans, btree_id, level, - bucket_start, bucket_end, - &last_flushed, k) ?: + check_extent_to_backpointers(trans, s, btree_id, level, k) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { @@ -668,7 +661,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, } } - bch2_bkey_buf_exit(&last_flushed, c); return 0; } @@ -731,37 +723,43 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct bpos start = POS_MIN, end; + struct extents_to_bp_state s = { .bucket_start = POS_MIN }; int ret; + bch2_bkey_buf_init(&s.last_flushed); + bkey_init(&s.last_flushed.k->k); + while (1) { - ret = bch2_get_alloc_in_memory_pos(trans, start, &end); + ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); if (ret) break; - if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX)) + if ( bpos_eq(s.bucket_start, POS_MIN) && + !bpos_eq(s.bucket_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", __func__, btree_nodes_fit_in_ram(c)); - if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) { + if (!bpos_eq(s.bucket_start, POS_MIN) || + !bpos_eq(s.bucket_end, SPOS_MAX)) { struct printbuf buf = PRINTBUF; prt_str(&buf, "check_extents_to_backpointers(): "); - bch2_bpos_to_text(&buf, start); + bch2_bpos_to_text(&buf, s.bucket_start); prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, end); + bch2_bpos_to_text(&buf, s.bucket_end); bch_verbose(c, "%s", buf.buf); printbuf_exit(&buf); } - ret = bch2_check_extents_to_backpointers_pass(trans, start, end); - if (ret || bpos_eq(end, SPOS_MAX)) + ret = bch2_check_extents_to_backpointers_pass(trans, &s); + if (ret || bpos_eq(s.bucket_end, SPOS_MAX)) break; - start = bpos_successor(end); + s.bucket_start = bpos_successor(s.bucket_end); } bch2_trans_put(trans); + bch2_bkey_buf_exit(&s.last_flushed, c); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 737e2396ade7..327365a9feac 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H #define _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#include "btree_cache.h" #include "btree_iter.h" #include "btree_update.h" #include "buckets.h" diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index dac383e37181..b80c6c9efd8c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1204,11 +1204,6 @@ static inline unsigned block_sectors(const struct bch_fs *c) return c->opts.block_size >> 9; } -static inline size_t btree_sectors(const struct bch_fs *c) -{ - return c->opts.btree_node_size >> 9; -} - static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) { return c->btree_key_cache_btrees & (1U << btree); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0d5ac4184fbc..0668b682a21c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -417,600 +417,12 @@ struct bch_set { struct bch_val v; }; -/* Extents */ - -/* - * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally - * preceded by checksum/compression information (bch_extent_crc32 or - * bch_extent_crc64). - * - * One major determining factor in the format of extents is how we handle and - * represent extents that have been partially overwritten and thus trimmed: - * - * If an extent is not checksummed or compressed, when the extent is trimmed we - * don't have to remember the extent we originally allocated and wrote: we can - * merely adjust ptr->offset to point to the start of the data that is currently - * live. The size field in struct bkey records the current (live) size of the - * extent, and is also used to mean "size of region on disk that we point to" in - * this case. - * - * Thus an extent that is not checksummed or compressed will consist only of a - * list of bch_extent_ptrs, with none of the fields in - * bch_extent_crc32/bch_extent_crc64. - * - * When an extent is checksummed or compressed, it's not possible to read only - * the data that is currently live: we have to read the entire extent that was - * originally written, and then return only the part of the extent that is - * currently live. - * - * Thus, in addition to the current size of the extent in struct bkey, we need - * to store the size of the originally allocated space - this is the - * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, - * when the extent is trimmed, instead of modifying the offset field of the - * pointer, we keep a second smaller offset field - "offset into the original - * extent of the currently live region". - * - * The other major determining factor is replication and data migration: - * - * Each pointer may have its own bch_extent_crc32/64. When doing a replicated - * write, we will initially write all the replicas in the same format, with the - * same checksum type and compression format - however, when copygc runs later (or - * tiering/cache promotion, anything that moves data), it is not in general - * going to rewrite all the pointers at once - one of the replicas may be in a - * bucket on one device that has very little fragmentation while another lives - * in a bucket that has become heavily fragmented, and thus is being rewritten - * sooner than the rest. - * - * Thus it will only move a subset of the pointers (or in the case of - * tiering/cache promotion perhaps add a single pointer without dropping any - * current pointers), and if the extent has been partially overwritten it must - * write only the currently live portion (or copygc would not be able to reduce - * fragmentation!) - which necessitates a different bch_extent_crc format for - * the new pointer. - * - * But in the interests of space efficiency, we don't want to store one - * bch_extent_crc for each pointer if we don't have to. - * - * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and - * bch_extent_ptrs appended arbitrarily one after the other. We determine the - * type of a given entry with a scheme similar to utf8 (except we're encoding a - * type, not a size), encoding the type in the position of the first set bit: - * - * bch_extent_crc32 - 0b1 - * bch_extent_ptr - 0b10 - * bch_extent_crc64 - 0b100 - * - * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and - * bch_extent_crc64 is the least constrained). - * - * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, - * until the next bch_extent_crc32/64. - * - * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer - * is neither checksummed nor compressed. - */ - /* 128 bits, sufficient for cryptographic MACs: */ struct bch_csum { __le64 lo; __le64 hi; } __packed __aligned(8); -#define BCH_EXTENT_ENTRY_TYPES() \ - x(ptr, 0) \ - x(crc32, 1) \ - x(crc64, 2) \ - x(crc128, 3) \ - x(stripe_ptr, 4) \ - x(rebalance, 5) -#define BCH_EXTENT_ENTRY_MAX 6 - -enum bch_extent_entry_type { -#define x(f, n) BCH_EXTENT_ENTRY_##f = n, - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -/* Compressed/uncompressed size are stored biased by 1: */ -struct bch_extent_crc32 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u32 type:2, - _compressed_size:7, - _uncompressed_size:7, - offset:7, - _unused:1, - csum_type:4, - compression_type:4; - __u32 csum; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u32 csum; - __u32 compression_type:4, - csum_type:4, - _unused:1, - offset:7, - _uncompressed_size:7, - _compressed_size:7, - type:2; -#endif -} __packed __aligned(8); - -#define CRC32_SIZE_MAX (1U << 7) -#define CRC32_NONCE_MAX 0 - -struct bch_extent_crc64 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:3, - _compressed_size:9, - _uncompressed_size:9, - offset:9, - nonce:10, - csum_type:4, - compression_type:4, - csum_hi:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 csum_hi:16, - compression_type:4, - csum_type:4, - nonce:10, - offset:9, - _uncompressed_size:9, - _compressed_size:9, - type:3; -#endif - __u64 csum_lo; -} __packed __aligned(8); - -#define CRC64_SIZE_MAX (1U << 9) -#define CRC64_NONCE_MAX ((1U << 10) - 1) - -struct bch_extent_crc128 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:4, - _compressed_size:13, - _uncompressed_size:13, - offset:13, - nonce:13, - csum_type:4, - compression_type:4; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 compression_type:4, - csum_type:4, - nonce:13, - offset:13, - _uncompressed_size:13, - _compressed_size:13, - type:4; -#endif - struct bch_csum csum; -} __packed __aligned(8); - -#define CRC128_SIZE_MAX (1U << 13) -#define CRC128_NONCE_MAX ((1U << 13) - 1) - -/* - * @reservation - pointer hasn't been written to, just reserved - */ -struct bch_extent_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:1, - cached:1, - unused:1, - unwritten:1, - offset:44, /* 8 petabytes */ - dev:8, - gen:8; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 gen:8, - dev:8, - offset:44, - unwritten:1, - unused:1, - cached:1, - type:1; -#endif -} __packed __aligned(8); - -struct bch_extent_stripe_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:5, - block:8, - redundancy:4, - idx:47; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 idx:47, - redundancy:4, - block:8, - type:5; -#endif -}; - -struct bch_extent_rebalance { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:34, - compression:8, /* enum bch_compression_opt */ - target:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 target:16, - compression:8, - unused:34, - type:6; -#endif -}; - -union bch_extent_entry { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 - unsigned long type; -#elif __BITS_PER_LONG == 32 - struct { - unsigned long pad; - unsigned long type; - }; -#else -#error edit for your odd byteorder. -#endif - -#define x(f, n) struct bch_extent_##f f; - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -struct bch_btree_ptr { - struct bch_val v; - - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -struct bch_btree_ptr_v2 { - struct bch_val v; - - __u64 mem_ptr; - __le64 seq; - __le16 sectors_written; - __le16 flags; - struct bpos min_key; - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); - -struct bch_extent { - struct bch_val v; - - __u64 _data[0]; - union bch_extent_entry start[]; -} __packed __aligned(8); - -struct bch_reservation { - struct bch_val v; - - __le32 generation; - __u8 nr_replicas; - __u8 pad[3]; -} __packed __aligned(8); - -/* Maximum size (in u64s) a single pointer could be: */ -#define BKEY_EXTENT_PTR_U64s_MAX\ - ((sizeof(struct bch_extent_crc128) + \ - sizeof(struct bch_extent_ptr)) / sizeof(__u64)) - -/* Maximum possible size of an entire extent value: */ -#define BKEY_EXTENT_VAL_U64s_MAX \ - (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) - -/* * Maximum possible size of an entire extent, key + value: */ -#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) - -/* Btree pointers don't carry around checksums: */ -#define BKEY_BTREE_PTR_VAL_U64s_MAX \ - ((sizeof(struct bch_btree_ptr_v2) + \ - sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) -#define BKEY_BTREE_PTR_U64s_MAX \ - (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) - -/* Inodes */ - -#define BLOCKDEV_INODE_MAX 4096 - -#define BCACHEFS_ROOT_INO 4096 - -struct bch_inode { - struct bch_val v; - - __le64 bi_hash_seed; - __le32 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct bch_inode_v2 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct bch_inode_v3 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le64 bi_sectors; - __le64 bi_size; - __le64 bi_version; - __u8 fields[]; -} __packed __aligned(8); - -#define INODEv3_FIELDS_START_INITIAL 6 -#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) - -struct bch_inode_generation { - struct bch_val v; - - __le32 bi_generation; - __le32 pad; -} __packed __aligned(8); - -/* - * bi_subvol and bi_parent_subvol are only set for subvolume roots: - */ - -#define BCH_INODE_FIELDS_v2() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_size, 64) \ - x(bi_sectors, 64) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) - -#define BCH_INODE_FIELDS_v3() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) \ - x(bi_nocow, 8) - -/* subset of BCH_INODE_FIELDS */ -#define BCH_INODE_OPTS() \ - x(data_checksum, 8) \ - x(compression, 8) \ - x(project, 32) \ - x(background_compression, 8) \ - x(data_replicas, 8) \ - x(promote_target, 16) \ - x(foreground_target, 16) \ - x(background_target, 16) \ - x(erasure_code, 16) \ - x(nocow, 8) - -enum inode_opt_id { -#define x(name, ...) \ - Inode_opt_##name, - BCH_INODE_OPTS() -#undef x - Inode_opt_nr, -}; - -#define BCH_INODE_FLAGS() \ - x(sync, 0) \ - x(immutable, 1) \ - x(append, 2) \ - x(nodump, 3) \ - x(noatime, 4) \ - x(i_size_dirty, 5) \ - x(i_sectors_dirty, 6) \ - x(unlinked, 7) \ - x(backptr_untrusted, 8) - -/* bits 20+ reserved for packed fields below: */ - -enum bch_inode_flags { -#define x(t, n) BCH_INODE_##t = 1U << n, - BCH_INODE_FLAGS() -#undef x -}; - -enum __bch_inode_flags { -#define x(t, n) __BCH_INODE_##t = n, - BCH_INODE_FLAGS() -#undef x -}; - -LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); -LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); - -LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); -LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); -LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_FIELDS_START, - struct bch_inode_v3, bi_flags, 31, 36); -LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); - -/* Dirents */ - -/* - * Dirents (and xattrs) have to implement string lookups; since our b-tree - * doesn't support arbitrary length strings for the key, we instead index by a - * 64 bit hash (currently truncated sha1) of the string, stored in the offset - * field of the key - using linear probing to resolve hash collisions. This also - * provides us with the readdir cookie posix requires. - * - * Linear probing requires us to use whiteouts for deletions, in the event of a - * collision: - */ - -struct bch_dirent { - struct bch_val v; - - /* Target inode number: */ - union { - __le64 d_inum; - struct { /* DT_SUBVOL */ - __le32 d_child_subvol; - __le32 d_parent_subvol; - }; - }; - - /* - * Copy of mode bits 12-15 from the target inode - so userspace can get - * the filetype without having to do a stat() - */ - __u8 d_type; - - __u8 d_name[]; -} __packed __aligned(8); - -#define DT_SUBVOL 16 -#define BCH_DT_MAX 17 - -#define BCH_NAME_MAX 512 - -/* Xattrs */ - -#define KEY_TYPE_XATTR_INDEX_USER 0 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 -#define KEY_TYPE_XATTR_INDEX_SECURITY 4 - -struct bch_xattr { - struct bch_val v; - __u8 x_type; - __u8 x_name_len; - __le16 x_val_len; - __u8 x_name[]; -} __packed __aligned(8); - -/* Bucket/allocation information: */ - -struct bch_alloc { - struct bch_val v; - __u8 fields; - __u8 gen; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V1() \ - x(read_time, 16) \ - x(write_time, 16) \ - x(data_type, 8) \ - x(dirty_sectors, 16) \ - x(cached_sectors, 16) \ - x(oldest_gen, 8) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -enum { -#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, - BCH_ALLOC_FIELDS_V1() -#undef x -}; - -struct bch_alloc_v2 { - struct bch_val v; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V2() \ - x(read_time, 64) \ - x(write_time, 64) \ - x(dirty_sectors, 32) \ - x(cached_sectors, 32) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -struct bch_alloc_v3 { - struct bch_val v; - __le64 journal_seq; - __le32 flags; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) -LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) - -struct bch_alloc_v4 { - struct bch_val v; - __u64 journal_seq; - __u32 flags; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 stripe_redundancy; - __u32 dirty_sectors; - __u32 cached_sectors; - __u64 io_time[2]; - __u32 stripe; - __u32 nr_external_backpointers; - __u64 fragmentation_lru; -} __packed __aligned(8); - -#define BCH_ALLOC_V4_U64s_V0 6 -#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) - -BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) -BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) -BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) -BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) - -#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40 - struct bch_backpointer { struct bch_val v; __u8 btree_id; @@ -1021,154 +433,6 @@ struct bch_backpointer { struct bpos pos; } __packed __aligned(8); -#define KEY_TYPE_BUCKET_GENS_BITS 8 -#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) -#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) - -struct bch_bucket_gens { - struct bch_val v; - u8 gens[KEY_TYPE_BUCKET_GENS_NR]; -} __packed __aligned(8); - -/* Quotas: */ - -enum quota_types { - QTYP_USR = 0, - QTYP_GRP = 1, - QTYP_PRJ = 2, - QTYP_NR = 3, -}; - -enum quota_counters { - Q_SPC = 0, - Q_INO = 1, - Q_COUNTERS = 2, -}; - -struct bch_quota_counter { - __le64 hardlimit; - __le64 softlimit; -}; - -struct bch_quota { - struct bch_val v; - struct bch_quota_counter c[Q_COUNTERS]; -} __packed __aligned(8); - -/* Erasure coding */ - -struct bch_stripe { - struct bch_val v; - __le16 sectors; - __u8 algorithm; - __u8 nr_blocks; - __u8 nr_redundant; - - __u8 csum_granularity_bits; - __u8 csum_type; - __u8 pad; - - struct bch_extent_ptr ptrs[]; -} __packed __aligned(8); - -/* Reflink: */ - -struct bch_reflink_p { - struct bch_val v; - __le64 idx; - /* - * A reflink pointer might point to an indirect extent which is then - * later split (by copygc or rebalance). If we only pointed to part of - * the original indirect extent, and then one of the fragments is - * outside the range we point to, we'd leak a refcount: so when creating - * reflink pointers, we need to store pad values to remember the full - * range we were taking a reference on. - */ - __le32 front_pad; - __le32 back_pad; -} __packed __aligned(8); - -struct bch_reflink_v { - struct bch_val v; - __le64 refcount; - union bch_extent_entry start[0]; - __u64 _data[]; -} __packed __aligned(8); - -struct bch_indirect_inline_data { - struct bch_val v; - __le64 refcount; - u8 data[]; -}; - -/* Inline data */ - -struct bch_inline_data { - struct bch_val v; - u8 data[]; -}; - -/* Subvolumes: */ - -#define SUBVOL_POS_MIN POS(0, 1) -#define SUBVOL_POS_MAX POS(0, S32_MAX) -#define BCACHEFS_ROOT_SUBVOL 1 - -struct bch_subvolume { - struct bch_val v; - __le32 flags; - __le32 snapshot; - __le64 inode; - /* - * Snapshot subvolumes form a tree, separate from the snapshot nodes - * tree - if this subvolume is a snapshot, this is the ID of the - * subvolume it was created from: - */ - __le32 parent; - __le32 pad; - bch_le128 otime; -}; - -LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) -/* - * We need to know whether a subvolume is a snapshot so we can know whether we - * can delete it (or whether it should just be rm -rf'd) - */ -LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) -LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) - -/* Snapshots */ - -struct bch_snapshot { - struct bch_val v; - __le32 flags; - __le32 parent; - __le32 children[2]; - __le32 subvol; - /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ - __le32 tree; - __le32 depth; - __le32 skip[3]; -}; - -LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) - -/* True if a subvolume points to this snapshot node: */ -LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) - -/* - * Snapshot trees: - * - * The snapshot_trees btree gives us persistent indentifier for each tree of - * bch_snapshot nodes, and allow us to record and easily find the root/master - * subvolume that other snapshots were created from: - */ -struct bch_snapshot_tree { - struct bch_val v; - __le32 master_subvol; - __le32 root_snapshot; -}; - /* LRU btree: */ struct bch_lru { @@ -1178,33 +442,6 @@ struct bch_lru { #define LRU_ID_STRIPES (1U << 16) -/* Logged operations btree: */ - -struct bch_logged_op_truncate { - struct bch_val v; - __le32 subvol; - __le32 pad; - __le64 inum; - __le64 new_i_size; -}; - -enum logged_op_finsert_state { - LOGGED_OP_FINSERT_start, - LOGGED_OP_FINSERT_shift_extents, - LOGGED_OP_FINSERT_finish, -}; - -struct bch_logged_op_finsert { - struct bch_val v; - __u8 state; - __u8 pad[3]; - __le32 subvol; - __le64 inum; - __le64 dst_offset; - __le64 src_offset; - __le64 pos; -}; - /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1230,6 +467,19 @@ struct bch_sb_field { x(ext, 13) \ x(downgrade, 14) +#include "alloc_background_format.h" +#include "extents_format.h" +#include "reflink_format.h" +#include "ec_format.h" +#include "inode_format.h" +#include "dirent_format.h" +#include "xattr_format.h" +#include "quota_format.h" +#include "logged_ops_format.h" +#include "snapshot_format.h" +#include "subvolume_format.h" +#include "sb-counters_format.h" + enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, BCH_SB_FIELDS() @@ -1465,23 +715,6 @@ struct bch_sb_field_replicas { struct bch_replicas_entry_v1 entries[]; } __packed __aligned(8); -/* BCH_SB_FIELD_quota: */ - -struct bch_sb_quota_counter { - __le32 timelimit; - __le32 warnlimit; -}; - -struct bch_sb_quota_type { - __le64 flags; - struct bch_sb_quota_counter c[Q_COUNTERS]; -}; - -struct bch_sb_field_quota { - struct bch_sb_field field; - struct bch_sb_quota_type q[QTYP_NR]; -} __packed __aligned(8); - /* BCH_SB_FIELD_disk_groups: */ #define BCH_SB_LABEL_SIZE 32 @@ -1500,101 +733,6 @@ struct bch_sb_field_disk_groups { struct bch_disk_group entries[]; } __packed __aligned(8); -/* BCH_SB_FIELD_counters */ - -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0) \ - x(io_write, 1) \ - x(io_move, 2) \ - x(bucket_invalidate, 3) \ - x(bucket_discard, 4) \ - x(bucket_alloc, 5) \ - x(bucket_alloc_fail, 6) \ - x(btree_cache_scan, 7) \ - x(btree_cache_reap, 8) \ - x(btree_cache_cannibalize, 9) \ - x(btree_cache_cannibalize_lock, 10) \ - x(btree_cache_cannibalize_lock_fail, 11) \ - x(btree_cache_cannibalize_unlock, 12) \ - x(btree_node_write, 13) \ - x(btree_node_read, 14) \ - x(btree_node_compact, 15) \ - x(btree_node_merge, 16) \ - x(btree_node_split, 17) \ - x(btree_node_rewrite, 18) \ - x(btree_node_alloc, 19) \ - x(btree_node_free, 20) \ - x(btree_node_set_root, 21) \ - x(btree_path_relock_fail, 22) \ - x(btree_path_upgrade_fail, 23) \ - x(btree_reserve_get_fail, 24) \ - x(journal_entry_full, 25) \ - x(journal_full, 26) \ - x(journal_reclaim_finish, 27) \ - x(journal_reclaim_start, 28) \ - x(journal_write, 29) \ - x(read_promote, 30) \ - x(read_bounce, 31) \ - x(read_split, 33) \ - x(read_retry, 32) \ - x(read_reuse_race, 34) \ - x(move_extent_read, 35) \ - x(move_extent_write, 36) \ - x(move_extent_finish, 37) \ - x(move_extent_fail, 38) \ - x(move_extent_start_fail, 39) \ - x(copygc, 40) \ - x(copygc_wait, 41) \ - x(gc_gens_end, 42) \ - x(gc_gens_start, 43) \ - x(trans_blocked_journal_reclaim, 44) \ - x(trans_restart_btree_node_reused, 45) \ - x(trans_restart_btree_node_split, 46) \ - x(trans_restart_fault_inject, 47) \ - x(trans_restart_iter_upgrade, 48) \ - x(trans_restart_journal_preres_get, 49) \ - x(trans_restart_journal_reclaim, 50) \ - x(trans_restart_journal_res_get, 51) \ - x(trans_restart_key_cache_key_realloced, 52) \ - x(trans_restart_key_cache_raced, 53) \ - x(trans_restart_mark_replicas, 54) \ - x(trans_restart_mem_realloced, 55) \ - x(trans_restart_memory_allocation_failure, 56) \ - x(trans_restart_relock, 57) \ - x(trans_restart_relock_after_fill, 58) \ - x(trans_restart_relock_key_cache_fill, 59) \ - x(trans_restart_relock_next_node, 60) \ - x(trans_restart_relock_parent_for_fill, 61) \ - x(trans_restart_relock_path, 62) \ - x(trans_restart_relock_path_intent, 63) \ - x(trans_restart_too_many_iters, 64) \ - x(trans_restart_traverse, 65) \ - x(trans_restart_upgrade, 66) \ - x(trans_restart_would_deadlock, 67) \ - x(trans_restart_would_deadlock_write, 68) \ - x(trans_restart_injected, 69) \ - x(trans_restart_key_cache_upgrade, 70) \ - x(trans_traverse_all, 71) \ - x(transaction_commit, 72) \ - x(write_super, 73) \ - x(trans_restart_would_deadlock_recursion_limit, 74) \ - x(trans_restart_write_buffer_flush, 75) \ - x(trans_restart_split_race, 76) \ - x(write_buffer_flush_slowpath, 77) \ - x(write_buffer_flush_sync, 78) - -enum bch_persistent_counters { -#define x(t, n, ...) BCH_COUNTER_##t, - BCH_PERSISTENT_COUNTERS() -#undef x - BCH_COUNTER_NR -}; - -struct bch_sb_field_counters { - struct bch_sb_field field; - __le64 d[]; -}; - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index abdb05507d16..76e79a15ba08 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -33,7 +33,7 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out, next_key_bits -= 64; } - bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits)); + bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits)); if (!next_key_bits) break; diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 761f5e33b1e6..5e52684764eb 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -63,8 +63,17 @@ static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, return 0; } +static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k); + + prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie)); +} + #define bch2_bkey_ops_cookie ((struct bkey_ops) { \ .key_invalid = key_type_cookie_invalid, \ + .val_to_text = key_type_cookie_to_text, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index ee82283722b7..03efe8ee565a 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -83,9 +83,10 @@ enum btree_update_flags { __BTREE_TRIGGER_NORUN, __BTREE_TRIGGER_TRANSACTIONAL, + __BTREE_TRIGGER_ATOMIC, + __BTREE_TRIGGER_GC, __BTREE_TRIGGER_INSERT, __BTREE_TRIGGER_OVERWRITE, - __BTREE_TRIGGER_GC, __BTREE_TRIGGER_BUCKET_INVALIDATE, }; @@ -107,6 +108,10 @@ enum btree_update_flags { * causing us to go emergency read-only) */ #define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL) +#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC) + +/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */ +#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) /* @new is entering the btree */ #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) @@ -114,9 +119,6 @@ enum btree_update_flags { /* @old is leaving the btree */ #define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) -/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */ -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) - /* signal from bucket invalidate path to alloc trigger */ #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 74bf8eb90a4c..3fd1085b6c61 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -720,7 +720,7 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); struct bkey_i min_key, max_key; - unsigned j, cacheline = 1; + unsigned cacheline = 1; t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), bset_ro_tree_capacity(b, t)); @@ -823,13 +823,12 @@ void bch2_bset_init_first(struct btree *b, struct bset *i) set_btree_bset(b, t, i); } -void bch2_bset_init_next(struct bch_fs *c, struct btree *b, - struct btree_node_entry *bne) +void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne) { struct bset *i = &bne->keys; struct bset_tree *t; - BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); + BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b)); BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); BUG_ON(b->nsets >= MAX_BSETS); diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 632c2b8c5460..79c77baaa383 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -264,8 +264,7 @@ static inline struct bset *bset_next_set(struct btree *b, void bch2_btree_keys_init(struct btree *); void bch2_bset_init_first(struct btree *, struct bset *); -void bch2_bset_init_next(struct bch_fs *, struct btree *, - struct btree_node_entry *); +void bch2_bset_init_next(struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); void bch2_bset_insert(struct btree *, struct btree_node_iter *, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 8e2488a4b58d..d7c81beac14a 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) clear_btree_node_just_written(b); - kvpfree(b->data, btree_bytes(c)); + kvpfree(b->data, btree_buf_bytes(b)); b->data = NULL; #ifdef __KERNEL__ kvfree(b->aux_data); @@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); - b->data = kvpmalloc(btree_bytes(c), gfp); + b->data = kvpmalloc(btree_buf_bytes(b), gfp); if (!b->data) return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ @@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = NULL; #endif if (!b->aux_data) { - kvpfree(b->data, btree_bytes(c)); + kvpfree(b->data, btree_buf_bytes(b)); b->data = NULL; return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } @@ -126,7 +126,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) bkey_btree_ptr_init(&b->key); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); - b->byte_order = ilog2(btree_bytes(c)); + b->byte_order = ilog2(c->opts.btree_node_size); return b; } @@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (c->verify_data) list_move(&c->verify_data->list, &bc->live); - kvpfree(c->verify_ondisk, btree_bytes(c)); + kvpfree(c->verify_ondisk, c->opts.btree_node_size); for (i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); @@ -1192,7 +1192,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc " failed unpacked %zu\n", b->unpack_fn_len, b->nr.live_u64s * sizeof(u64), - btree_bytes(c) - sizeof(struct btree_node), + btree_buf_bytes(b) - sizeof(struct btree_node), b->nr.live_u64s * 100 / btree_max_u64s(c), b->sib_u64s[0], b->sib_u64s[1], diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 4e1af5882052..6d33885fdbde 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -74,22 +74,27 @@ static inline bool btree_node_hashed(struct btree *b) _iter = 0; _iter < (_tbl)->size; _iter++) \ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) -static inline size_t btree_bytes(struct bch_fs *c) +static inline size_t btree_buf_bytes(const struct btree *b) { - return c->opts.btree_node_size; + return 1UL << b->byte_order; } -static inline size_t btree_max_u64s(struct bch_fs *c) +static inline size_t btree_buf_max_u64s(const struct btree *b) { - return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); + return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64); } -static inline size_t btree_pages(struct bch_fs *c) +static inline size_t btree_max_u64s(const struct bch_fs *c) { - return btree_bytes(c) / PAGE_SIZE; + return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64); } -static inline unsigned btree_blocks(struct bch_fs *c) +static inline size_t btree_sectors(const struct bch_fs *c) +{ + return c->opts.btree_node_size >> SECTOR_SHIFT; +} + +static inline unsigned btree_blocks(const struct bch_fs *c) { return btree_sectors(c) >> c->block_bits; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 49b4ade758c3..1102995643b1 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -597,7 +597,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { @@ -615,7 +615,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { @@ -637,7 +637,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) @@ -649,7 +649,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) @@ -664,8 +664,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[g->data_type], - bch2_data_types[data_type], + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (data_type == BCH_DATA_btree) { @@ -1238,11 +1238,11 @@ static int bch2_gc_done(struct bch_fs *c, for (i = 0; i < BCH_DATA_NR; i++) { copy_dev_field(dev_usage_buckets_wrong, - d[i].buckets, "%s buckets", bch2_data_types[i]); + d[i].buckets, "%s buckets", bch2_data_type_str(i)); copy_dev_field(dev_usage_sectors_wrong, - d[i].sectors, "%s sectors", bch2_data_types[i]); + d[i].sectors, "%s sectors", bch2_data_type_str(i)); copy_dev_field(dev_usage_fragmented_wrong, - d[i].fragmented, "%s fragmented", bch2_data_types[i]); + d[i].fragmented, "%s fragmented", bch2_data_type_str(i)); } } @@ -1253,19 +1253,19 @@ static int bch2_gc_done(struct bch_fs *c, bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); copy_fs_field(fs_usage_hidden_wrong, - hidden, "hidden"); + b.hidden, "hidden"); copy_fs_field(fs_usage_btree_wrong, - btree, "btree"); + b.btree, "btree"); if (!metadata_only) { copy_fs_field(fs_usage_data_wrong, - data, "data"); + b.data, "data"); copy_fs_field(fs_usage_cached_wrong, - cached, "cached"); + b.cached, "cached"); copy_fs_field(fs_usage_reserved_wrong, - reserved, "reserved"); + b.reserved, "reserved"); copy_fs_field(fs_usage_nr_inodes_wrong, - nr_inodes,"nr_inodes"); + b.nr_inodes,"nr_inodes"); for (i = 0; i < BCH_REPLICAS_MAX; i++) copy_fs_field(fs_usage_persistent_reserved_wrong, @@ -1417,8 +1417,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, ": got %s, should be %s", iter->pos.inode, iter->pos.offset, gc.gen, - bch2_data_types[new.data_type], - bch2_data_types[gc.data_type])) + bch2_data_type_str(new.data_type), + bch2_data_type_str(gc.data_type))) new.data_type = gc.data_type; #define copy_bucket_field(_errtype, _f) \ @@ -1428,7 +1428,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ gc.gen, \ - bch2_data_types[gc.data_type], \ + bch2_data_type_str(gc.data_type), \ new._f, gc._f)) \ new._f = gc._f; \ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 33db48e2153f..aa9b6cbe3226 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -112,7 +112,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, unsigned flags = memalloc_nofs_save(); void *p; - BUG_ON(size > btree_bytes(c)); + BUG_ON(size > c->opts.btree_node_size); *used_mempool = false; p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); @@ -174,8 +174,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) ptrs = ptrs_end = ((void *) new_whiteouts + bytes); - for (k = unwritten_whiteouts_start(c, b); - k != unwritten_whiteouts_end(c, b); + for (k = unwritten_whiteouts_start(b); + k != unwritten_whiteouts_end(b); k = bkey_p_next(k)) *--ptrs = k; @@ -192,7 +192,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) verify_no_dups(b, new_whiteouts, (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); - memcpy_u64s(unwritten_whiteouts_start(c, b), + memcpy_u64s(unwritten_whiteouts_start(b), new_whiteouts, b->whiteout_u64s); btree_bounce_free(c, bytes, used_mempool, new_whiteouts); @@ -313,7 +313,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, } bytes = sorting_entire_node - ? btree_bytes(c) + ? btree_buf_bytes(b) : __vstruct_bytes(struct btree_node, u64s); out = btree_bounce_alloc(c, bytes, &used_mempool); @@ -338,7 +338,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, if (sorting_entire_node) { u64s = le16_to_cpu(out->keys.u64s); - BUG_ON(bytes != btree_bytes(c)); + BUG_ON(bytes != btree_buf_bytes(b)); /* * Our temporary buffer is the same size as the btree node's @@ -502,7 +502,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(c, b, bne); + bch2_bset_init_next(b, bne); bch2_btree_build_aux_trees(b); @@ -1160,7 +1160,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ptr_written, b->written); } else { for (bne = write_block(b); - bset_byte_offset(b, bne) < btree_bytes(c); + bset_byte_offset(b, bne) < btree_buf_bytes(b); bne = (void *) bne + block_bytes(c)) btree_err_on(bne->keys.seq == b->data->keys.seq && !bch2_journal_seq_is_blacklisted(c, @@ -1172,7 +1172,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "found bset signature after last bset"); } - sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); + sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool); sorted->keys.u64s = 0; set_btree_bset(b, b->set, &b->data->keys); @@ -1188,7 +1188,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, BUG_ON(b->nr.live_u64s != u64s); - btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); + btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); if (updated_range) bch2_btree_node_drop_keys_outside_node(b); @@ -1284,7 +1284,7 @@ static void btree_node_read_work(struct work_struct *work) rb->have_ioref = bch2_dev_get_ioref(ca, READ); bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; - bio->bi_iter.bi_size = btree_bytes(c); + bio->bi_iter.bi_size = btree_buf_bytes(b); if (rb->have_ioref) { bio_set_dev(bio, ca->disk_sb.bdev); @@ -1512,7 +1512,7 @@ fsck_err: } if (best >= 0) { - memcpy(b->data, ra->buf[best], btree_bytes(c)); + memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); } else { ret = -1; @@ -1578,7 +1578,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool for (i = 0; i < ra->nr; i++) { ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); ra->bio[i] = bio_alloc_bioset(NULL, - buf_pages(ra->buf[i], btree_bytes(c)), + buf_pages(ra->buf[i], btree_buf_bytes(b)), REQ_OP_READ|REQ_SYNC|REQ_META, GFP_NOFS, &c->btree_bio); @@ -1598,7 +1598,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool rb->pick = pick; rb->bio.bi_iter.bi_sector = pick.ptr.offset; rb->bio.bi_end_io = btree_node_read_all_replicas_endio; - bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); + bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b)); if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], @@ -1665,7 +1665,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, ca = bch_dev_bkey_exists(c, pick.ptr.dev); bio = bio_alloc_bioset(NULL, - buf_pages(b->data, btree_bytes(c)), + buf_pages(b->data, btree_buf_bytes(b)), REQ_OP_READ|REQ_SYNC|REQ_META, GFP_NOFS, &c->btree_bio); @@ -1679,7 +1679,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, INIT_WORK(&rb->work, btree_node_read_work); bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_end_io = btree_node_read_endio; - bch2_bio_map(bio, b->data, btree_bytes(c)); + bch2_bio_map(bio, b->data, btree_buf_bytes(b)); if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], @@ -2074,8 +2074,8 @@ do_write: i->u64s = 0; sort_iter_add(&sort_iter.iter, - unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b)); + unwritten_whiteouts_start(b), + unwritten_whiteouts_end(b)); SET_BSET_SEPARATE_WHITEOUTS(i, false); b->whiteout_u64s = 0; @@ -2251,7 +2251,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(c, b, bne); + bch2_bset_init_next(b, bne); bch2_btree_build_aux_trees(b); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index fa298289e016..5467a8635be1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1337,7 +1337,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in if (path->should_be_locked && !trans->restarted && - (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_))) + (!dup || !bch2_btree_path_relock_norestart(trans, dup))) return; if (dup) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index da2b74fa63fc..24772538e4cc 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -819,6 +819,11 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) +/* + * This should not be used in a fastpath, without first trying _do in + * nonblocking mode - it will cause excessive transaction restarts and + * potentially livelocking: + */ #define drop_locks_do(_trans, _do) \ ({ \ bch2_trans_unlock(_trans); \ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 2d1c95c42f24..bed75c93c069 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -631,8 +631,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, } __flatten -bool bch2_btree_path_relock_norestart(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) +bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path) { struct get_locks_fail f; @@ -642,7 +641,7 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, int __bch2_btree_path_relock(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { - if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { + if (!bch2_btree_path_relock_norestart(trans, path)) { trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); } @@ -759,12 +758,39 @@ int bch2_trans_relock(struct btree_trans *trans) if (unlikely(trans->restarted)) return -((int) trans->restarted); - trans_for_each_path(trans, path, i) + trans_for_each_path(trans, path, i) { + struct get_locks_fail f; + if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { - trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); + !btree_path_get_locks(trans, path, false, &f)) { + if (trace_trans_restart_relock_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, path->pos); + prt_printf(&buf, " l=%u seq=%u node seq=", + f.l, path->l[f.l].lock_seq); + if (IS_ERR_OR_NULL(f.b)) { + prt_str(&buf, bch2_err_str(PTR_ERR(f.b))); + } else { + prt_printf(&buf, "%u", f.b->c.lock.seq); + + struct six_lock_count c = + bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l); + prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); + + c = six_lock_counts(&f.b->c.lock); + prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); + } + + trace_trans_restart_relock(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); + } + + count_event(trans->c, trans_restart_relock); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } + } + return 0; } @@ -778,7 +804,7 @@ int bch2_trans_relock_notrace(struct btree_trans *trans) trans_for_each_path(trans, path, i) if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + !bch2_btree_path_relock_norestart(trans, path)) { return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } return 0; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index cc5500a957a1..4bd72c855da1 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -312,8 +312,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *, /* relock: */ -bool bch2_btree_path_relock_norestart(struct btree_trans *, - struct btree_path *, unsigned long); +bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *); int __bch2_btree_path_relock(struct btree_trans *, struct btree_path *, unsigned long); @@ -353,12 +352,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, /* upgrade */ - -struct get_locks_fail { - unsigned l; - struct btree *b; -}; - bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, struct btree_path *, unsigned, struct get_locks_fail *); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 90eb8065ff2d..30d69a6d133e 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -139,8 +139,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); - EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(trans->c, b)); + EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); k = bch2_btree_node_iter_peek_all(node_iter, b); @@ -160,7 +159,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, k->type = KEY_TYPE_deleted; if (k->needs_whiteout) - push_whiteout(trans->c, b, insert->k.p); + push_whiteout(b, insert->k.p); k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { @@ -348,9 +347,7 @@ static noinline void journal_transaction_name(struct btree_trans *trans) static inline int btree_key_can_insert(struct btree_trans *trans, struct btree *b, unsigned u64s) { - struct bch_fs *c = trans->c; - - if (!bch2_btree_node_insert_fits(c, b, u64s)) + if (!bch2_btree_node_insert_fits(b, u64s)) return -BCH_ERR_btree_insert_btree_node_full; return 0; @@ -418,7 +415,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags return 0; new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_k)) return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); @@ -448,9 +445,6 @@ static int run_one_mem_trigger(struct btree_trans *trans, if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; - if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id))) - return 0; - if (old_ops->trigger == new_ops->trigger) { ret = bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(new), @@ -586,9 +580,6 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { - struct bch_fs *c = trans->c; - int ret = 0; - trans_for_each_update(trans, i) { /* * XXX: synchronization of cached update triggers with gc @@ -596,14 +587,15 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) */ BUG_ON(i->cached || i->level); - if (gc_visited(c, gc_pos_btree_node(insert_l(trans, i)->b))) { - ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) && + gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { + int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); if (ret) - break; + return ret; } } - return ret; + return 0; } static inline int @@ -680,6 +672,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) return -BCH_ERR_btree_insert_need_mark_replicas; + /* XXX: we only want to run this if deltas are nonzero */ + bch2_trans_account_disk_usage_change(trans); + h = trans->hooks; while (h) { ret = h->fn(trans, h); @@ -689,8 +684,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, i->flags); + if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) { + ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags); if (ret) goto fatal_err; } @@ -994,6 +989,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) !trans->journal_entries_u64s) goto out_reset; + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); + ret = bch2_trans_commit_run_triggers(trans); if (ret) goto out_reset; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d530307046f4..4a5a64499eb7 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -430,6 +430,9 @@ struct btree_trans { struct journal_res journal_res; u64 *journal_seq; struct disk_reservation *disk_res; + + struct bch_fs_usage_base fs_usage_delta; + unsigned journal_u64s; unsigned extra_disk_res; /* XXX kill */ struct replicas_delta_list *fs_usage_deltas; @@ -653,7 +656,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); BIT_ULL(BKEY_TYPE_reflink)| \ BIT_ULL(BKEY_TYPE_btree)) -#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ +#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ (BIT_ULL(BKEY_TYPE_alloc)| \ BIT_ULL(BKEY_TYPE_inodes)| \ BIT_ULL(BKEY_TYPE_stripes)| \ @@ -661,7 +664,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); #define BTREE_NODE_TYPE_HAS_TRIGGERS \ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ - BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS) static inline bool btree_node_type_needs_gc(enum btree_node_type type) { @@ -738,4 +741,9 @@ enum btree_node_sibling { btree_next_sib, }; +struct get_locks_fail { + unsigned l; + struct btree *b; +}; + #endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 44f9dfa28a09..17a5938aa71a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -159,7 +159,7 @@ static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, { size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f); - return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); + return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b); } /* Btree node freeing/allocation: */ @@ -1097,7 +1097,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * Always check for space for two keys, even if we won't have to * split at prior level - it might have been a merge instead: */ - if (bch2_btree_node_insert_fits(c, path->l[update_level].b, + if (bch2_btree_node_insert_fits(path->l[update_level].b, BKEY_BTREE_PTR_U64s_MAX * 2)) break; @@ -1401,7 +1401,7 @@ static void __btree_split_node(struct btree_update *as, unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s + nr_keys[i].val_u64s; - if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c)) + if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b)) n[i]->data->format = b->format; btree_node_set_format(n[i], n[i]->data->format); @@ -1703,7 +1703,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_btree_node_prep_for_write(trans, path, b); - if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { + if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) { bch2_btree_node_unlock_write(trans, path, b); goto split; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index adfc62083844..c593c925d1e3 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -184,21 +184,19 @@ static inline void btree_node_reset_sib_u64s(struct btree *b) b->sib_u64s[1] = b->nr.live_u64s; } -static inline void *btree_data_end(struct bch_fs *c, struct btree *b) +static inline void *btree_data_end(struct btree *b) { - return (void *) b->data + btree_bytes(c); + return (void *) b->data + btree_buf_bytes(b); } -static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, - struct btree *b) +static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b) { - return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); + return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s); } -static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, - struct btree *b) +static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b) { - return btree_data_end(c, b); + return btree_data_end(b); } static inline void *write_block(struct btree *b) @@ -221,13 +219,11 @@ static inline bool bkey_written(struct btree *b, struct bkey_packed *k) return __btree_addr_written(b, k); } -static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, - struct btree *b, - void *end) +static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end) { ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + b->whiteout_u64s; - ssize_t total = c->opts.btree_node_size >> 3; + ssize_t total = btree_buf_bytes(b) >> 3; /* Always leave one extra u64 for bch2_varint_decode: */ used++; @@ -235,10 +231,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, return total - used; } -static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, - struct btree *b) +static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b) { - ssize_t remaining = __bch_btree_u64s_remaining(c, b, + ssize_t remaining = __bch2_btree_u64s_remaining(b, btree_bkey_last(b, bset_tree_last(b))); BUG_ON(remaining < 0); @@ -260,14 +255,13 @@ static inline unsigned btree_write_set_buffer(struct btree *b) return 8 << BTREE_WRITE_SET_U64s_BITS; } -static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, - struct btree *b) +static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b) { struct bset_tree *t = bset_tree_last(b); struct btree_node_entry *bne = max(write_block(b), (void *) btree_bkey_last(b, bset_tree_last(b))); ssize_t remaining_space = - __bch_btree_u64s_remaining(c, b, bne->keys.start); + __bch2_btree_u64s_remaining(b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) @@ -281,12 +275,11 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, return NULL; } -static inline void push_whiteout(struct bch_fs *c, struct btree *b, - struct bpos pos) +static inline void push_whiteout(struct btree *b, struct bpos pos) { struct bkey_packed k; - BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); + BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s); EBUG_ON(btree_node_just_written(b)); if (!bkey_pack_pos(&k, pos, b)) { @@ -299,20 +292,19 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b, k.needs_whiteout = true; b->whiteout_u64s += k.u64s; - bkey_p_copy(unwritten_whiteouts_start(c, b), &k); + bkey_p_copy(unwritten_whiteouts_start(b), &k); } /* * write lock must be held on @b (else the dirty bset that we were going to * insert into could be written out from under us) */ -static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, - struct btree *b, unsigned u64s) +static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s) { if (unlikely(btree_node_need_rewrite(b))) return false; - return u64s <= bch_btree_keys_u64s_remaining(c, b); + return u64s <= bch2_btree_keys_u64s_remaining(b); } void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 5c1169c78daf..ac7844861966 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -125,13 +125,12 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite struct btree_write_buffered_key *wb, bool *write_locked, size_t *fast) { - struct bch_fs *c = trans->c; struct btree_path *path; int ret; EBUG_ON(!wb->journal_seq); - EBUG_ON(!c->btree_write_buffer.flushing.pin.seq); - EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); + EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); + EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); ret = bch2_btree_iter_traverse(iter); if (ret) @@ -155,7 +154,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite *write_locked = true; } - if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) { + if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) { *write_locked = false; return wb_flush_one_slowpath(trans, iter, wb); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d83ea0e53df3..54f7826ac498 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -25,7 +25,7 @@ #include <linux/preempt.h> -static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, +static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, enum bch_data_type data_type, s64 sectors) { @@ -54,20 +54,20 @@ void bch2_fs_usage_initialize(struct bch_fs *c) bch2_fs_usage_acc_to_base(c, i); for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) - usage->reserved += usage->persistent_reserved[i]; + usage->b.reserved += usage->persistent_reserved[i]; for (unsigned i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); - fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); + fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]); } for_each_member_device(c, ca) { struct bch_dev_usage dev = bch2_dev_usage_read(ca); - usage->hidden += (dev.d[BCH_DATA_sb].buckets + - dev.d[BCH_DATA_journal].buckets) * + usage->b.hidden += (dev.d[BCH_DATA_sb].buckets + + dev.d[BCH_DATA_journal].buckets) * ca->mi.bucket_size; } @@ -188,15 +188,15 @@ void bch2_fs_usage_to_text(struct printbuf *out, prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); prt_printf(out, "hidden:\t\t\t\t%llu\n", - fs_usage->u.hidden); + fs_usage->u.b.hidden); prt_printf(out, "data:\t\t\t\t%llu\n", - fs_usage->u.data); + fs_usage->u.b.data); prt_printf(out, "cached:\t\t\t\t%llu\n", - fs_usage->u.cached); + fs_usage->u.b.cached); prt_printf(out, "reserved:\t\t\t%llu\n", - fs_usage->u.reserved); + fs_usage->u.b.reserved); prt_printf(out, "nr_inodes:\t\t\t%llu\n", - fs_usage->u.nr_inodes); + fs_usage->u.b.nr_inodes); prt_printf(out, "online reserved:\t\t%llu\n", fs_usage->online_reserved); @@ -225,10 +225,10 @@ static u64 reserve_factor(u64 r) u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) { - return min(fs_usage->u.hidden + - fs_usage->u.btree + - fs_usage->u.data + - reserve_factor(fs_usage->u.reserved + + return min(fs_usage->u.b.hidden + + fs_usage->u.b.btree + + fs_usage->u.b.data + + reserve_factor(fs_usage->u.b.reserved + fs_usage->online_reserved), c->capacity); } @@ -240,17 +240,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c) u64 data, reserved; ret.capacity = c->capacity - - bch2_fs_usage_read_one(c, &c->usage_base->hidden); + bch2_fs_usage_read_one(c, &c->usage_base->b.hidden); - data = bch2_fs_usage_read_one(c, &c->usage_base->data) + - bch2_fs_usage_read_one(c, &c->usage_base->btree); - reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + + data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) + + bch2_fs_usage_read_one(c, &c->usage_base->b.btree); + reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) + percpu_u64_get(c->online_reserved); ret.used = min(ret.capacity, data + reserve_factor(reserved)); ret.free = ret.capacity - ret.used; - ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); + ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes); return ret; } @@ -284,7 +284,7 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) prt_newline(out); for (unsigned i = 0; i < BCH_DATA_NR; i++) { - prt_str(out, bch2_data_types[i]); + bch2_prt_data_type(out, i); prt_tab(out); prt_u64(out, usage->d[i].buckets); prt_tab_rjust(out); @@ -308,9 +308,9 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, fs_usage = fs_usage_ptr(c, journal_seq, gc); if (data_type_is_hidden(old->data_type)) - fs_usage->hidden -= ca->mi.bucket_size; + fs_usage->b.hidden -= ca->mi.bucket_size; if (data_type_is_hidden(new->data_type)) - fs_usage->hidden += ca->mi.bucket_size; + fs_usage->b.hidden += ca->mi.bucket_size; u = dev_usage_ptr(ca, journal_seq, gc); @@ -359,7 +359,7 @@ static inline int __update_replicas(struct bch_fs *c, if (idx < 0) return -1; - fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); fs_usage->replicas[idx] += sectors; return 0; } @@ -394,7 +394,7 @@ int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k, preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); - fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); fs_usage->replicas[idx] += sectors; preempt_enable(); err: @@ -523,8 +523,8 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (bch2_fs_inconsistent_on(g->data_type && g->data_type != data_type, c, "different types of data in same bucket: %s, %s", - bch2_data_types[g->data_type], - bch2_data_types[data_type])) { + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type))) { ret = -EIO; goto err; } @@ -532,7 +532,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", ca->dev_idx, b, g->gen, - bch2_data_types[g->data_type ?: data_type], + bch2_data_type_str(g->data_type ?: data_type), g->dirty_sectors, sectors)) { ret = -EIO; goto err; @@ -575,7 +575,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EIO; @@ -588,7 +588,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -603,7 +603,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "while marking %s", ptr->dev, bucket_nr, b_gen, *bucket_gen(ca, bucket_nr), - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -624,8 +624,8 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type], - bch2_data_types[ptr_data_type], + bch2_data_type_str(bucket_data_type), + bch2_data_type_str(ptr_data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EIO; @@ -638,7 +638,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), bucket_sectors, sectors, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -677,11 +677,11 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans, BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); } - dst->nr_inodes -= deltas->nr_inodes; + dst->b.nr_inodes -= deltas->nr_inodes; for (i = 0; i < BCH_REPLICAS_MAX; i++) { added -= deltas->persistent_reserved[i]; - dst->reserved -= deltas->persistent_reserved[i]; + dst->b.reserved -= deltas->persistent_reserved[i]; dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; } @@ -694,48 +694,25 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans, percpu_up_read(&c->mark_lock); } -int bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct replicas_delta_list *deltas) +void bch2_trans_account_disk_usage_change(struct btree_trans *trans) { struct bch_fs *c = trans->c; + u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; static int warned_disk_usage = 0; bool warn = false; - u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; - struct replicas_delta *d, *d2; - struct replicas_delta *top = (void *) deltas->d + deltas->used; - struct bch_fs_usage *dst; - s64 added = 0, should_not_have_added; - unsigned i; percpu_down_read(&c->mark_lock); preempt_disable(); - dst = fs_usage_ptr(c, trans->journal_res.seq, false); - - for (d = deltas->d; d != top; d = replicas_delta_next(d)) { - switch (d->r.data_type) { - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - added += d->delta; - } + struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b; + struct bch_fs_usage_base *src = &trans->fs_usage_delta; - if (__update_replicas(c, dst, &d->r, d->delta)) - goto need_mark; - } - - dst->nr_inodes += deltas->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - added += deltas->persistent_reserved[i]; - dst->reserved += deltas->persistent_reserved[i]; - dst->persistent_reserved[i] += deltas->persistent_reserved[i]; - } + s64 added = src->btree + src->data + src->reserved; /* * Not allowed to reduce sectors_available except by getting a * reservation: */ - should_not_have_added = added - (s64) disk_res_sectors; + s64 should_not_have_added = added - (s64) disk_res_sectors; if (unlikely(should_not_have_added > 0)) { u64 old, new, v = atomic64_read(&c->sectors_available); @@ -754,6 +731,13 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, this_cpu_sub(*c->online_reserved, added); } + dst->hidden += src->hidden; + dst->btree += src->btree; + dst->data += src->data; + dst->cached += src->cached; + dst->reserved += src->reserved; + dst->nr_inodes += src->nr_inodes; + preempt_enable(); percpu_up_read(&c->mark_lock); @@ -761,6 +745,34 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, bch2_trans_inconsistent(trans, "disk usage increased %lli more than %llu sectors reserved)", should_not_have_added, disk_res_sectors); +} + +int bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + struct replicas_delta *d, *d2; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; + unsigned i; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + + for (d = deltas->d; d != top; d = replicas_delta_next(d)) + if (__update_replicas(c, dst, &d->r, d->delta)) + goto need_mark; + + dst->b.nr_inodes += deltas->nr_inodes; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + dst->b.reserved += deltas->persistent_reserved[i]; + dst->persistent_reserved[i] += deltas->persistent_reserved[i]; + } + + preempt_enable(); + percpu_up_read(&c->mark_lock); return 0; need_mark: /* revert changes: */ @@ -1084,7 +1096,7 @@ static int __trigger_reservation(struct btree_trans *trans, struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc); replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved)); - fs_usage->reserved += sectors; + fs_usage->b.reserved += sectors; fs_usage->persistent_reserved[replicas - 1] += sectors; preempt_enable(); @@ -1130,9 +1142,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], - bch2_data_types[type], - bch2_data_types[type]); + bch2_data_type_str(a->v.data_type), + bch2_data_type_str(type), + bch2_data_type_str(type)); ret = -EIO; goto err; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 2c95cc5d86be..6387e039f789 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -356,6 +356,8 @@ int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, ret; \ }) +void bch2_trans_account_disk_usage_change(struct btree_trans *); + void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); @@ -385,6 +387,21 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) return false; } +static inline const char *bch2_data_type_str(enum bch_data_type type) +{ + return type < BCH_DATA_NR + ? __bch2_data_types[type] + : "(invalid data type)"; +} + +static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type) +{ + if (type < BCH_DATA_NR) + prt_str(out, __bch2_data_types[type]); + else + prt_printf(out, "(invalid data type %u)", type); +} + /* disk reservations: */ static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 783f71017204..6a31740222a7 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -45,23 +45,18 @@ struct bch_dev_usage { } d[BCH_DATA_NR]; }; -struct bch_fs_usage { - /* all fields are in units of 512 byte sectors: */ +struct bch_fs_usage_base { u64 hidden; u64 btree; u64 data; u64 cached; u64 reserved; u64 nr_inodes; +}; - /* XXX: add stats for compression ratio */ -#if 0 - u64 uncompressed; - u64 compressed; -#endif - - /* broken out: */ - +struct bch_fs_usage { + /* all fields are in units of 512 byte sectors: */ + struct bch_fs_usage_base b; u64 persistent_reserved[BCH_REPLICAS_MAX]; u64 replicas[]; }; diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index f41889093a2c..363644451106 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -109,7 +109,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); - while (1) { + do { set_current_state(TASK_INTERRUPTIBLE); if (kthread && kthread_should_stop()) break; @@ -119,7 +119,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, schedule(); try_to_freeze(); - } + } while (0); __set_current_state(TASK_RUNNING); del_timer_sync(&wait.cpu_timer); diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 607fd5e232c9..58c2eb45570f 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -47,6 +47,14 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } +static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type) +{ + if (type < BCH_COMPRESSION_TYPE_NR) + prt_str(out, __bch2_compression_types[type]); + else + prt_printf(out, "(invalid compression type %u)", type); +} + int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, struct bch_extent_crc_unpacked *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 6f13477ff652..4150feca42a2 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -285,9 +285,7 @@ restart_drop_extra_replicas: k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, insert, - op->opts.background_target, - op->opts.background_compression) ?: + bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, &op->res, @@ -529,7 +527,7 @@ int bch2_data_update_init(struct btree_trans *trans, BCH_WRITE_DATA_ENCODED| BCH_WRITE_MOVE| m->data_opts.write_flags; - m->op.compression_opt = io_opts.background_compression ?: io_opts.compression; + m->op.compression_opt = background_compression(io_opts); m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; bkey_for_each_ptr(ptrs, ptr) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index d6418948495f..cadda9bbe4a4 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -44,19 +44,19 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, return false; bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_sorted, btree_bytes(c)), + buf_pages(n_sorted, btree_buf_bytes(b)), REQ_OP_READ|REQ_META, GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_sorted, btree_bytes(c)); + bch2_bio_map(bio, n_sorted, btree_buf_bytes(b)); submit_bio_wait(bio); bio_put(bio); percpu_ref_put(&ca->io_ref); - memcpy(n_ondisk, n_sorted, btree_bytes(c)); + memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); v->written = 0; if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) @@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { - c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!c->verify_ondisk) goto out; } @@ -199,19 +199,19 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!n_ondisk) { prt_printf(out, "memory allocation failure\n"); goto out; } bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_ondisk, btree_bytes(c)), + buf_pages(n_ondisk, btree_buf_bytes(b)), REQ_OP_READ|REQ_META, GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_ondisk, btree_bytes(c)); + bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b)); ret = submit_bio_wait(bio); if (ret) { @@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, out: if (bio) bio_put(bio); - kvpfree(n_ondisk, btree_bytes(c)); + kvpfree(n_ondisk, btree_buf_bytes(b)); percpu_ref_put(&ca->io_ref); } diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h new file mode 100644 index 000000000000..5e116b88e814 --- /dev/null +++ b/fs/bcachefs/dirent_format.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DIRENT_FORMAT_H +#define _BCACHEFS_DIRENT_FORMAT_H + +/* + * Dirents (and xattrs) have to implement string lookups; since our b-tree + * doesn't support arbitrary length strings for the key, we instead index by a + * 64 bit hash (currently truncated sha1) of the string, stored in the offset + * field of the key - using linear probing to resolve hash collisions. This also + * provides us with the readdir cookie posix requires. + * + * Linear probing requires us to use whiteouts for deletions, in the event of a + * collision: + */ + +struct bch_dirent { + struct bch_val v; + + /* Target inode number: */ + union { + __le64 d_inum; + struct { /* DT_SUBVOL */ + __le32 d_child_subvol; + __le32 d_parent_subvol; + }; + }; + + /* + * Copy of mode bits 12-15 from the target inode - so userspace can get + * the filetype without having to do a stat() + */ + __u8 d_type; + + __u8 d_name[]; +} __packed __aligned(8); + +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + +#define BCH_NAME_MAX 512 + +#endif /* _BCACHEFS_DIRENT_FORMAT_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d802bc63c8d0..d503af270024 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -190,7 +190,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, a->v.stripe_redundancy, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], + bch2_data_type_str(a->v.data_type), a->v.dirty_sectors, a->v.stripe, s.k->p.offset)) { ret = -EIO; @@ -200,7 +200,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], + bch2_data_type_str(a->v.data_type), a->v.dirty_sectors, s.k->p.offset)) { ret = -EIO; @@ -367,7 +367,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, } } - if (!(flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))) { + if (flags & BTREE_TRIGGER_ATOMIC) { struct stripe *m = genradix_ptr(&c->stripes, idx); if (!m) { diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h new file mode 100644 index 000000000000..44ce88ba08d7 --- /dev/null +++ b/fs/bcachefs/ec_format.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_FORMAT_H +#define _BCACHEFS_EC_FORMAT_H + +struct bch_stripe { + struct bch_val v; + __le16 sectors; + __u8 algorithm; + __u8 nr_blocks; + __u8 nr_redundant; + + __u8 csum_granularity_bits; + __u8 csum_type; + __u8 pad; + + struct bch_extent_ptr ptrs[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_EC_FORMAT_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 82ec056f4cdb..61395b113df9 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "bkey_methods.h" +#include "btree_cache.h" #include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" @@ -1018,12 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ", crc.compressed_size, crc.uncompressed_size, crc.offset, crc.nonce, - bch2_csum_types[crc.csum_type], - bch2_compression_types[crc.compression_type]); + bch2_csum_types[crc.csum_type]); + bch2_prt_compression_type(out, crc.compression_type); break; } case BCH_EXTENT_ENTRY_stripe_ptr: { @@ -1334,10 +1335,12 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) } int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, - unsigned target, unsigned compression) + struct bch_io_opts *opts) { struct bkey_s k = bkey_i_to_s(_k); struct bch_extent_rebalance *r; + unsigned target = opts->background_target; + unsigned compression = background_compression(*opts); bool needs_rebalance; if (!bkey_extent_is_direct_data(k.k)) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index a855c94d43dd..6bf839d69e84 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -708,7 +708,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, - unsigned, unsigned); + struct bch_io_opts *); /* Generic extent code: */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h new file mode 100644 index 000000000000..3bd2fdbb0817 --- /dev/null +++ b/fs/bcachefs/extents_format.h @@ -0,0 +1,295 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_FORMAT_H +#define _BCACHEFS_EXTENTS_FORMAT_H + +/* + * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally + * preceded by checksum/compression information (bch_extent_crc32 or + * bch_extent_crc64). + * + * One major determining factor in the format of extents is how we handle and + * represent extents that have been partially overwritten and thus trimmed: + * + * If an extent is not checksummed or compressed, when the extent is trimmed we + * don't have to remember the extent we originally allocated and wrote: we can + * merely adjust ptr->offset to point to the start of the data that is currently + * live. The size field in struct bkey records the current (live) size of the + * extent, and is also used to mean "size of region on disk that we point to" in + * this case. + * + * Thus an extent that is not checksummed or compressed will consist only of a + * list of bch_extent_ptrs, with none of the fields in + * bch_extent_crc32/bch_extent_crc64. + * + * When an extent is checksummed or compressed, it's not possible to read only + * the data that is currently live: we have to read the entire extent that was + * originally written, and then return only the part of the extent that is + * currently live. + * + * Thus, in addition to the current size of the extent in struct bkey, we need + * to store the size of the originally allocated space - this is the + * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, + * when the extent is trimmed, instead of modifying the offset field of the + * pointer, we keep a second smaller offset field - "offset into the original + * extent of the currently live region". + * + * The other major determining factor is replication and data migration: + * + * Each pointer may have its own bch_extent_crc32/64. When doing a replicated + * write, we will initially write all the replicas in the same format, with the + * same checksum type and compression format - however, when copygc runs later (or + * tiering/cache promotion, anything that moves data), it is not in general + * going to rewrite all the pointers at once - one of the replicas may be in a + * bucket on one device that has very little fragmentation while another lives + * in a bucket that has become heavily fragmented, and thus is being rewritten + * sooner than the rest. + * + * Thus it will only move a subset of the pointers (or in the case of + * tiering/cache promotion perhaps add a single pointer without dropping any + * current pointers), and if the extent has been partially overwritten it must + * write only the currently live portion (or copygc would not be able to reduce + * fragmentation!) - which necessitates a different bch_extent_crc format for + * the new pointer. + * + * But in the interests of space efficiency, we don't want to store one + * bch_extent_crc for each pointer if we don't have to. + * + * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and + * bch_extent_ptrs appended arbitrarily one after the other. We determine the + * type of a given entry with a scheme similar to utf8 (except we're encoding a + * type, not a size), encoding the type in the position of the first set bit: + * + * bch_extent_crc32 - 0b1 + * bch_extent_ptr - 0b10 + * bch_extent_crc64 - 0b100 + * + * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and + * bch_extent_crc64 is the least constrained). + * + * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, + * until the next bch_extent_crc32/64. + * + * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer + * is neither checksummed nor compressed. + */ + +#define BCH_EXTENT_ENTRY_TYPES() \ + x(ptr, 0) \ + x(crc32, 1) \ + x(crc64, 2) \ + x(crc128, 3) \ + x(stripe_ptr, 4) \ + x(rebalance, 5) +#define BCH_EXTENT_ENTRY_MAX 6 + +enum bch_extent_entry_type { +#define x(f, n) BCH_EXTENT_ENTRY_##f = n, + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +/* Compressed/uncompressed size are stored biased by 1: */ +struct bch_extent_crc32 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 type:2, + _compressed_size:7, + _uncompressed_size:7, + offset:7, + _unused:1, + csum_type:4, + compression_type:4; + __u32 csum; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u32 csum; + __u32 compression_type:4, + csum_type:4, + _unused:1, + offset:7, + _uncompressed_size:7, + _compressed_size:7, + type:2; +#endif +} __packed __aligned(8); + +#define CRC32_SIZE_MAX (1U << 7) +#define CRC32_NONCE_MAX 0 + +struct bch_extent_crc64 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:3, + _compressed_size:9, + _uncompressed_size:9, + offset:9, + nonce:10, + csum_type:4, + compression_type:4, + csum_hi:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 csum_hi:16, + compression_type:4, + csum_type:4, + nonce:10, + offset:9, + _uncompressed_size:9, + _compressed_size:9, + type:3; +#endif + __u64 csum_lo; +} __packed __aligned(8); + +#define CRC64_SIZE_MAX (1U << 9) +#define CRC64_NONCE_MAX ((1U << 10) - 1) + +struct bch_extent_crc128 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:4, + _compressed_size:13, + _uncompressed_size:13, + offset:13, + nonce:13, + csum_type:4, + compression_type:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 compression_type:4, + csum_type:4, + nonce:13, + offset:13, + _uncompressed_size:13, + _compressed_size:13, + type:4; +#endif + struct bch_csum csum; +} __packed __aligned(8); + +#define CRC128_SIZE_MAX (1U << 13) +#define CRC128_NONCE_MAX ((1U << 13) - 1) + +/* + * @reservation - pointer hasn't been written to, just reserved + */ +struct bch_extent_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:1, + cached:1, + unused:1, + unwritten:1, + offset:44, /* 8 petabytes */ + dev:8, + gen:8; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 gen:8, + dev:8, + offset:44, + unwritten:1, + unused:1, + cached:1, + type:1; +#endif +} __packed __aligned(8); + +struct bch_extent_stripe_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:5, + block:8, + redundancy:4, + idx:47; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 idx:47, + redundancy:4, + block:8, + type:5; +#endif +}; + +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:34, + compression:8, /* enum bch_compression_opt */ + target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 target:16, + compression:8, + unused:34, + type:6; +#endif +}; + +union bch_extent_entry { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 + unsigned long type; +#elif __BITS_PER_LONG == 32 + struct { + unsigned long pad; + unsigned long type; + }; +#else +#error edit for your odd byteorder. +#endif + +#define x(f, n) struct bch_extent_##f f; + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +struct bch_btree_ptr { + struct bch_val v; + + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +struct bch_btree_ptr_v2 { + struct bch_val v; + + __u64 mem_ptr; + __le64 seq; + __le16 sectors_written; + __le16 flags; + struct bpos min_key; + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); + +struct bch_extent { + struct bch_val v; + + __u64 _data[0]; + union bch_extent_entry start[]; +} __packed __aligned(8); + +/* Maximum size (in u64s) a single pointer could be: */ +#define BKEY_EXTENT_PTR_U64s_MAX\ + ((sizeof(struct bch_extent_crc128) + \ + sizeof(struct bch_extent_ptr)) / sizeof(__u64)) + +/* Maximum possible size of an entire extent value: */ +#define BKEY_EXTENT_VAL_U64s_MAX \ + (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) + +/* * Maximum possible size of an entire extent, key + value: */ +#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) + +/* Btree pointers don't carry around checksums: */ +#define BKEY_BTREE_PTR_VAL_U64s_MAX \ + ((sizeof(struct bch_btree_ptr_v2) + \ + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) +#define BKEY_BTREE_PTR_U64s_MAX \ + (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) + +struct bch_reservation { + struct bch_val v; + + __le32 generation; + __u8 nr_replicas; + __u8 pad[3]; +} __packed __aligned(8); + +struct bch_inline_data { + struct bch_val v; + u8 data[]; +}; + +#endif /* _BCACHEFS_EXTENTS_FORMAT_H */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 9637f636e32d..b04750dbf870 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -156,7 +156,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) } #define eytzinger1_for_each(_i, _size) \ - for ((_i) = eytzinger1_first((_size)); \ + for (unsigned (_i) = eytzinger1_first((_size)); \ (_i) != 0; \ (_i) = eytzinger1_next((_i), (_size))) @@ -227,7 +227,7 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) } #define eytzinger0_for_each(_i, _size) \ - for ((_i) = eytzinger0_first((_size)); \ + for (unsigned (_i) = eytzinger0_first((_size)); \ (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index fdd57c5785c9..e3b219e19e10 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -77,6 +77,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) bch2_inode_opts_get(&opts, c, &inode->ei_inode); + /* bios must be 512 byte aligned: */ + if ((offset|iter->count) & (SECTOR_SIZE - 1)) + return -EINVAL; + ret = min_t(loff_t, iter->count, max_t(loff_t, 0, i_size_read(&inode->v) - offset)); diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index ff664fd0d8ef..d359aa9b33b8 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -309,39 +309,49 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, } } -void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, - u64 start, u64 end) +int bch2_mark_pagecache_reserved(struct bch_inode_info *inode, + u64 *start, u64 end, + bool nonblocking) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t index = *start >> PAGE_SECTORS_SHIFT; pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; struct folio_batch fbatch; s64 i_sectors_delta = 0; - unsigned i, j; + int ret = 0; - if (end <= start) - return; + if (end <= *start) + return 0; folio_batch_init(&fbatch); while (filemap_get_folios(inode->v.i_mapping, &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { + for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; + + if (!nonblocking) + folio_lock(folio); + else if (!folio_trylock(folio)) { + folio_batch_release(&fbatch); + ret = -EAGAIN; + break; + } + u64 folio_start = folio_sector(folio); u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; BUG_ON(end <= folio_start); - folio_lock(folio); - s = bch2_folio(folio); + *start = min(end, folio_end); + struct bch_folio *s = bch2_folio(folio); if (s) { + unsigned folio_offset = max(*start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) { + for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) { i_sectors_delta -= s->s[j].state == SECTOR_dirty; bch2_folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); @@ -356,6 +366,7 @@ void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, } bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + return ret; } static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index 27f712ae37a6..8cbaba6565b4 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -143,7 +143,7 @@ int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); -void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64); +int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool); int bch2_get_folio_disk_reservation(struct bch_fs *, struct bch_inode_info *, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 98bd5babab19..dc52918d06ef 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -675,8 +675,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); - drop_locks_do(trans, - (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); + if (bch2_mark_pagecache_reserved(inode, &hole_start, + iter.pos.offset, true)) + drop_locks_do(trans, + bch2_mark_pagecache_reserved(inode, &hole_start, + iter.pos.offset, false)); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 1cbc5807bc80..3a4c24c28e7f 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -337,11 +337,12 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) create_flags |= BCH_CREATE_SNAPSHOT_RO; - /* why do we need this lock? */ - down_read(&c->vfs_sb->s_umount); - - if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) { + /* sync_inodes_sb enforce s_umount is locked */ + down_read(&c->vfs_sb->s_umount); sync_inodes_sb(c->vfs_sb); + up_read(&c->vfs_sb->s_umount); + } retry: if (arg.src_ptr) { error = user_path_at(arg.dirfd, @@ -425,8 +426,6 @@ err2: goto retry; } err1: - up_read(&c->vfs_sb->s_umount); - return error; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 37dce96f48ac..086f0090b03a 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -506,22 +506,33 @@ fsck_err: static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - prt_printf(out, "mode=%o ", inode->bi_mode); + printbuf_indent_add(out, 2); + prt_printf(out, "mode=%o", inode->bi_mode); + prt_newline(out); prt_str(out, "flags="); prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); prt_printf(out, " (%x)", inode->bi_flags); + prt_newline(out); - prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu", - inode->bi_journal_seq, - inode->bi_size, - inode->bi_sectors, - inode->bi_version); + prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq); + prt_newline(out); + + prt_printf(out, "bi_size=%llu", inode->bi_size); + prt_newline(out); + + prt_printf(out, "bi_sectors=%llu", inode->bi_sectors); + prt_newline(out); + + prt_newline(out); + prt_printf(out, "bi_version=%llu", inode->bi_version); #define x(_name, _bits) \ - prt_printf(out, " "#_name "=%llu", (u64) inode->_name); + prt_printf(out, #_name "=%llu", (u64) inode->_name); \ + prt_newline(out); BCH_INODE_FIELDS_v3() #undef x + printbuf_indent_sub(out, 2); } void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) @@ -587,7 +598,7 @@ int bch2_trigger_inode(struct btree_trans *trans, } } - if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { BUG_ON(!trans->journal_res.seq); bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); @@ -597,7 +608,7 @@ int bch2_trigger_inode(struct btree_trans *trans, struct bch_fs *c = trans->c; percpu_down_read(&c->mark_lock); - this_cpu_add(c->usage_gc->nr_inodes, nr); + this_cpu_add(c->usage_gc->b.nr_inodes, nr); percpu_up_read(&c->mark_lock); } diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h new file mode 100644 index 000000000000..83d107331edf --- /dev/null +++ b/fs/bcachefs/inode_format.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_FORMAT_H +#define _BCACHEFS_INODE_FORMAT_H + +#define BLOCKDEV_INODE_MAX 4096 +#define BCACHEFS_ROOT_INO 4096 + +struct bch_inode { + struct bch_val v; + + __le64 bi_hash_seed; + __le32 bi_flags; + __le16 bi_mode; + __u8 fields[]; +} __packed __aligned(8); + +struct bch_inode_v2 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le16 bi_mode; + __u8 fields[]; +} __packed __aligned(8); + +struct bch_inode_v3 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le64 bi_sectors; + __le64 bi_size; + __le64 bi_version; + __u8 fields[]; +} __packed __aligned(8); + +#define INODEv3_FIELDS_START_INITIAL 6 +#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) + +struct bch_inode_generation { + struct bch_val v; + + __le32 bi_generation; + __le32 pad; +} __packed __aligned(8); + +/* + * bi_subvol and bi_parent_subvol are only set for subvolume roots: + */ + +#define BCH_INODE_FIELDS_v2() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_size, 64) \ + x(bi_sectors, 64) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) + +#define BCH_INODE_FIELDS_v3() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) \ + x(bi_nocow, 8) + +/* subset of BCH_INODE_FIELDS */ +#define BCH_INODE_OPTS() \ + x(data_checksum, 8) \ + x(compression, 8) \ + x(project, 32) \ + x(background_compression, 8) \ + x(data_replicas, 8) \ + x(promote_target, 16) \ + x(foreground_target, 16) \ + x(background_target, 16) \ + x(erasure_code, 16) \ + x(nocow, 8) + +enum inode_opt_id { +#define x(name, ...) \ + Inode_opt_##name, + BCH_INODE_OPTS() +#undef x + Inode_opt_nr, +}; + +#define BCH_INODE_FLAGS() \ + x(sync, 0) \ + x(immutable, 1) \ + x(append, 2) \ + x(nodump, 3) \ + x(noatime, 4) \ + x(i_size_dirty, 5) \ + x(i_sectors_dirty, 6) \ + x(unlinked, 7) \ + x(backptr_untrusted, 8) + +/* bits 20+ reserved for packed fields below: */ + +enum bch_inode_flags { +#define x(t, n) BCH_INODE_##t = 1U << n, + BCH_INODE_FLAGS() +#undef x +}; + +enum __bch_inode_flags { +#define x(t, n) __BCH_INODE_##t = n, + BCH_INODE_FLAGS() +#undef x +}; + +LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); + +LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); +LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); +LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_FIELDS_START, + struct bch_inode_v3, bi_flags, 31, 36); +LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); + +#endif /* _BCACHEFS_INODE_FORMAT_H */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index ca6d5f516aa2..1baf78594cca 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -442,9 +442,7 @@ case LOGGED_OP_FINSERT_shift_extents: op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); - ret = bch2_bkey_set_needs_rebalance(c, copy, - opts.background_target, - opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, &op->k_i) ?: diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 33c0e783d546..ef3a53f9045a 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -362,9 +362,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bkey_start_pos(&sk.k->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = bch2_bkey_set_needs_rebalance(c, sk.k, - op->opts.background_target, - op->opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, @@ -1447,10 +1445,11 @@ err: op->flags |= BCH_WRITE_DONE; if (ret < 0) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s(): error: %s", __func__, bch2_err_str(ret)); + if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s(): error: %s", __func__, bch2_err_str(ret)); op->error = ret; break; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 8538ef34f62b..d71d26e39521 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -27,6 +27,47 @@ static const char * const bch2_journal_errors[] = { NULL }; +static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) +{ + union journal_res_state s = READ_ONCE(j->reservations); + unsigned i = seq & JOURNAL_BUF_MASK; + struct journal_buf *buf = j->buf + i; + + prt_printf(out, "seq:"); + prt_tab(out); + prt_printf(out, "%llu", seq); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "refcount:"); + prt_tab(out); + prt_printf(out, "%u", journal_state_count(s, i)); + prt_newline(out); + + prt_printf(out, "size:"); + prt_tab(out); + prt_human_readable_u64(out, vstruct_bytes(buf->data)); + prt_newline(out); + + prt_printf(out, "expires"); + prt_tab(out); + prt_printf(out, "%li jiffies", buf->expires - jiffies); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) +{ + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 24); + + for (u64 seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) + bch2_journal_buf_to_text(out, j, seq); +} + static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { return seq > j->seq_ondisk; @@ -156,7 +197,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) * We don't close a journal_buf until the next journal_buf is finished writing, * and can be opened again - this also initializes the next journal_buf: */ -static void __journal_entry_close(struct journal *j, unsigned closed_val) +static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); @@ -185,7 +226,17 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - trace_journal_entry_close(c, vstruct_bytes(buf->data)); + if (trace_journal_entry_close_enabled() && trace) { + struct printbuf pbuf = PRINTBUF; + pbuf.atomic++; + + prt_str(&pbuf, "entry size: "); + prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data)); + prt_newline(&pbuf); + bch2_prt_task_backtrace(&pbuf, current, 1); + trace_journal_entry_close(c, pbuf.buf); + printbuf_exit(&pbuf); + } sectors = vstruct_blocks_plus(buf->data, c->block_bits, buf->u64s_reserved) << c->block_bits; @@ -225,7 +276,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) void bch2_journal_halt(struct journal *j) { spin_lock(&j->lock); - __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); if (!j->err_seq) j->err_seq = journal_cur_seq(j); journal_wake(j); @@ -239,7 +290,7 @@ static bool journal_entry_want_write(struct journal *j) /* Don't close it yet if we already have a write in flight: */ if (ret) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); else if (nr_unwritten_journal_entries(j)) { struct journal_buf *buf = journal_cur_buf(j); @@ -406,7 +457,7 @@ static void journal_write_work(struct work_struct *work) if (delta > 0) mod_delayed_work(c->io_complete_wq, &j->write_work, delta); else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); unlock: spin_unlock(&j->lock); } @@ -463,13 +514,21 @@ retry: buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); ret = journal_entry_open(j); if (ret == JOURNAL_ERR_max_in_flight) { track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], &j->max_in_flight_start, true); - trace_and_count(c, journal_entry_full, c); + if (trace_journal_entry_full_enabled()) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + + bch2_journal_bufs_to_text(&buf, j); + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + } + count_event(c, journal_entry_full); } unlock: can_discard = j->can_discard; @@ -549,7 +608,7 @@ void bch2_journal_entry_res_resize(struct journal *j, /* * Not enough room in current journal entry, have to flush it: */ - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); } else { journal_cur_buf(j)->u64s_reserved += d; } @@ -606,7 +665,7 @@ recheck_need_open: struct journal_res res = { 0 }; if (journal_entry_is_open(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); spin_unlock(&j->lock); @@ -786,7 +845,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou if (buf->need_flush_to_write_buffer) { if (seq == journal_cur_seq(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); union journal_res_state s; s.v = atomic64_read_acquire(&j->reservations.counter); @@ -1339,35 +1398,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) } prt_newline(out); - - for (u64 seq = journal_cur_seq(j); - seq >= journal_last_unwritten_seq(j); - --seq) { - unsigned i = seq & JOURNAL_BUF_MASK; - - prt_printf(out, "unwritten entry:"); - prt_tab(out); - prt_printf(out, "%llu", seq); - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "refcount:"); - prt_tab(out); - prt_printf(out, "%u", journal_state_count(s, i)); - prt_newline(out); - - prt_printf(out, "sectors:"); - prt_tab(out); - prt_printf(out, "%u", j->buf[i].sectors); - prt_newline(out); - - prt_printf(out, "expires"); - prt_tab(out); - prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies); - prt_newline(out); - - printbuf_indent_sub(out, 2); - } + prt_printf(out, "unwritten entries:"); + prt_newline(out); + bch2_journal_bufs_to_text(out, j); prt_printf(out, "replay done:\t\t%i\n", diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index b0f4dd491e12..04a1e79a5ed3 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -683,10 +683,7 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); for (i = 0; i < nr_types; i++) { - if (i < BCH_DATA_NR) - prt_printf(out, " %s", bch2_data_types[i]); - else - prt_printf(out, " (unknown data type %u)", i); + bch2_prt_data_type(out, i); prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", le64_to_cpu(u->d[i].buckets), le64_to_cpu(u->d[i].sectors), diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h new file mode 100644 index 000000000000..6a4bf7129dba --- /dev/null +++ b/fs/bcachefs/logged_ops_format.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H +#define _BCACHEFS_LOGGED_OPS_FORMAT_H + +struct bch_logged_op_truncate { + struct bch_val v; + __le32 subvol; + __le32 pad; + __le64 inum; + __le64 new_i_size; +}; + +enum logged_op_finsert_state { + LOGGED_OP_FINSERT_start, + LOGGED_OP_FINSERT_shift_extents, + LOGGED_OP_FINSERT_finish, +}; + +struct bch_logged_op_finsert { + struct bch_val v; + __u8 state; + __u8 pad[3]; + __le32 subvol; + __le64 inum; + __le64 dst_offset; + __le64 src_offset; + __le64 pos; +}; + +#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7a33319dcd16..bf68ea49447b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -6,9 +6,11 @@ #include "backpointers.h" #include "bkey_buf.h" #include "btree_gc.h" +#include "btree_io.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" +#include "compress.h" #include "disk_groups.h" #include "ec.h" #include "errcode.h" @@ -34,12 +36,46 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k) +static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + printbuf_tabstop_push(out, 20); + prt_str(out, "rewrite ptrs:"); + prt_tab(out); + bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); + prt_newline(out); + + prt_str(out, "kill ptrs: "); + prt_tab(out); + bch2_prt_u64_base2(out, data_opts->kill_ptrs); + prt_newline(out); + + prt_str(out, "target: "); + prt_tab(out); + bch2_target_to_text(out, c, data_opts->target); + prt_newline(out); + + prt_str(out, "compression: "); + prt_tab(out); + bch2_compression_opt_to_text(out, background_compression(*io_opts)); + prt_newline(out); + + prt_str(out, "extra replicas: "); + prt_tab(out); + prt_u64(out, data_opts->extra_replicas); +} + +static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { if (trace_move_extent_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); trace_move_extent(c, buf.buf); printbuf_exit(&buf); } @@ -111,6 +147,15 @@ static void move_write(struct moving_io *io) return; } + if (trace_move_extent_write_enabled()) { + struct bch_fs *c = io->write.op.c; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); + trace_move_extent_write(c, buf.buf); + printbuf_exit(&buf); + } + closure_get(&io->write.ctxt->cl); atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); atomic_inc(&io->write.ctxt->write_ios); @@ -241,9 +286,10 @@ int bch2_move_extent(struct moving_context *ctxt, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; + trace_move_extent2(c, k, &io_opts, &data_opts); + if (ctxt->stats) ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); - trace_move_extent2(c, k); bch2_data_update_opts_normalize(k, &data_opts); @@ -759,6 +805,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (!b) goto next; + unsigned sectors = btree_ptr_sectors_written(&b->key); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); bch2_trans_iter_exit(trans, &iter); @@ -768,11 +816,10 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, goto err; if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, - c->opts.btree_node_size >> 9); + bch2_ratelimit_increment(ctxt->rate, sectors); if (ctxt->stats) { - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); + atomic64_add(sectors, &ctxt->stats->sectors_seen); + atomic64_add(sectors, &ctxt->stats->sectors_moved); } } next: @@ -1083,9 +1130,9 @@ int bch2_data_job(struct bch_fs *c, void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) { - prt_printf(out, "%s: data type=%s pos=", - stats->name, - bch2_data_types[stats->data_type]); + prt_printf(out, "%s: data type==", stats->name); + bch2_prt_data_type(out, stats->data_type); + prt_str(out, " pos="); bch2_bbpos_to_text(out, stats->pos); prt_newline(out); printbuf_indent_add(out, 2); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 8e6f230eac38..b1ed0b9a20d3 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -52,7 +52,7 @@ const char * const bch2_csum_opts[] = { NULL }; -const char * const bch2_compression_types[] = { +const char * const __bch2_compression_types[] = { BCH_COMPRESSION_TYPES() NULL }; @@ -72,7 +72,7 @@ const char * const bch2_str_hash_opts[] = { NULL }; -const char * const bch2_data_types[] = { +const char * const __bch2_data_types[] = { BCH_DATA_TYPES() NULL }; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 93a24fef4214..9a4b7faa3765 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -18,11 +18,11 @@ extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; -extern const char * const bch2_compression_types[]; +extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; -extern const char * const bch2_data_types[]; +extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; extern const char * const bch2_jset_entry_types[]; extern const char * const bch2_fs_usage_types[]; @@ -564,6 +564,11 @@ struct bch_io_opts { #undef x }; +static inline unsigned background_compression(struct bch_io_opts opts) +{ + return opts.background_compression ?: opts.compression; +} + struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h new file mode 100644 index 000000000000..dc34347ef6c7 --- /dev/null +++ b/fs/bcachefs/quota_format.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_QUOTA_FORMAT_H +#define _BCACHEFS_QUOTA_FORMAT_H + +/* KEY_TYPE_quota: */ + +enum quota_types { + QTYP_USR = 0, + QTYP_GRP = 1, + QTYP_PRJ = 2, + QTYP_NR = 3, +}; + +enum quota_counters { + Q_SPC = 0, + Q_INO = 1, + Q_COUNTERS = 2, +}; + +struct bch_quota_counter { + __le64 hardlimit; + __le64 softlimit; +}; + +struct bch_quota { + struct bch_val v; + struct bch_quota_counter c[Q_COUNTERS]; +} __packed __aligned(8); + +/* BCH_SB_FIELD_quota: */ + +struct bch_sb_quota_counter { + __le32 timelimit; + __le32 warnlimit; +}; + +struct bch_sb_quota_type { + __le64 flags; + struct bch_sb_quota_counter c[Q_COUNTERS]; +}; + +struct bch_sb_field_quota { + struct bch_sb_field field; + struct bch_sb_quota_type q[QTYP_NR]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_QUOTA_FORMAT_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 95f46cb3b5bd..22d1017aa49b 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -177,8 +177,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, prt_str(&buf, "target="); bch2_target_to_text(&buf, c, r->target); prt_str(&buf, " compression="); - struct bch_compression_opt opt = __bch2_compression_decode(r->compression); - prt_str(&buf, bch2_compression_opts[opt.type]); + bch2_compression_opt_to_text(&buf, r->compression); prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, k); @@ -254,13 +253,12 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, if (k.k->p.inode) { target = io_opts->background_target; - compression = io_opts->background_compression ?: io_opts->compression; + compression = background_compression(*io_opts); } else { const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); target = r ? r->target : io_opts->background_target; - compression = r ? r->compression : - (io_opts->background_compression ?: io_opts->compression); + compression = r ? r->compression : background_compression(*io_opts); } data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); @@ -371,6 +369,7 @@ static int do_rebalance(struct moving_context *ctxt) !kthread_should_stop() && !atomic64_read(&r->work_stats.sectors_seen) && !atomic64_read(&r->scan_stats.sectors_seen)) { + bch2_moving_ctxt_flush_all(ctxt); bch2_trans_unlock_long(trans); rebalance_wait(c); } @@ -385,7 +384,6 @@ static int bch2_rebalance_thread(void *arg) struct bch_fs *c = arg; struct bch_fs_rebalance *r = &c->rebalance; struct moving_context ctxt; - int ret; set_freezable(); @@ -393,8 +391,7 @@ static int bch2_rebalance_thread(void *arg) writepoint_ptr(&c->rebalance_write_point), true); - while (!kthread_should_stop() && - !(ret = do_rebalance(&ctxt))) + while (!kthread_should_stop() && !do_rebalance(&ctxt)) ; bch2_moving_ctxt_exit(&ctxt); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 725214605a05..9127d0e3ca2f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -280,7 +280,7 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(u->v); break; case BCH_FS_USAGE_inodes: - c->usage_base->nr_inodes = le64_to_cpu(u->v); + c->usage_base->b.nr_inodes = le64_to_cpu(u->v); break; case BCH_FS_USAGE_key_version: atomic64_set(&c->key_version, diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index faa5d3670058..c47c66c2b394 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -292,10 +292,10 @@ static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *f } } -int bch2_trans_mark_reflink_v(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - unsigned flags) +int bch2_trigger_reflink_v(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) @@ -324,7 +324,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, min(datalen, 32U), d.v->data); } -int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, +int bch2_trigger_indirect_inline_data(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, unsigned flags) @@ -486,6 +486,13 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + if (dst_inum.inum < src_inum.inum) { + /* Avoid some lock cycle transaction restarts */ + ret = bch2_btree_iter_traverse(&dst_iter); + if (ret) + continue; + } + dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); bch2_btree_iter_set_pos(&src_iter, src_want); @@ -538,9 +545,7 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, - opts.background_target, - opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?: bch2_extent_update(trans, dst_inum, &dst_iter, new_dst.k, &disk_res, new_i_size, i_sectors_delta, diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 8ee778ec0022..4d8867289717 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -24,14 +24,14 @@ int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, +int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ - .trigger = bch2_trans_mark_reflink_v, \ + .trigger = bch2_trigger_reflink_v, \ .min_val_size = 8, \ }) @@ -39,7 +39,7 @@ int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_indirect_inline_data(struct btree_trans *, +int bch2_trigger_indirect_inline_data(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, unsigned); @@ -47,7 +47,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *, #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ .key_invalid = bch2_indirect_inline_data_invalid, \ .val_to_text = bch2_indirect_inline_data_to_text, \ - .trigger = bch2_trans_mark_indirect_inline_data, \ + .trigger = bch2_trigger_indirect_inline_data, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h new file mode 100644 index 000000000000..6772eebb1fc6 --- /dev/null +++ b/fs/bcachefs/reflink_format.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_FORMAT_H +#define _BCACHEFS_REFLINK_FORMAT_H + +struct bch_reflink_p { + struct bch_val v; + __le64 idx; + /* + * A reflink pointer might point to an indirect extent which is then + * later split (by copygc or rebalance). If we only pointed to part of + * the original indirect extent, and then one of the fragments is + * outside the range we point to, we'd leak a refcount: so when creating + * reflink pointers, we need to store pad values to remember the full + * range we were taking a reference on. + */ + __le32 front_pad; + __le32 back_pad; +} __packed __aligned(8); + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[]; +} __packed __aligned(8); + +struct bch_indirect_inline_data { + struct bch_val v; + __le64 refcount; + u8 data[]; +}; + +#endif /* _BCACHEFS_REFLINK_FORMAT_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 92ba56ef1fc8..cc2672c12031 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -9,6 +9,12 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); +/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ +static int bch2_memcmp(const void *l, const void *r, size_t size) +{ + return memcmp(l, r, size); +} + /* Replicas tracking - in memory: */ static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) @@ -33,21 +39,16 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { - eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); + eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, struct bch_replicas_entry_v0 *e) { - unsigned i; - - if (e->data_type < BCH_DATA_NR) - prt_printf(out, "%s", bch2_data_types[e->data_type]); - else - prt_printf(out, "(invalid data type %u)", e->data_type); + bch2_prt_data_type(out, e->data_type); prt_printf(out, ": %u [", e->nr_devs); - for (i = 0; i < e->nr_devs; i++) + for (unsigned i = 0; i < e->nr_devs; i++) prt_printf(out, i ? " %u" : "%u", e->devs[i]); prt_printf(out, "]"); } @@ -55,15 +56,10 @@ static void bch2_replicas_entry_v0_to_text(struct printbuf *out, void bch2_replicas_entry_to_text(struct printbuf *out, struct bch_replicas_entry_v1 *e) { - unsigned i; - - if (e->data_type < BCH_DATA_NR) - prt_printf(out, "%s", bch2_data_types[e->data_type]); - else - prt_printf(out, "(invalid data type %u)", e->data_type); + bch2_prt_data_type(out, e->data_type); prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); - for (i = 0; i < e->nr_devs; i++) + for (unsigned i = 0; i < e->nr_devs; i++) prt_printf(out, i ? " %u" : "%u", e->devs[i]); prt_printf(out, "]"); } @@ -831,7 +827,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, sort_cmp_size(cpu_r->entries, cpu_r->nr, cpu_r->entry_size, - memcmp, NULL); + bch2_memcmp, NULL); for (i = 0; i < cpu_r->nr; i++) { struct bch_replicas_entry_v1 *e = diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 9632f36f5f31..b6bf0ebe7e84 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -207,7 +207,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = BCH_FS_USAGE_inodes; - u->v = cpu_to_le64(c->usage_base->nr_inodes); + u->v = cpu_to_le64(c->usage_base->b.nr_inodes); } { diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/sb-counters.c index 02a996e06a64..7dc898761bb3 100644 --- a/fs/bcachefs/counters.c +++ b/fs/bcachefs/sb-counters.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "super-io.h" -#include "counters.h" +#include "sb-counters.h" /* BCH_SB_FIELD_counters */ diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/sb-counters.h index 4778aa19bf34..81f8aec9fcb1 100644 --- a/fs/bcachefs/counters.h +++ b/fs/bcachefs/sb-counters.h @@ -1,11 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_COUNTERS_H -#define _BCACHEFS_COUNTERS_H +#ifndef _BCACHEFS_SB_COUNTERS_H +#define _BCACHEFS_SB_COUNTERS_H #include "bcachefs.h" #include "super-io.h" - int bch2_sb_counters_to_cpu(struct bch_fs *); int bch2_sb_counters_from_cpu(struct bch_fs *); @@ -14,4 +13,4 @@ int bch2_fs_counters_init(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_counters; -#endif // _BCACHEFS_COUNTERS_H +#endif // _BCACHEFS_SB_COUNTERS_H diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h new file mode 100644 index 000000000000..62ea478215d0 --- /dev/null +++ b/fs/bcachefs/sb-counters_format.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H +#define _BCACHEFS_SB_COUNTERS_FORMAT_H + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0) \ + x(io_write, 1) \ + x(io_move, 2) \ + x(bucket_invalidate, 3) \ + x(bucket_discard, 4) \ + x(bucket_alloc, 5) \ + x(bucket_alloc_fail, 6) \ + x(btree_cache_scan, 7) \ + x(btree_cache_reap, 8) \ + x(btree_cache_cannibalize, 9) \ + x(btree_cache_cannibalize_lock, 10) \ + x(btree_cache_cannibalize_lock_fail, 11) \ + x(btree_cache_cannibalize_unlock, 12) \ + x(btree_node_write, 13) \ + x(btree_node_read, 14) \ + x(btree_node_compact, 15) \ + x(btree_node_merge, 16) \ + x(btree_node_split, 17) \ + x(btree_node_rewrite, 18) \ + x(btree_node_alloc, 19) \ + x(btree_node_free, 20) \ + x(btree_node_set_root, 21) \ + x(btree_path_relock_fail, 22) \ + x(btree_path_upgrade_fail, 23) \ + x(btree_reserve_get_fail, 24) \ + x(journal_entry_full, 25) \ + x(journal_full, 26) \ + x(journal_reclaim_finish, 27) \ + x(journal_reclaim_start, 28) \ + x(journal_write, 29) \ + x(read_promote, 30) \ + x(read_bounce, 31) \ + x(read_split, 33) \ + x(read_retry, 32) \ + x(read_reuse_race, 34) \ + x(move_extent_read, 35) \ + x(move_extent_write, 36) \ + x(move_extent_finish, 37) \ + x(move_extent_fail, 38) \ + x(move_extent_start_fail, 39) \ + x(copygc, 40) \ + x(copygc_wait, 41) \ + x(gc_gens_end, 42) \ + x(gc_gens_start, 43) \ + x(trans_blocked_journal_reclaim, 44) \ + x(trans_restart_btree_node_reused, 45) \ + x(trans_restart_btree_node_split, 46) \ + x(trans_restart_fault_inject, 47) \ + x(trans_restart_iter_upgrade, 48) \ + x(trans_restart_journal_preres_get, 49) \ + x(trans_restart_journal_reclaim, 50) \ + x(trans_restart_journal_res_get, 51) \ + x(trans_restart_key_cache_key_realloced, 52) \ + x(trans_restart_key_cache_raced, 53) \ + x(trans_restart_mark_replicas, 54) \ + x(trans_restart_mem_realloced, 55) \ + x(trans_restart_memory_allocation_failure, 56) \ + x(trans_restart_relock, 57) \ + x(trans_restart_relock_after_fill, 58) \ + x(trans_restart_relock_key_cache_fill, 59) \ + x(trans_restart_relock_next_node, 60) \ + x(trans_restart_relock_parent_for_fill, 61) \ + x(trans_restart_relock_path, 62) \ + x(trans_restart_relock_path_intent, 63) \ + x(trans_restart_too_many_iters, 64) \ + x(trans_restart_traverse, 65) \ + x(trans_restart_upgrade, 66) \ + x(trans_restart_would_deadlock, 67) \ + x(trans_restart_would_deadlock_write, 68) \ + x(trans_restart_injected, 69) \ + x(trans_restart_key_cache_upgrade, 70) \ + x(trans_traverse_all, 71) \ + x(transaction_commit, 72) \ + x(write_super, 73) \ + x(trans_restart_would_deadlock_recursion_limit, 74) \ + x(trans_restart_write_buffer_flush, 75) \ + x(trans_restart_split_race, 76) \ + x(write_buffer_flush_slowpath, 77) \ + x(write_buffer_flush_sync, 78) + +enum bch_persistent_counters { +#define x(t, n, ...) BCH_COUNTER_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_NR +}; + +struct bch_sb_field_counters { + struct bch_sb_field field; + __le64 d[]; +}; + +#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */ diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index a44a238bf8b5..a45354d2acde 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -251,7 +251,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Data allowed:"); prt_tab(out); if (BCH_MEMBER_DATA_ALLOWED(&m)) - prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); + prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); else prt_printf(out, "(none)"); prt_newline(out); @@ -259,7 +259,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Has data:"); prt_tab(out); if (data_have) - prt_bitflags(out, bch2_data_types, data_have); + prt_bitflags(out, __bch2_data_types, data_have); else prt_printf(out, "(none)"); prt_newline(out); diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 56af937523ff..45f67e8b29eb 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1053,6 +1053,8 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, n->v.subvol = cpu_to_le32(snapshot_subvols[i]); n->v.tree = cpu_to_le32(tree); n->v.depth = cpu_to_le32(depth); + n->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + n->v.btime.hi = 0; for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); @@ -1681,5 +1683,5 @@ int bch2_snapshots_read(struct bch_fs *c) void bch2_fs_snapshots_exit(struct bch_fs *c) { - kfree(rcu_dereference_protected(c->snapshots, true)); + kvfree(rcu_dereference_protected(c->snapshots, true)); } diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h new file mode 100644 index 000000000000..aabcd3a74cd9 --- /dev/null +++ b/fs/bcachefs/snapshot_format.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H +#define _BCACHEFS_SNAPSHOT_FORMAT_H + +struct bch_snapshot { + struct bch_val v; + __le32 flags; + __le32 parent; + __le32 children[2]; + __le32 subvol; + /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ + __le32 tree; + __le32 depth; + __le32 skip[3]; + bch_le128 btime; +}; + +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) + +/* True if a subvolume points to this snapshot node: */ +LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + +/* + * Snapshot trees: + * + * The snapshot_trees btree gives us persistent indentifier for each tree of + * bch_snapshot nodes, and allow us to record and easily find the root/master + * subvolume that other snapshots were created from: + */ +struct bch_snapshot_tree { + struct bch_val v; + __le32 master_subvol; + __le32 root_snapshot; +}; + +#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */ diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h new file mode 100644 index 000000000000..af79134b07d6 --- /dev/null +++ b/fs/bcachefs/subvolume_format.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H +#define _BCACHEFS_SUBVOLUME_FORMAT_H + +#define SUBVOL_POS_MIN POS(0, 1) +#define SUBVOL_POS_MAX POS(0, S32_MAX) +#define BCACHEFS_ROOT_SUBVOL 1 + +struct bch_subvolume { + struct bch_val v; + __le32 flags; + __le32 snapshot; + __le64 inode; + /* + * Snapshot subvolumes form a tree, separate from the snapshot nodes + * tree - if this subvolume is a snapshot, this is the ID of the + * subvolume it was created from: + * + * This is _not_ necessarily the subvolume of the directory containing + * this subvolume: + */ + __le32 parent; + __le32 pad; + bch_le128 otime; +}; + +LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) +/* + * We need to know whether a subvolume is a snapshot so we can know whether we + * can delete it (or whether it should just be rm -rf'd) + */ +LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) +LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) + +#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6d3db5cce5f6..d60c7d27a047 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -2,7 +2,6 @@ #include "bcachefs.h" #include "checksum.h" -#include "counters.h" #include "disk_groups.h" #include "ec.h" #include "error.h" @@ -13,6 +12,7 @@ #include "replicas.h" #include "quota.h" #include "sb-clean.h" +#include "sb-counters.h" #include "sb-downgrade.h" #include "sb-errors.h" #include "sb-members.h" @@ -1321,7 +1321,9 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, "Superblock size:"); prt_tab(out); - prt_printf(out, "%zu", vstruct_bytes(sb)); + prt_units_u64(out, vstruct_bytes(sb)); + prt_str(out, "/"); + prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits); prt_newline(out); prt_printf(out, "Clean:"); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9dbc35940197..b9911402b175 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -23,7 +23,6 @@ #include "checksum.h" #include "clock.h" #include "compress.h" -#include "counters.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" @@ -49,6 +48,7 @@ #include "recovery.h" #include "replicas.h" #include "sb-clean.h" +#include "sb-counters.h" #include "sb-errors.h" #include "sb-members.h" #include "snapshot.h" @@ -883,7 +883,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, - btree_bytes(c)) || + c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { @@ -1386,8 +1386,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) prt_bdevname(&name, ca->disk_sb.bdev); if (c->sb.nr_devices == 1) - strlcpy(c->name, name.buf, sizeof(c->name)); - strlcpy(ca->name, name.buf, sizeof(ca->name)); + strscpy(c->name, name.buf, sizeof(c->name)); + strscpy(ca->name, name.buf, sizeof(ca->name)); printbuf_exit(&name); @@ -1625,7 +1625,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) if (data) { struct printbuf data_has = PRINTBUF; - prt_bitflags(&data_has, bch2_data_types, data); + prt_bitflags(&data_has, __bch2_data_types, data); bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); printbuf_exit(&data_has); ret = -EBUSY; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 8ed52319ff68..cee80c47feea 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -21,6 +21,7 @@ #include "btree_gc.h" #include "buckets.h" #include "clock.h" +#include "compress.h" #include "disk_groups.h" #include "ec.h" #include "inode.h" @@ -247,7 +248,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) mutex_lock(&c->btree_cache.lock); list_for_each_entry(b, &c->btree_cache.live, list) - ret += btree_bytes(c); + ret += btree_buf_bytes(b); mutex_unlock(&c->btree_cache.lock); return ret; @@ -330,7 +331,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c prt_newline(out); for (unsigned i = 0; i < ARRAY_SIZE(s); i++) { - prt_str(out, bch2_compression_types[i]); + bch2_prt_compression_type(out, i); prt_tab(out); prt_human_readable_u64(out, s[i].sectors_compressed << 9); @@ -725,8 +726,10 @@ STORE(bch2_fs_opts_dir) bch2_opt_set_sb(c, opt, v); bch2_opt_set_by_id(&c->opts, id, v); - if ((id == Opt_background_target || - id == Opt_background_compression) && v) + if (v && + (id == Opt_background_target || + id == Opt_background_compression || + (id == Opt_compression && !c->opts.background_compression))) bch2_set_rebalance_needs_scan(c, 0); ret = size; @@ -883,7 +886,7 @@ static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca) for (i = 1; i < BCH_DATA_NR; i++) prt_printf(out, "%-12s:%12llu\n", - bch2_data_types[i], + bch2_data_type_str(i), percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); } } @@ -908,7 +911,7 @@ SHOW(bch2_dev) } if (attr == &sysfs_has_data) { - prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca)); + prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca)); prt_char(out, '\n'); } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index c94876b3bb06..293b90d704fb 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -46,7 +46,7 @@ DECLARE_EVENT_CLASS(fs_str, __assign_str(str, str); ), - TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) + TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) ); DECLARE_EVENT_CLASS(trans_str, @@ -273,28 +273,14 @@ DEFINE_EVENT(bch_fs, journal_full, TP_ARGS(c) ); -DEFINE_EVENT(bch_fs, journal_entry_full, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(fs_str, journal_entry_full, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -TRACE_EVENT(journal_entry_close, - TP_PROTO(struct bch_fs *c, unsigned bytes), - TP_ARGS(c, bytes), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u32, bytes ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->bytes = bytes; - ), - - TP_printk("%d,%d entry bytes %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->bytes) +DEFINE_EVENT(fs_str, journal_entry_close, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(bio, journal_write, @@ -542,7 +528,7 @@ TRACE_EVENT(btree_path_relock_fail, __entry->level = path->level; TRACE_BPOS_assign(pos, path->pos); - c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), + c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level); __entry->self_read_count = c.n[SIX_LOCK_read]; __entry->self_intent_count = c.n[SIX_LOCK_intent]; @@ -827,40 +813,28 @@ TRACE_EVENT(bucket_evacuate, ); DEFINE_EVENT(fs_str, move_extent, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_read, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_write, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_finish, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -TRACE_EVENT(move_extent_fail, - TP_PROTO(struct bch_fs *c, const char *msg), - TP_ARGS(c, msg), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __string(msg, msg ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __assign_str(msg, msg); - ), - - TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) +DEFINE_EVENT(fs_str, move_extent_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_start_fail, @@ -1039,7 +1013,7 @@ TRACE_EVENT(trans_restart_split_race, __entry->level = b->c.level; __entry->written = b->written; __entry->blocks = btree_blocks(trans->c); - __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b); + __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b); ), TP_printk("%s %pS l=%u written %u/%u u64s remaining %u", @@ -1146,8 +1120,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans, caller_ip, path) ); -struct get_locks_fail; - TRACE_EVENT(trans_restart_upgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1195,11 +1167,9 @@ TRACE_EVENT(trans_restart_upgrade, __entry->node_seq) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) +DEFINE_EVENT(trans_str, trans_restart_relock, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str), + TP_ARGS(trans, caller_ip, str) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index c2ef7cddaa4f..a135136adeee 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -241,12 +241,17 @@ bool bch2_is_zero(const void *_p, size_t n) return true; } -void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) +void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits) { while (nr_bits) prt_char(out, '0' + ((v >> --nr_bits) & 1)); } +void bch2_prt_u64_base2(struct printbuf *out, u64 v) +{ + bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); +} + void bch2_print_string_as_lines(const char *prefix, const char *lines) { const char *p; @@ -1186,7 +1191,9 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret) { darray_init(ret); - char *dev_name = kstrdup(_dev_name, GFP_KERNEL), *s = dev_name; + char *dev_name, *s, *orig; + + dev_name = orig = kstrdup(_dev_name, GFP_KERNEL); if (!dev_name) return -ENOMEM; @@ -1201,10 +1208,10 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret) } } - kfree(dev_name); + kfree(orig); return 0; err: bch2_darray_str_exit(ret); - kfree(dev_name); + kfree(orig); return -ENOMEM; } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index c75fc31915d3..df67bf55fe2b 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -342,7 +342,8 @@ bool bch2_is_zero(const void *, size_t); u64 bch2_read_flag_list(char *, const char * const[]); -void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); +void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); +void bch2_prt_u64_base2(struct printbuf *, u64); void bch2_print_string_as_lines(const char *prefix, const char *lines); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 5a1858fb9879..9c0d2316031b 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -590,8 +590,9 @@ err: mutex_unlock(&inode->ei_update_lock); if (value && - (opt_id == Opt_background_compression || - opt_id == Opt_background_target)) + (opt_id == Opt_background_target || + opt_id == Opt_background_compression || + (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); return bch2_err_class(ret); diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h new file mode 100644 index 000000000000..e9f810539552 --- /dev/null +++ b/fs/bcachefs/xattr_format.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_XATTR_FORMAT_H +#define _BCACHEFS_XATTR_FORMAT_H + +#define KEY_TYPE_XATTR_INDEX_USER 0 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 +#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 +#define KEY_TYPE_XATTR_INDEX_SECURITY 4 + +struct bch_xattr { + struct bch_val v; + __u8 x_type; + __u8 x_name_len; + __le16 x_val_len; + __u8 x_name[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_XATTR_FORMAT_H */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 193168214eeb..68345f73d429 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -141,16 +141,16 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + const u8 *data_in, struct page *dest_page, + unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default: /* @@ -1037,14 +1037,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * start_byte tells us the offset into the compressed data we're interested in */ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + unsigned long dest_pgoff, size_t srclen, size_t destlen) { + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); struct list_head *workspace; + const u32 sectorsize = fs_info->sectorsize; int ret; + /* + * The full destination page range should not exceed the page size. + * And the @destlen should not exceed sectorsize, as this is only called for + * inline file extents, which should not exceed sectorsize. + */ + ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); + workspace = get_workspace(type, 0); ret = compression_decompress(type, workspace, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); put_workspace(type, workspace); return ret; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 93cc92974dee..afd7e50d073d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -148,7 +148,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); @@ -159,7 +159,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f396aba92c57..8e8cc1111277 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1260,7 +1260,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 bytes_left, end; u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); - if (WARN_ON(start != aligned_start)) { + /* Adjust the range to be aligned to 512B sectors if necessary. */ + if (start != aligned_start) { len -= aligned_start - start; len = round_down(len, 1 << SECTOR_SHIFT); start = aligned_start; @@ -4298,6 +4299,42 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, return 0; } +static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } else if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + struct btrfs_block_group *block_group; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + /* + * No lock is OK here because avail is monotinically + * decreasing, and this is just a hint. + */ + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; + } + } + spin_unlock(&fs_info->zone_active_bgs_lock); + } + + return 0; +} + static int prepare_allocation(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4308,19 +4345,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - if (ffe_ctl->for_treelog) { - spin_lock(&fs_info->treelog_bg_lock); - if (fs_info->treelog_bg) - ffe_ctl->hint_byte = fs_info->treelog_bg; - spin_unlock(&fs_info->treelog_bg_lock); - } - if (ffe_ctl->for_data_reloc) { - spin_lock(&fs_info->relocation_bg_lock); - if (fs_info->data_reloc_bg) - ffe_ctl->hint_byte = fs_info->data_reloc_bg; - spin_unlock(&fs_info->relocation_bg_lock); - } - return 0; + return prepare_allocation_zoned(fs_info, ffe_ctl); default: BUG(); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 809b11472a80..1eb93d3962aa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4458,6 +4458,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) u64 root_flags; int ret; + down_write(&fs_info->subvol_sem); + /* * Don't allow to delete a subvolume with send in progress. This is * inside the inode lock so the error handling that has to drop the bit @@ -4469,25 +4471,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } if (atomic_read(&dest->nr_swapfiles)) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", root->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - down_write(&fs_info->subvol_sem); - ret = may_destroy_subvol(dest); if (ret) - goto out_up_write; + goto out_undead; btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -4497,7 +4499,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) - goto out_up_write; + goto out_undead; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -4563,15 +4565,17 @@ out_end_trans: inode->i_flags |= S_DEAD; out_release: btrfs_subvolume_release_metadata(root, &block_rsv); -out_up_write: - up_write(&fs_info->subvol_sem); +out_undead: if (ret) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - } else { + } +out_up_write: + up_write(&fs_info->subvol_sem); + if (!ret) { d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 41b479861b3c..dfed9dd9c2d7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, return -EOPNOTSUPP; } + if (btrfs_root_refs(&root->root_item) == 0) + return -ENOENT; + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; @@ -2608,6 +2611,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EFAULT; goto out; } + if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) { + ret = -EOPNOTSUPP; + goto out; + } /* compression requires us to start the IO */ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { range.flags |= BTRFS_DEFRAG_RANGE_START_IO; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 1131d5a29d61..e43bc0fdc74e 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; size_t max_segment_len = WORKSPACE_BUF_LENGTH; int ret = 0; - char *kaddr; - unsigned long bytes; if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) return -EUCLEAN; @@ -451,7 +451,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, } data_in += LZO_LEN; - out_len = PAGE_SIZE; + out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { pr_warn("BTRFS: decompress failed!\n"); @@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, goto out; } - if (out_len < start_byte) { + ASSERT(out_len <= sectorsize); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len); + /* Early end, considered as an error. */ + if (unlikely(out_len < destlen)) { ret = -EIO; - goto out; + memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len); } - - /* - * the caller is already checking against PAGE_SIZE, but lets - * move this check closer to the memcpy/memset - */ - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes = min_t(unsigned long, destlen, out_len - start_byte); - - kaddr = kmap_local_page(dest_page); - memcpy(kaddr, workspace->buf + start_byte, bytes); - - /* - * btrfs_getblock is doing a zero on the tail of the page too, - * but this will cover anything missing from the decompressed - * data. - */ - if (bytes < destlen) - memset(kaddr+bytes, 0, destlen-bytes); - kunmap_local(kaddr); out: return ret; } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 6486f0d7e993..8c4fc98ca9ce 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, out_unlock: spin_unlock(&fs_info->ref_verify_lock); out: - if (ret) + if (ret) { + btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + } return ret; } @@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) } } if (ret) { - btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); btrfs_free_ref_cache(fs_info); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } btrfs_free_path(path); return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a01807cbd4d4..0123d2728923 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1098,12 +1098,22 @@ out: static void scrub_read_endio(struct btrfs_bio *bbio) { struct scrub_stripe *stripe = bbio->private; + struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + int num_sectors; + u32 bio_size = 0; + int i; + + ASSERT(sector_nr < stripe->nr_sectors); + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; + num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors); - bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors); + bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); + bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); } else { - bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors); + bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1636,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; @@ -1646,6 +1659,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, struct page *page = scrub_stripe_get_page(stripe, i); unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); + /* We're beyond the chunk boundary, no need to read anymore. */ + if (i >= nr_sectors) + break; + /* The current sector cannot be merged, submit the bio. */ if (bbio && ((i > 0 && @@ -1701,6 +1718,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; int mirror = stripe->mirror_num; ASSERT(stripe->bg); @@ -1715,14 +1735,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, scrub_read_endio, stripe); - /* Read the whole stripe. */ bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; - for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) { + /* Read the whole range inside the chunk boundary. */ + for (unsigned int cur = 0; cur < nr_sectors; cur++) { + struct page *page = scrub_stripe_get_page(stripe, cur); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); int ret; - ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0); + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); /* We should have allocated enough bio vectors. */ - ASSERT(ret == PAGE_SIZE); + ASSERT(ret == fs_info->sectorsize); } atomic_inc(&stripe->pending_io); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4e36550618e5..2d7519a6ce72 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -8205,8 +8205,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), - arg->clone_sources_count + 1, + sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1, + sizeof(*sctx->clone_roots), GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 93511d54abf8..0e49dab8dad2 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -475,7 +475,8 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - folio_start_writeback(folio); + if (!folio_test_writeback(folio)) + folio_start_writeback(folio); spin_unlock_irqrestore(&subpage->lock, flags); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 896acfda1789..101f786963d4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1457,6 +1457,14 @@ static int btrfs_reconfigure(struct fs_context *fc) btrfs_info_to_ctx(fs_info, &old_ctx); + /* + * This is our "bind mount" trick, we don't want to allow the user to do + * anything other than mount a different ro/rw and a different subvol, + * all of the mount options should be maintained. + */ + if (mount_reconfigure) + ctx->mount_opt = old_ctx.mount_opt; + sync_filesystem(sb); set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 50fdc69fdddf..6eccf8496486 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1436,7 +1436,7 @@ static int check_extent_item(struct extent_buffer *leaf, if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", - ptr, inline_type, end); + ptr, btrfs_extent_inline_ref_size(inline_type), end); return -EUCLEAN; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4c32497311d2..d67785be2c77 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3087,7 +3087,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, map = btrfs_find_chunk_map(fs_info, logical, length); if (unlikely(!map)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); @@ -3095,7 +3094,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, } if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", logical, logical + length, map->start, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 36cf1f0e338e..8da66ea699e8 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -354,18 +354,13 @@ done: } int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; int wbits = MAX_WBITS; - unsigned long bytes_left; - unsigned long total_out = 0; - unsigned long pg_offset = 0; - - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes_left = destlen; + unsigned long to_copy; workspace->strm.next_in = data_in; workspace->strm.avail_in = srclen; @@ -390,60 +385,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, return -EIO; } - while (bytes_left > 0) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); - if (ret != Z_OK && ret != Z_STREAM_END) - break; - - buf_start = total_out; - total_out = workspace->strm.total_out; - - if (total_out == buf_start) { - ret = -EIO; - break; - } - - if (total_out <= start_byte) - goto next; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min(PAGE_SIZE - pg_offset, - PAGE_SIZE - (buf_offset % PAGE_SIZE)); - bytes = min(bytes, bytes_left); + /* + * Everything (in/out buf) should be at most one sector, there should + * be no need to switch any input/output buffer. + */ + ret = zlib_inflate(&workspace->strm, Z_FINISH); + to_copy = min(workspace->strm.total_out, destlen); + if (ret != Z_STREAM_END) + goto out; - memcpy_to_page(dest_page, pg_offset, - workspace->buf + buf_offset, bytes); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy); - pg_offset += bytes; - bytes_left -= bytes; -next: - workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = workspace->buf_size; - } - - if (ret != Z_STREAM_END && bytes_left != 0) +out: + if (unlikely(to_copy != destlen)) { + pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n", + to_copy, destlen); ret = -EIO; - else + } else { ret = 0; + } zlib_inflateEnd(&workspace->strm); - /* - * this should only happen if zlib returned fewer bytes than we - * expected. btrfs_get_block is responsible for zeroing from the - * end of the inline extent (destlen) to the end of the page - */ - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); - } + if (unlikely(to_copy < destlen)) + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 5bd76813b23f..168af9d000d1 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2055,6 +2055,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; + spin_lock(&fs_info->zone_active_bgs_lock); spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; @@ -2067,7 +2068,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } - spin_lock(&fs_info->zone_active_bgs_lock); for (i = 0; i < map->num_stripes; i++) { struct btrfs_zoned_device_info *zinfo; int reserved = 0; @@ -2087,20 +2087,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) */ if (atomic_read(&zinfo->active_zones_left) <= reserved) { ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!is_data) zinfo->reserved_active_zones--; } - spin_unlock(&fs_info->zone_active_bgs_lock); /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); @@ -2108,8 +2105,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) /* For the active block group list */ btrfs_get_block_group(block_group); - - spin_lock(&fs_info->zone_active_bgs_lock); list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); @@ -2117,6 +2112,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&fs_info->zone_active_bgs_lock); return ret; } diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index 8df715640a48..c5a070550ee3 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -2,7 +2,7 @@ config CACHEFILES tristate "Filesystem caching on files" - depends on FSCACHE && BLOCK + depends on NETFS_SUPPORT && FSCACHE && BLOCK help This permits use of a mounted filesystem as a cache for other filesystems - primarily networking filesystems - thus allowing fast diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 4a87c9d714a9..d33169f0018b 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -246,7 +246,7 @@ extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres, enum fscache_want_state want_state); extern int __cachefiles_prepare_write(struct cachefiles_object *object, struct file *file, - loff_t *_start, size_t *_len, + loff_t *_start, size_t *_len, size_t upper_len, bool no_space_allocated_yet); extern int __cachefiles_write(struct cachefiles_object *object, struct file *file, diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 5857241c5918..1d685357e67f 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -517,18 +517,26 @@ cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres, */ int __cachefiles_prepare_write(struct cachefiles_object *object, struct file *file, - loff_t *_start, size_t *_len, + loff_t *_start, size_t *_len, size_t upper_len, bool no_space_allocated_yet) { struct cachefiles_cache *cache = object->volume->cache; loff_t start = *_start, pos; - size_t len = *_len, down; + size_t len = *_len; int ret; /* Round to DIO size */ - down = start - round_down(start, PAGE_SIZE); - *_start = start - down; - *_len = round_up(down + len, PAGE_SIZE); + start = round_down(*_start, PAGE_SIZE); + if (start != *_start || *_len > upper_len) { + /* Probably asked to cache a streaming write written into the + * pagecache when the cookie was temporarily out of service to + * culling. + */ + fscache_count_dio_misfit(); + return -ENOBUFS; + } + + *_len = round_up(len, PAGE_SIZE); /* We need to work out whether there's sufficient disk space to perform * the write - but we can skip that check if we have space already @@ -539,7 +547,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, pos = cachefiles_inject_read_error(); if (pos == 0) - pos = vfs_llseek(file, *_start, SEEK_DATA); + pos = vfs_llseek(file, start, SEEK_DATA); if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { if (pos == -ENXIO) goto check_space; /* Unallocated tail */ @@ -547,7 +555,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, cachefiles_trace_seek_error); return pos; } - if ((u64)pos >= (u64)*_start + *_len) + if ((u64)pos >= (u64)start + *_len) goto check_space; /* Unallocated region */ /* We have a block that's at least partially filled - if we're low on @@ -560,13 +568,13 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, pos = cachefiles_inject_read_error(); if (pos == 0) - pos = vfs_llseek(file, *_start, SEEK_HOLE); + pos = vfs_llseek(file, start, SEEK_HOLE); if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { trace_cachefiles_io_error(object, file_inode(file), pos, cachefiles_trace_seek_error); return pos; } - if ((u64)pos >= (u64)*_start + *_len) + if ((u64)pos >= (u64)start + *_len) return 0; /* Fully allocated */ /* Partially allocated, but insufficient space: cull. */ @@ -574,7 +582,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, ret = cachefiles_inject_remove_error(); if (ret == 0) ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - *_start, *_len); + start, *_len); if (ret < 0) { trace_cachefiles_io_error(object, file_inode(file), ret, cachefiles_trace_fallocate_error); @@ -591,8 +599,8 @@ check_space: } static int cachefiles_prepare_write(struct netfs_cache_resources *cres, - loff_t *_start, size_t *_len, loff_t i_size, - bool no_space_allocated_yet) + loff_t *_start, size_t *_len, size_t upper_len, + loff_t i_size, bool no_space_allocated_yet) { struct cachefiles_object *object = cachefiles_cres_object(cres); struct cachefiles_cache *cache = object->volume->cache; @@ -608,7 +616,7 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres, cachefiles_begin_secure(cache, &saved_cred); ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres), - _start, _len, + _start, _len, upper_len, no_space_allocated_yet); cachefiles_end_secure(cache, saved_cred); return ret; diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index b8fbbb1961bb..4ba42f1fa3b4 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -50,7 +50,7 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, return -ENOBUFS; cachefiles_begin_secure(cache, &saved_cred); - ret = __cachefiles_prepare_write(object, file, &pos, &len, true); + ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true); cachefiles_end_secure(cache, saved_cred); if (ret < 0) return ret; @@ -539,6 +539,9 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object) struct fscache_volume *volume = object->volume->vcookie; size_t volume_key_size, cookie_key_size, data_len; + if (!object->ondemand) + return 0; + /* * CacheFiles will firstly check the cache file under the root cache * directory. If the coherency check failed, it will fallback to diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 94df854147d3..7249d70e1a43 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -7,6 +7,7 @@ config CEPH_FS select CRYPTO_AES select CRYPTO select NETFS_SUPPORT + select FS_ENCRYPTION_ALGS if FS_ENCRYPTION default n help Choose Y or M here to include support for mounting the diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 13af429ab030..1340d77124ae 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -159,27 +159,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset, ceph_put_snap_context(snapc); } - folio_wait_fscache(folio); -} - -static bool ceph_release_folio(struct folio *folio, gfp_t gfp) -{ - struct inode *inode = folio->mapping->host; - struct ceph_client *cl = ceph_inode_to_client(inode); - - doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode), - folio->index, folio_test_dirty(folio) ? "" : "not "); - - if (folio_test_private(folio)) - return false; - - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - ceph_fscache_note_page_release(inode); - return true; + netfs_invalidate_folio(folio, offset, length); } static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) @@ -357,6 +337,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) u64 len = subreq->len; bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); u64 off = subreq->start; + int extent_cnt; if (ceph_inode_is_shutdown(inode)) { err = -EIO; @@ -370,8 +351,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ, - CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, - NULL, ci->i_truncate_seq, ci->i_truncate_size, false); + CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); req = NULL; @@ -379,7 +360,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) } if (sparse) { - err = ceph_alloc_sparse_ext_map(&req->r_ops[0]); + extent_cnt = __ceph_sparse_read_ext_count(inode, len); + err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt); if (err) goto out; } @@ -509,7 +491,6 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq) const struct netfs_request_ops ceph_netfs_ops = { .init_request = ceph_init_request, .free_request = ceph_netfs_free_request, - .begin_cache_operation = ceph_begin_cache_operation, .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, @@ -1586,7 +1567,7 @@ const struct address_space_operations ceph_aops = { .write_end = ceph_write_end, .dirty_folio = ceph_dirty_folio, .invalidate_folio = ceph_invalidate_folio, - .release_folio = ceph_release_folio, + .release_folio = netfs_release_folio, .direct_IO = noop_direct_IO, }; diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index dc502daac49a..20efac020394 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -43,38 +43,19 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to) } } -static inline void ceph_fscache_unpin_writeback(struct inode *inode, +static inline int ceph_fscache_unpin_writeback(struct inode *inode, struct writeback_control *wbc) { - fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode))); + return netfs_unpin_writeback(inode, wbc); } -static inline int ceph_fscache_dirty_folio(struct address_space *mapping, - struct folio *folio) -{ - struct ceph_inode_info *ci = ceph_inode(mapping->host); - - return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci)); -} - -static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) -{ - struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode)); - - return fscache_begin_read_operation(&rreq->cache_resources, cookie); -} +#define ceph_fscache_dirty_folio netfs_dirty_folio static inline bool ceph_is_cache_enabled(struct inode *inode) { return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode))); } -static inline void ceph_fscache_note_page_release(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - - fscache_note_page_release(ceph_fscache_cookie(ci)); -} #else /* CONFIG_CEPH_FSCACHE */ static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) @@ -119,30 +100,18 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to) { } -static inline void ceph_fscache_unpin_writeback(struct inode *inode, - struct writeback_control *wbc) +static inline int ceph_fscache_unpin_writeback(struct inode *inode, + struct writeback_control *wbc) { + return 0; } -static inline int ceph_fscache_dirty_folio(struct address_space *mapping, - struct folio *folio) -{ - return filemap_dirty_folio(mapping, folio); -} +#define ceph_fscache_dirty_folio filemap_dirty_folio static inline bool ceph_is_cache_enabled(struct inode *inode) { return false; } - -static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) -{ - return -ENOBUFS; -} - -static inline void ceph_fscache_note_page_release(struct inode *inode) -{ -} #endif /* CONFIG_CEPH_FSCACHE */ #endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2c0b8dc3dd0d..9c02f328c966 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4887,13 +4887,15 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, struct inode *dir, int mds, int drop, int unless) { - struct dentry *parent = NULL; struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); struct ceph_client *cl; int force = 0; int ret; + /* This shouldn't happen */ + BUG_ON(!dir); + /* * force an record for the directory caps if we have a dentry lease. * this is racy (can't take i_ceph_lock and d_lock together), but it @@ -4903,14 +4905,9 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, spin_lock(&dentry->d_lock); if (di->lease_session && di->lease_session->s_mds == mds) force = 1; - if (!dir) { - parent = dget(dentry->d_parent); - dir = d_inode(parent); - } spin_unlock(&dentry->d_lock); ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); - dput(parent); cl = ceph_inode_to_client(dir); spin_lock(&dentry->d_lock); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 678596684596..0e9f56eaba1e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1593,10 +1593,12 @@ struct ceph_lease_walk_control { unsigned long dir_lease_ttl; }; +static int __dir_lease_check(const struct dentry *, struct ceph_lease_walk_control *); +static int __dentry_lease_check(const struct dentry *); + static unsigned long __dentry_leases_walk(struct ceph_mds_client *mdsc, - struct ceph_lease_walk_control *lwc, - int (*check)(struct dentry*, void*)) + struct ceph_lease_walk_control *lwc) { struct ceph_dentry_info *di, *tmp; struct dentry *dentry, *last = NULL; @@ -1624,7 +1626,10 @@ __dentry_leases_walk(struct ceph_mds_client *mdsc, goto next; } - ret = check(dentry, lwc); + if (lwc->dir_lease) + ret = __dir_lease_check(dentry, lwc); + else + ret = __dentry_lease_check(dentry); if (ret & TOUCH) { /* move it into tail of dir lease list */ __dentry_dir_lease_touch(mdsc, di); @@ -1681,7 +1686,7 @@ next: return freed; } -static int __dentry_lease_check(struct dentry *dentry, void *arg) +static int __dentry_lease_check(const struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); int ret; @@ -1696,9 +1701,9 @@ static int __dentry_lease_check(struct dentry *dentry, void *arg) return DELETE; } -static int __dir_lease_check(struct dentry *dentry, void *arg) +static int __dir_lease_check(const struct dentry *dentry, + struct ceph_lease_walk_control *lwc) { - struct ceph_lease_walk_control *lwc = arg; struct ceph_dentry_info *di = ceph_dentry(dentry); int ret = __dir_lease_try_check(dentry); @@ -1737,7 +1742,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc) lwc.dir_lease = false; lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2; - freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check); + freed = __dentry_leases_walk(mdsc, &lwc); if (!lwc.nr_to_scan) /* more invalid leases */ return -EAGAIN; @@ -1747,7 +1752,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc) lwc.dir_lease = true; lwc.expire_dir_lease = freed < count; lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ; - freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check); + freed +=__dentry_leases_walk(mdsc, &lwc); if (!lwc.nr_to_scan) /* more to check */ return -EAGAIN; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 726af69d4d62..a79f163ae4ed 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -286,8 +286,6 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err); } - if (IS_ERR(inode)) - return ERR_CAST(inode); /* see comments in ceph_get_parent() */ return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d380d9dad0e0..abe8028d95bf 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1029,6 +1029,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, struct ceph_osd_req_op *op; u64 read_off = off; u64 read_len = len; + int extent_cnt; /* determine new offset/length if encrypted */ ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len); @@ -1068,7 +1069,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, op = &req->r_ops[0]; if (sparse) { - ret = ceph_alloc_sparse_ext_map(op); + extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); if (ret) { ceph_osdc_put_request(req); break; @@ -1465,6 +1467,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ssize_t len; struct ceph_osd_req_op *op; int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ; + int extent_cnt; if (write) size = min_t(u64, size, fsc->mount_options->wsize); @@ -1528,7 +1531,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); op = &req->r_ops[0]; if (sparse) { - ret = ceph_alloc_sparse_ext_map(op); + extent_cnt = __ceph_sparse_read_ext_count(inode, size); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); if (ret) { ceph_osdc_put_request(req); break; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0679240f06db..0c25d326afc4 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -574,7 +574,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) doutc(fsc->client, "%p\n", &ci->netfs.inode); /* Set parameters for the netfs library */ - netfs_inode_init(&ci->netfs, &ceph_netfs_ops); + netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); spin_lock_init(&ci->i_ceph_lock); @@ -694,7 +694,7 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_FSCACHE_WB) + if (inode->i_state & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); clear_inode(inode); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 02ebfabfc8ee..548d1de379f3 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1534,7 +1534,8 @@ static int encode_metric_spec(void **p, void *end) * session message, specialization for CEPH_SESSION_REQUEST_OPEN * to include additional client metadata fields. */ -static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq) +static struct ceph_msg * +create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq) { struct ceph_msg *msg; struct ceph_mds_session_head *h; @@ -1578,6 +1579,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 size = METRIC_BYTES(count); extra_bytes += 2 + 4 + 4 + size; + /* flags, mds auth caps and oldest_client_tid */ + extra_bytes += 4 + 4 + 8; + /* Allocate the message */ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, GFP_NOFS, false); @@ -1589,16 +1593,16 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 end = p + msg->front.iov_len; h = p; - h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN); + h->op = cpu_to_le32(op); h->seq = cpu_to_le64(seq); /* * Serialize client metadata into waiting buffer space, using * the format that userspace expects for map<string, string> * - * ClientSession messages with metadata are v4 + * ClientSession messages with metadata are v7 */ - msg->hdr.version = cpu_to_le16(4); + msg->hdr.version = cpu_to_le16(7); msg->hdr.compat_version = cpu_to_le16(1); /* The write pointer, following the session_head structure */ @@ -1634,6 +1638,15 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 return ERR_PTR(ret); } + /* version == 5, flags */ + ceph_encode_32(&p, 0); + + /* version == 6, mds auth caps */ + ceph_encode_32(&p, 0); + + /* version == 7, oldest_client_tid */ + ceph_encode_64(&p, mdsc->oldest_tid); + msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -1663,7 +1676,8 @@ static int __open_session(struct ceph_mds_client *mdsc, session->s_renew_requested = jiffies; /* send connect message */ - msg = create_session_open_msg(mdsc, session->s_seq); + msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_OPEN, + session->s_seq); if (IS_ERR(msg)) return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); @@ -2028,10 +2042,10 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, doutc(cl, "to mds%d (%s)\n", session->s_mds, ceph_mds_state_name(state)); - msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, + msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_RENEWCAPS, ++session->s_renew_seq); - if (!msg) - return -ENOMEM; + if (IS_ERR(msg)) + return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); return 0; } @@ -4128,12 +4142,12 @@ static void handle_session(struct ceph_mds_session *session, pr_info_client(cl, "mds%d reconnect success\n", session->s_mds); + session->s_features = features; if (session->s_state == CEPH_MDS_SESSION_OPEN) { pr_notice_client(cl, "mds%d is already opened\n", session->s_mds); } else { session->s_state = CEPH_MDS_SESSION_OPEN; - session->s_features = features; renewed_caps(mdsc, session, 0); if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) @@ -5870,7 +5884,8 @@ static void mds_peer_reset(struct ceph_connection *con) pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n", s->s_mds); - if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO) + if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO && + ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT) send_mds_reconnect(mdsc, s); } diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 9d36c3532de1..06ee397e0c3a 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -197,10 +197,10 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) } /* - * This function walks through the snaprealm for an inode and returns the - * ceph_snap_realm for the first snaprealm that has quotas set (max_files, + * This function walks through the snaprealm for an inode and set the + * realmp with the first snaprealm that has quotas set (max_files, * max_bytes, or any, depending on the 'which_quota' argument). If the root is - * reached, return the root ceph_snap_realm instead. + * reached, set the realmp with the root ceph_snap_realm instead. * * Note that the caller is responsible for calling ceph_put_snap_realm() on the * returned realm. @@ -211,10 +211,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) * this function will return -EAGAIN; otherwise, the snaprealms walk-through * will be restarted. */ -static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, - struct inode *inode, - enum quota_get_realm which_quota, - bool retry) +static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode, + enum quota_get_realm which_quota, + struct ceph_snap_realm **realmp, bool retry) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci = NULL; @@ -222,8 +221,10 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, struct inode *in; bool has_quota; + if (realmp) + *realmp = NULL; if (ceph_snap(inode) != CEPH_NOSNAP) - return NULL; + return 0; restart: realm = ceph_inode(inode)->i_snap_realm; @@ -250,7 +251,7 @@ restart: break; ceph_put_snap_realm(mdsc, realm); if (!retry) - return ERR_PTR(-EAGAIN); + return -EAGAIN; goto restart; } @@ -259,8 +260,11 @@ restart: iput(in); next = realm->parent; - if (has_quota || !next) - return realm; + if (has_quota || !next) { + if (realmp) + *realmp = realm; + return 0; + } ceph_get_snap_realm(mdsc, next); ceph_put_snap_realm(mdsc, realm); @@ -269,7 +273,7 @@ restart: if (realm) ceph_put_snap_realm(mdsc, realm); - return NULL; + return 0; } bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) @@ -277,6 +281,7 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb); struct ceph_snap_realm *old_realm, *new_realm; bool is_same; + int ret; restart: /* @@ -286,9 +291,9 @@ restart: * dropped and we can then restart the whole operation. */ down_read(&mdsc->snap_rwsem); - old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true); - new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false); - if (PTR_ERR(new_realm) == -EAGAIN) { + get_quota_realm(mdsc, old, QUOTA_GET_ANY, &old_realm, true); + ret = get_quota_realm(mdsc, new, QUOTA_GET_ANY, &new_realm, false); + if (ret == -EAGAIN) { up_read(&mdsc->snap_rwsem); if (old_realm) ceph_put_snap_realm(mdsc, old_realm); @@ -492,8 +497,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) bool is_updated = false; down_read(&mdsc->snap_rwsem); - realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), - QUOTA_GET_MAX_BYTES, true); + get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES, + &realm, true); up_read(&mdsc->snap_rwsem); if (!realm) return false; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index fe0f64a0acb2..b06e2bc86221 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -3,6 +3,7 @@ #define _FS_CEPH_SUPER_H #include <linux/ceph/ceph_debug.h> +#include <linux/ceph/osd_client.h> #include <asm/unaligned.h> #include <linux/backing-dev.h> @@ -1407,6 +1408,19 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci, ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota); } +static inline int __ceph_sparse_read_ext_count(struct inode *inode, u64 len) +{ + int cnt = 0; + + if (IS_ENCRYPTED(inode)) { + cnt = len >> CEPH_FSCRYPT_BLOCK_SHIFT; + if (cnt > CEPH_SPARSE_EXT_ARRAY_INITIAL) + cnt = 0; + } + + return cnt; +} + extern void ceph_handle_quota(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 1d318f85232d..fffd3919343e 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -114,8 +114,11 @@ config EROFS_FS_ZIP_DEFLATE config EROFS_FS_ONDEMAND bool "EROFS fscache-based on-demand read support" - depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y) - default n + depends on EROFS_FS + select NETFS_SUPPORT + select FSCACHE + select CACHEFILES + select CACHEFILES_ONDEMAND help This permits EROFS to use fscache-backed data blobs with on-demand read support. diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 1d65b9f60a39..072ef6a66823 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -408,7 +408,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) int size, ret = 0; if (!erofs_sb_has_compr_cfgs(sbi)) { - sbi->available_compr_algs = Z_EROFS_COMPRESSION_LZ4; + sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4; return z_erofs_load_lz4_config(sb, dsb, NULL, 0); } diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 87ff35bff8d5..bc12030393b2 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -165,10 +165,10 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) { int ret; - struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private; + struct erofs_fscache *ctx = folio->mapping->host->i_private; struct erofs_fscache_request *req; - req = erofs_fscache_req_alloc(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio->mapping, folio_pos(folio), folio_size(folio)); if (IS_ERR(req)) { folio_unlock(folio); @@ -276,7 +276,7 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio) struct erofs_fscache_request *req; int ret; - req = erofs_fscache_req_alloc(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio->mapping, folio_pos(folio), folio_size(folio)); if (IS_ERR(req)) { folio_unlock(folio); diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 9753875e41cb..e313c936351d 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -454,7 +454,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, .map = map, }; int err = 0; - unsigned int lclusterbits, endoff; + unsigned int lclusterbits, endoff, afmt; unsigned long initial_lcn; unsigned long long ofs, end; @@ -543,17 +543,20 @@ static int z_erofs_do_map_blocks(struct inode *inode, err = -EFSCORRUPTED; goto unmap_out; } - if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) - map->m_algorithmformat = - Z_EROFS_COMPRESSION_INTERLACED; - else - map->m_algorithmformat = - Z_EROFS_COMPRESSION_SHIFTED; - } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) { - map->m_algorithmformat = vi->z_algorithmtype[1]; + afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ? + Z_EROFS_COMPRESSION_INTERLACED : + Z_EROFS_COMPRESSION_SHIFTED; } else { - map->m_algorithmformat = vi->z_algorithmtype[0]; + afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? + vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; + if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { + erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + afmt, vi->nid); + err = -EFSCORRUPTED; + goto unmap_out; + } } + map->m_algorithmformat = afmt; if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && diff --git a/fs/exec.c b/fs/exec.c index 73e4045df271..af4fbb61cd53 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -128,7 +128,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { - .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .open_flag = O_LARGEFILE | O_RDONLY, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, @@ -904,6 +904,10 @@ EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ +/* + * On success, caller must call do_close_execat() on the returned + * struct file to close it. + */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; @@ -948,6 +952,17 @@ exit: return ERR_PTR(err); } +/** + * open_exec - Open a path name for execution + * + * @name: path name to open with the intent of executing it. + * + * Returns ERR_PTR on failure or allocated struct file on success. + * + * As this is a wrapper for the internal do_open_execat(), callers + * must call allow_write_access() before fput() on release. Also see + * do_close_execat(). + */ struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); @@ -1409,6 +1424,9 @@ int begin_new_exec(struct linux_binprm * bprm) out_unlock: up_write(&me->signal->exec_update_lock); + if (!bprm->cred) + mutex_unlock(&me->signal->cred_guard_mutex); + out: return retval; } @@ -1484,6 +1502,15 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) return -ENOMEM; } +/* Matches do_open_execat() */ +static void do_close_execat(struct file *file) +{ + if (!file) + return; + allow_write_access(file); + fput(file); +} + static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { @@ -1495,10 +1522,7 @@ static void free_bprm(struct linux_binprm *bprm) mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } + do_close_execat(bprm->file); if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ @@ -1508,12 +1532,23 @@ static void free_bprm(struct linux_binprm *bprm) kfree(bprm); } -static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) +static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) { - struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + struct linux_binprm *bprm; + struct file *file; int retval = -ENOMEM; - if (!bprm) - goto out; + + file = do_open_execat(fd, filename, flags); + if (IS_ERR(file)) + return ERR_CAST(file); + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) { + do_close_execat(file); + return ERR_PTR(-ENOMEM); + } + + bprm->file = file; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; @@ -1526,18 +1561,28 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) if (!bprm->fdpath) goto out_free; + /* + * Record that a name derived from an O_CLOEXEC fd will be + * inaccessible after exec. This allows the code in exec to + * choose to fail when the executable is not mmaped into the + * interpreter and an open file descriptor is not passed to + * the interpreter. This makes for a better user experience + * than having the interpreter start and then immediately fail + * when it finds the executable is inaccessible. + */ + if (get_close_on_exec(fd)) + bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; + bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); - if (retval) - goto out_free; - return bprm; + if (!retval) + return bprm; out_free: free_bprm(bprm); -out: return ERR_PTR(retval); } @@ -1588,6 +1633,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) } rcu_read_unlock(); + /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else @@ -1804,13 +1850,8 @@ static int exec_binprm(struct linux_binprm *bprm) return 0; } -/* - * sys_execve() executes a new program. - */ -static int bprm_execve(struct linux_binprm *bprm, - int fd, struct filename *filename, int flags) +static int bprm_execve(struct linux_binprm *bprm) { - struct file *file; int retval; retval = prepare_bprm_creds(bprm); @@ -1826,26 +1867,8 @@ static int bprm_execve(struct linux_binprm *bprm, current->in_execve = 1; sched_mm_cid_before_execve(current); - file = do_open_execat(fd, filename, flags); - retval = PTR_ERR(file); - if (IS_ERR(file)) - goto out_unmark; - sched_exec(); - bprm->file = file; - /* - * Record that a name derived from an O_CLOEXEC fd will be - * inaccessible after exec. This allows the code in exec to - * choose to fail when the executable is not mmaped into the - * interpreter and an open file descriptor is not passed to - * the interpreter. This makes for a better user experience - * than having the interpreter start and then immediately fail - * when it finds the executable is inaccessible. - */ - if (bprm->fdpath && get_close_on_exec(fd)) - bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; - /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval) @@ -1875,7 +1898,6 @@ out: if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); -out_unmark: sched_mm_cid_after_execve(current); current->fs->in_exec = 0; current->in_execve = 0; @@ -1910,7 +1932,7 @@ static int do_execveat_common(int fd, struct filename *filename, * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; - bprm = alloc_bprm(fd, filename); + bprm = alloc_bprm(fd, filename, flags); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; @@ -1959,7 +1981,7 @@ static int do_execveat_common(int fd, struct filename *filename, bprm->argc = 1; } - retval = bprm_execve(bprm, fd, filename, flags); + retval = bprm_execve(bprm); out_free: free_bprm(bprm); @@ -1984,7 +2006,7 @@ int kernel_execve(const char *kernel_filename, if (IS_ERR(filename)) return PTR_ERR(filename); - bprm = alloc_bprm(fd, filename); + bprm = alloc_bprm(fd, filename, 0); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; @@ -2019,7 +2041,7 @@ int kernel_execve(const char *kernel_filename, if (retval < 0) goto out_free; - retval = bprm_execve(bprm, fd, filename, 0); + retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1767493dffda..3d84fcc471c6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1675,11 +1675,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) inode->i_state |= I_DIRTY_PAGES; - else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) { + else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) { if (!(inode->i_state & I_DIRTY_PAGES)) { - inode->i_state &= ~I_PINNING_FSCACHE_WB; - wbc->unpinned_fscache_wb = true; - dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */ + inode->i_state &= ~I_PINNING_NETFS_WB; + wbc->unpinned_netfs_wb = true; + dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */ } } @@ -1691,7 +1691,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (ret == 0) ret = err; } - wbc->unpinned_fscache_wb = false; + wbc->unpinned_netfs_wb = false; trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig deleted file mode 100644 index b313a978ae0a..000000000000 --- a/fs/fscache/Kconfig +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config FSCACHE - tristate "General filesystem local caching manager" - select NETFS_SUPPORT - help - This option enables a generic filesystem caching manager that can be - used by various network and other filesystems to cache data locally. - Different sorts of caches can be plugged in, depending on the - resources available. - - See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_STATS - bool "Gather statistical information on local caching" - depends on FSCACHE && PROC_FS - select NETFS_STATS - help - This option causes statistical information to be gathered on local - caching and exported through file: - - /proc/fs/fscache/stats - - The gathering of statistics adds a certain amount of overhead to - execution as there are a quite a few stats gathered, and on a - multi-CPU system these may be on cachelines that keep bouncing - between CPUs. On the other hand, the stats are very useful for - debugging purposes. Saying 'Y' here is recommended. - - See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_DEBUG - bool "Debug FS-Cache" - depends on FSCACHE - help - This permits debugging to be dynamically enabled in the local caching - management module. If this is set, the debugging output may be - enabled by setting bits in /sys/modules/fscache/parameter/debug. - - See Documentation/filesystems/caching/fscache.rst for more information. diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile deleted file mode 100644 index afb090ea16c4..000000000000 --- a/fs/fscache/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for general filesystem caching code -# - -fscache-y := \ - cache.o \ - cookie.o \ - io.o \ - main.o \ - volume.o - -fscache-$(CONFIG_PROC_FS) += proc.o -fscache-$(CONFIG_FSCACHE_STATS) += stats.o - -obj-$(CONFIG_FSCACHE) := fscache.o diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h deleted file mode 100644 index 1336f517e9b1..000000000000 --- a/fs/fscache/internal.h +++ /dev/null @@ -1,277 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* Internal definitions for FS-Cache - * - * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) "FS-Cache: " fmt - -#include <linux/slab.h> -#include <linux/fscache-cache.h> -#include <trace/events/fscache.h> -#include <linux/sched.h> -#include <linux/seq_file.h> - -/* - * cache.c - */ -#ifdef CONFIG_PROC_FS -extern const struct seq_operations fscache_caches_seq_ops; -#endif -bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); -void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); -struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache); -void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where); - -static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache) -{ - return smp_load_acquire(&cache->state); -} - -static inline bool fscache_cache_is_live(const struct fscache_cache *cache) -{ - return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE; -} - -static inline void fscache_set_cache_state(struct fscache_cache *cache, - enum fscache_cache_state new_state) -{ - smp_store_release(&cache->state, new_state); - -} - -static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache, - enum fscache_cache_state old_state, - enum fscache_cache_state new_state) -{ - return try_cmpxchg_release(&cache->state, &old_state, new_state); -} - -/* - * cookie.c - */ -extern struct kmem_cache *fscache_cookie_jar; -#ifdef CONFIG_PROC_FS -extern const struct seq_operations fscache_cookies_seq_ops; -#endif -extern struct timer_list fscache_cookie_lru_timer; - -extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix); -extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie, - enum fscache_access_trace why); - -static inline void fscache_see_cookie(struct fscache_cookie *cookie, - enum fscache_cookie_trace where) -{ - trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref), - where); -} - -/* - * main.c - */ -extern unsigned fscache_debug; - -extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len); - -/* - * proc.c - */ -#ifdef CONFIG_PROC_FS -extern int __init fscache_proc_init(void); -extern void fscache_proc_cleanup(void); -#else -#define fscache_proc_init() (0) -#define fscache_proc_cleanup() do {} while (0) -#endif - -/* - * stats.c - */ -#ifdef CONFIG_FSCACHE_STATS -extern atomic_t fscache_n_volumes; -extern atomic_t fscache_n_volumes_collision; -extern atomic_t fscache_n_volumes_nomem; -extern atomic_t fscache_n_cookies; -extern atomic_t fscache_n_cookies_lru; -extern atomic_t fscache_n_cookies_lru_expired; -extern atomic_t fscache_n_cookies_lru_removed; -extern atomic_t fscache_n_cookies_lru_dropped; - -extern atomic_t fscache_n_acquires; -extern atomic_t fscache_n_acquires_ok; -extern atomic_t fscache_n_acquires_oom; - -extern atomic_t fscache_n_invalidates; - -extern atomic_t fscache_n_relinquishes; -extern atomic_t fscache_n_relinquishes_retire; -extern atomic_t fscache_n_relinquishes_dropped; - -extern atomic_t fscache_n_resizes; -extern atomic_t fscache_n_resizes_null; - -static inline void fscache_stat(atomic_t *stat) -{ - atomic_inc(stat); -} - -static inline void fscache_stat_d(atomic_t *stat) -{ - atomic_dec(stat); -} - -#define __fscache_stat(stat) (stat) - -int fscache_stats_show(struct seq_file *m, void *v); -#else - -#define __fscache_stat(stat) (NULL) -#define fscache_stat(stat) do {} while (0) -#define fscache_stat_d(stat) do {} while (0) -#endif - -/* - * volume.c - */ -#ifdef CONFIG_PROC_FS -extern const struct seq_operations fscache_volumes_seq_ops; -#endif - -struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, - enum fscache_volume_trace where); -void fscache_put_volume(struct fscache_volume *volume, - enum fscache_volume_trace where); -bool fscache_begin_volume_access(struct fscache_volume *volume, - struct fscache_cookie *cookie, - enum fscache_access_trace why); -void fscache_create_volume(struct fscache_volume *volume, bool wait); - - -/*****************************************************************************/ -/* - * debug tracing - */ -#define dbgprintk(FMT, ...) \ - printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) - -#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) -#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) -#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) - -#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__) - -#ifdef __KDEBUG -#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) -#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) -#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) - -#elif defined(CONFIG_FSCACHE_DEBUG) -#define _enter(FMT, ...) \ -do { \ - if (__do_kdebug(ENTER)) \ - kenter(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _leave(FMT, ...) \ -do { \ - if (__do_kdebug(LEAVE)) \ - kleave(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _debug(FMT, ...) \ -do { \ - if (__do_kdebug(DEBUG)) \ - kdebug(FMT, ##__VA_ARGS__); \ -} while (0) - -#else -#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) -#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) -#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) -#endif - -/* - * determine whether a particular optional debugging point should be logged - * - we need to go through three steps to persuade cpp to correctly join the - * shorthand in FSCACHE_DEBUG_LEVEL with its prefix - */ -#define ____do_kdebug(LEVEL, POINT) \ - unlikely((fscache_debug & \ - (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3)))) -#define ___do_kdebug(LEVEL, POINT) \ - ____do_kdebug(LEVEL, POINT) -#define __do_kdebug(POINT) \ - ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT) - -#define FSCACHE_DEBUG_CACHE 0 -#define FSCACHE_DEBUG_COOKIE 1 -#define FSCACHE_DEBUG_OBJECT 2 -#define FSCACHE_DEBUG_OPERATION 3 - -#define FSCACHE_POINT_ENTER 1 -#define FSCACHE_POINT_LEAVE 2 -#define FSCACHE_POINT_DEBUG 4 - -#ifndef FSCACHE_DEBUG_LEVEL -#define FSCACHE_DEBUG_LEVEL CACHE -#endif - -/* - * assertions - */ -#if 1 /* defined(__KDEBUGALL) */ - -#define ASSERT(X) \ -do { \ - if (unlikely(!(X))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - BUG(); \ - } \ -} while (0) - -#define ASSERTCMP(X, OP, Y) \ -do { \ - if (unlikely(!((X) OP (Y)))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - pr_err("%lx " #OP " %lx is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ - BUG(); \ - } \ -} while (0) - -#define ASSERTIF(C, X) \ -do { \ - if (unlikely((C) && !(X))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - BUG(); \ - } \ -} while (0) - -#define ASSERTIFCMP(C, X, OP, Y) \ -do { \ - if (unlikely((C) && !((X) OP (Y)))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - pr_err("%lx " #OP " %lx is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ - BUG(); \ - } \ -} while (0) - -#else - -#define ASSERT(X) do {} while (0) -#define ASSERTCMP(X, OP, Y) do {} while (0) -#define ASSERTIF(C, X) do {} while (0) -#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) - -#endif /* assert or not */ diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig index b4db21022cb4..bec805e0c44c 100644 --- a/fs/netfs/Kconfig +++ b/fs/netfs/Kconfig @@ -21,3 +21,42 @@ config NETFS_STATS multi-CPU system these may be on cachelines that keep bouncing between CPUs. On the other hand, the stats are very useful for debugging purposes. Saying 'Y' here is recommended. + +config FSCACHE + bool "General filesystem local caching manager" + depends on NETFS_SUPPORT + help + This option enables a generic filesystem caching manager that can be + used by various network and other filesystems to cache data locally. + Different sorts of caches can be plugged in, depending on the + resources available. + + See Documentation/filesystems/caching/fscache.rst for more information. + +config FSCACHE_STATS + bool "Gather statistical information on local caching" + depends on FSCACHE && PROC_FS + select NETFS_STATS + help + This option causes statistical information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/stats + + The gathering of statistics adds a certain amount of overhead to + execution as there are a quite a few stats gathered, and on a + multi-CPU system these may be on cachelines that keep bouncing + between CPUs. On the other hand, the stats are very useful for + debugging purposes. Saying 'Y' here is recommended. + + See Documentation/filesystems/caching/fscache.rst for more information. + +config FSCACHE_DEBUG + bool "Debug FS-Cache" + depends on FSCACHE + help + This permits debugging to be dynamically enabled in the local caching + management module. If this is set, the debugging output may be + enabled by setting bits in /sys/modules/fscache/parameter/debug. + + See Documentation/filesystems/caching/fscache.rst for more information. diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index 386d6fb92793..d4d1d799819e 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -2,11 +2,29 @@ netfs-y := \ buffered_read.o \ + buffered_write.o \ + direct_read.o \ + direct_write.o \ io.o \ iterator.o \ + locking.o \ main.o \ - objects.o + misc.o \ + objects.o \ + output.o netfs-$(CONFIG_NETFS_STATS) += stats.o -obj-$(CONFIG_NETFS_SUPPORT) := netfs.o +netfs-$(CONFIG_FSCACHE) += \ + fscache_cache.o \ + fscache_cookie.o \ + fscache_io.o \ + fscache_main.o \ + fscache_volume.o + +ifeq ($(CONFIG_PROC_FS),y) +netfs-$(CONFIG_FSCACHE) += fscache_proc.o +endif +netfs-$(CONFIG_FSCACHE_STATS) += fscache_stats.o + +obj-$(CONFIG_NETFS_SUPPORT) += netfs.o diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 2cd3ccf4c439..3298c29b5548 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -16,6 +16,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; + struct netfs_folio *finfo; struct folio *folio; pgoff_t start_page = rreq->start / PAGE_SIZE; pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; @@ -63,6 +64,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) break; } if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); folio_start_fscache(folio); folio_started = true; } @@ -86,11 +88,20 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) if (!pg_failed) { flush_dcache_folio(folio); + finfo = netfs_folio_info(folio); + if (finfo) { + trace_netfs_folio(folio, netfs_folio_trace_filled_gaps); + if (finfo->netfs_group) + folio_change_private(folio, finfo->netfs_group); + else + folio_detach_private(folio); + kfree(finfo); + } folio_mark_uptodate(folio); } if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio_index(folio) == rreq->no_unlock_folio && + if (folio->index == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) _debug("no unlock"); else @@ -147,6 +158,15 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq, } } +/* + * Begin an operation, and fetch the stored zero point value from the cookie if + * available. + */ +static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx) +{ + return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx)); +} + /** * netfs_readahead - Helper to manage a read request * @ractl: The description of the readahead request @@ -180,11 +200,9 @@ void netfs_readahead(struct readahead_control *ractl) if (IS_ERR(rreq)) return; - if (ctx->ops->begin_cache_operation) { - ret = ctx->ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto cleanup_free; - } + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto cleanup_free; netfs_stat(&netfs_n_rh_readahead); trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), @@ -192,6 +210,10 @@ void netfs_readahead(struct readahead_control *ractl) netfs_rreq_expand(rreq, ractl); + /* Set up the output buffer */ + iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages, + rreq->start, rreq->len); + /* Drop the refs on the folios here rather than in the cache or * filesystem. The locks will be dropped in netfs_rreq_unlock(). */ @@ -199,6 +221,7 @@ void netfs_readahead(struct readahead_control *ractl) ; netfs_begin_read(rreq, false); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); return; cleanup_free: @@ -223,12 +246,13 @@ EXPORT_SYMBOL(netfs_readahead); */ int netfs_read_folio(struct file *file, struct folio *folio) { - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct netfs_io_request *rreq; struct netfs_inode *ctx = netfs_inode(mapping->host); + struct folio *sink = NULL; int ret; - _enter("%lx", folio_index(folio)); + _enter("%lx", folio->index); rreq = netfs_alloc_request(mapping, file, folio_file_pos(folio), folio_size(folio), @@ -238,15 +262,64 @@ int netfs_read_folio(struct file *file, struct folio *folio) goto alloc_error; } - if (ctx->ops->begin_cache_operation) { - ret = ctx->ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto discard; - } + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto discard; netfs_stat(&netfs_n_rh_readpage); trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); - return netfs_begin_read(rreq, true); + + /* Set up the output buffer */ + if (folio_test_dirty(folio)) { + /* Handle someone trying to read from an unflushed streaming + * write. We fiddle the buffer so that a gap at the beginning + * and/or a gap at the end get copied to, but the middle is + * discarded. + */ + struct netfs_folio *finfo = netfs_folio_info(folio); + struct bio_vec *bvec; + unsigned int from = finfo->dirty_offset; + unsigned int to = from + finfo->dirty_len; + unsigned int off = 0, i = 0; + size_t flen = folio_size(folio); + size_t nr_bvec = flen / PAGE_SIZE + 2; + size_t part; + + ret = -ENOMEM; + bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL); + if (!bvec) + goto discard; + + sink = folio_alloc(GFP_KERNEL, 0); + if (!sink) + goto discard; + + trace_netfs_folio(folio, netfs_folio_trace_read_gaps); + + rreq->direct_bv = bvec; + rreq->direct_bv_count = nr_bvec; + if (from > 0) { + bvec_set_folio(&bvec[i++], folio, from, 0); + off = from; + } + while (off < to) { + part = min_t(size_t, to - off, PAGE_SIZE); + bvec_set_folio(&bvec[i++], sink, part, 0); + off += part; + } + if (to < flen) + bvec_set_folio(&bvec[i++], folio, flen - to, to); + iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len); + } else { + iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, + rreq->start, rreq->len); + } + + ret = netfs_begin_read(rreq, true); + if (sink) + folio_put(sink); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + return ret < 0 ? ret : 0; discard: netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); @@ -387,14 +460,12 @@ retry: ret = PTR_ERR(rreq); goto error; } - rreq->no_unlock_folio = folio_index(folio); + rreq->no_unlock_folio = folio->index; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); - if (ctx->ops->begin_cache_operation) { - ret = ctx->ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto error_put; - } + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto error_put; netfs_stat(&netfs_n_rh_write_begin); trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); @@ -405,6 +476,10 @@ retry: ractl._nr_pages = folio_nr_pages(folio); netfs_rreq_expand(rreq, &ractl); + /* Set up the output buffer */ + iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, + rreq->start, rreq->len); + /* We hold the folio locks, so we can drop the references */ folio_get(folio); while (readahead_folio(&ractl)) @@ -413,6 +488,7 @@ retry: ret = netfs_begin_read(rreq, true); if (ret < 0) goto error; + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); have_folio: ret = folio_wait_fscache_killable(folio); @@ -434,3 +510,124 @@ error: return ret; } EXPORT_SYMBOL(netfs_write_begin); + +/* + * Preload the data into a page we're proposing to write into. + */ +int netfs_prefetch_for_write(struct file *file, struct folio *folio, + size_t offset, size_t len) +{ + struct netfs_io_request *rreq; + struct address_space *mapping = folio->mapping; + struct netfs_inode *ctx = netfs_inode(mapping->host); + unsigned long long start = folio_pos(folio); + size_t flen = folio_size(folio); + int ret; + + _enter("%zx @%llx", flen, start); + + ret = -ENOMEM; + + rreq = netfs_alloc_request(mapping, file, start, flen, + NETFS_READ_FOR_WRITE); + if (IS_ERR(rreq)) { + ret = PTR_ERR(rreq); + goto error; + } + + rreq->no_unlock_folio = folio->index; + __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto error_put; + + netfs_stat(&netfs_n_rh_write_begin); + trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write); + + /* Set up the output buffer */ + iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, + rreq->start, rreq->len); + + ret = netfs_begin_read(rreq, true); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + return ret; + +error_put: + netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); +error: + _leave(" = %d", ret); + return ret; +} + +/** + * netfs_buffered_read_iter - Filesystem buffered I/O read routine + * @iocb: kernel I/O control block + * @iter: destination for the data read + * + * This is the ->read_iter() routine for all filesystems that can use the page + * cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be + * returned when no data can be read without waiting for I/O requests to + * complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests + * shall be made for the read or for readahead. When no data can be read, + * -EAGAIN shall be returned. When readahead would be triggered, a partial, + * possibly empty read shall be returned. + * + * Return: + * * number of bytes copied, even for partial reads + * * negative error code (or 0 if IOCB_NOIO) if nothing was read + */ +ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct netfs_inode *ictx = netfs_inode(inode); + ssize_t ret; + + if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) || + test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))) + return -EINVAL; + + ret = netfs_start_io_read(inode); + if (ret == 0) { + ret = filemap_read(iocb, iter, 0); + netfs_end_io_read(inode); + } + return ret; +} +EXPORT_SYMBOL(netfs_buffered_read_iter); + +/** + * netfs_file_read_iter - Generic filesystem read routine + * @iocb: kernel I/O control block + * @iter: destination for the data read + * + * This is the ->read_iter() routine for all filesystems that can use the page + * cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be + * returned when no data can be read without waiting for I/O requests to + * complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests + * shall be made for the read or for readahead. When no data can be read, + * -EAGAIN shall be returned. When readahead would be triggered, a partial, + * possibly empty read shall be returned. + * + * Return: + * * number of bytes copied, even for partial reads + * * negative error code (or 0 if IOCB_NOIO) if nothing was read + */ +ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host); + + if ((iocb->ki_flags & IOCB_DIRECT) || + test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)) + return netfs_unbuffered_read_iter(iocb, iter); + + return netfs_buffered_read_iter(iocb, iter); +} +EXPORT_SYMBOL(netfs_file_read_iter); diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c new file mode 100644 index 000000000000..a3059b3168fd --- /dev/null +++ b/fs/netfs/buffered_write.c @@ -0,0 +1,1254 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem high-level write support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/pagevec.h> +#include "internal.h" + +/* + * Determined write method. Adjust netfs_folio_traces if this is changed. + */ +enum netfs_how_to_modify { + NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */ + NETFS_JUST_PREFETCH, /* We have to read the folio anyway */ + NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */ + NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */ + NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */ + NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */ + NETFS_FLUSH_CONTENT, /* Flush incompatible content. */ +}; + +static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq); + +static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) +{ + if (netfs_group && !folio_get_private(folio)) + folio_attach_private(folio, netfs_get_group(netfs_group)); +} + +#if IS_ENABLED(CONFIG_FSCACHE) +static void netfs_folio_start_fscache(bool caching, struct folio *folio) +{ + if (caching) + folio_start_fscache(folio); +} +#else +static void netfs_folio_start_fscache(bool caching, struct folio *folio) +{ +} +#endif + +/* + * Decide how we should modify a folio. We might be attempting to do + * write-streaming, in which case we don't want to a local RMW cycle if we can + * avoid it. If we're doing local caching or content crypto, we award that + * priority over avoiding RMW. If the file is open readably, then we also + * assume that we may want to read what we wrote. + */ +static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, + struct file *file, + struct folio *folio, + void *netfs_group, + size_t flen, + size_t offset, + size_t len, + bool maybe_trouble) +{ + struct netfs_folio *finfo = netfs_folio_info(folio); + loff_t pos = folio_file_pos(folio); + + _enter(""); + + if (netfs_folio_group(folio) != netfs_group) + return NETFS_FLUSH_CONTENT; + + if (folio_test_uptodate(folio)) + return NETFS_FOLIO_IS_UPTODATE; + + if (pos >= ctx->zero_point) + return NETFS_MODIFY_AND_CLEAR; + + if (!maybe_trouble && offset == 0 && len >= flen) + return NETFS_WHOLE_FOLIO_MODIFY; + + if (file->f_mode & FMODE_READ) + goto no_write_streaming; + if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags)) + goto no_write_streaming; + + if (netfs_is_cache_enabled(ctx)) { + /* We don't want to get a streaming write on a file that loses + * caching service temporarily because the backing store got + * culled. + */ + if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags)) + set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags); + goto no_write_streaming; + } + + if (!finfo) + return NETFS_STREAMING_WRITE; + + /* We can continue a streaming write only if it continues on from the + * previous. If it overlaps, we must flush lest we suffer a partial + * copy and disjoint dirty regions. + */ + if (offset == finfo->dirty_offset + finfo->dirty_len) + return NETFS_STREAMING_WRITE_CONT; + return NETFS_FLUSH_CONTENT; + +no_write_streaming: + if (finfo) { + netfs_stat(&netfs_n_wh_wstream_conflict); + return NETFS_FLUSH_CONTENT; + } + return NETFS_JUST_PREFETCH; +} + +/* + * Grab a folio for writing and lock it. Attempt to allocate as large a folio + * as possible to hold as much of the remaining length as possible in one go. + */ +static struct folio *netfs_grab_folio_for_write(struct address_space *mapping, + loff_t pos, size_t part) +{ + pgoff_t index = pos / PAGE_SIZE; + fgf_t fgp_flags = FGP_WRITEBEGIN; + + if (mapping_large_folio_support(mapping)) + fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part); + + return __filemap_get_folio(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); +} + +/** + * netfs_perform_write - Copy data into the pagecache. + * @iocb: The operation parameters + * @iter: The source buffer + * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * + * Copy data into pagecache pages attached to the inode specified by @iocb. + * The caller must hold appropriate inode locks. + * + * Dirty pages are tagged with a netfs_folio struct if they're not up to date + * to indicate the range modified. Dirty pages may also be tagged with a + * netfs-specific grouping such that data from an old group gets flushed before + * a new one is started. + */ +ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + struct netfs_group *netfs_group) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + struct netfs_inode *ctx = netfs_inode(inode); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .for_sync = true, + .nr_to_write = LONG_MAX, + .range_start = iocb->ki_pos, + .range_end = iocb->ki_pos + iter->count, + }; + struct netfs_io_request *wreq = NULL; + struct netfs_folio *finfo; + struct folio *folio; + enum netfs_how_to_modify howto; + enum netfs_folio_trace trace; + unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC; + ssize_t written = 0, ret; + loff_t i_size, pos = iocb->ki_pos, from, to; + size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; + bool maybe_trouble = false; + + if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) || + iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) + ) { + if (pos < i_size_read(inode)) { + ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count); + if (ret < 0) { + goto out; + } + } + + wbc_attach_fdatawrite_inode(&wbc, mapping->host); + + wreq = netfs_begin_writethrough(iocb, iter->count); + if (IS_ERR(wreq)) { + wbc_detach_inode(&wbc); + ret = PTR_ERR(wreq); + wreq = NULL; + goto out; + } + if (!is_sync_kiocb(iocb)) + wreq->iocb = iocb; + wreq->cleanup = netfs_cleanup_buffered_write; + } + + do { + size_t flen; + size_t offset; /* Offset into pagecache folio */ + size_t part; /* Bytes to write to folio */ + size_t copied; /* Bytes copied from user */ + + ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); + if (unlikely(ret < 0)) + break; + + offset = pos & (max_chunk - 1); + part = min(max_chunk - offset, iov_iter_count(iter)); + + /* Bring in the user pages that we will copy from _first_ lest + * we hit a nasty deadlock on copying from the same page as + * we're writing to, without it being marked uptodate. + * + * Not only is this an optimisation, but it is also required to + * check that the address is actually valid, when atomic + * usercopies are used below. + * + * We rely on the page being held onto long enough by the LRU + * that we can grab it below if this causes it to be read. + */ + ret = -EFAULT; + if (unlikely(fault_in_iov_iter_readable(iter, part) == part)) + break; + + folio = netfs_grab_folio_for_write(mapping, pos, part); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + break; + } + + flen = folio_size(folio); + offset = pos & (flen - 1); + part = min_t(size_t, flen - offset, part); + + if (signal_pending(current)) { + ret = written ? -EINTR : -ERESTARTSYS; + goto error_folio_unlock; + } + + /* See if we need to prefetch the area we're going to modify. + * We need to do this before we get a lock on the folio in case + * there's more than one writer competing for the same cache + * block. + */ + howto = netfs_how_to_modify(ctx, file, folio, netfs_group, + flen, offset, part, maybe_trouble); + _debug("howto %u", howto); + switch (howto) { + case NETFS_JUST_PREFETCH: + ret = netfs_prefetch_for_write(file, folio, offset, part); + if (ret < 0) { + _debug("prefetch = %zd", ret); + goto error_folio_unlock; + } + break; + case NETFS_FOLIO_IS_UPTODATE: + case NETFS_WHOLE_FOLIO_MODIFY: + case NETFS_STREAMING_WRITE_CONT: + break; + case NETFS_MODIFY_AND_CLEAR: + zero_user_segment(&folio->page, 0, offset); + break; + case NETFS_STREAMING_WRITE: + ret = -EIO; + if (WARN_ON(folio_get_private(folio))) + goto error_folio_unlock; + break; + case NETFS_FLUSH_CONTENT: + trace_netfs_folio(folio, netfs_flush_content); + from = folio_pos(folio); + to = from + folio_size(folio) - 1; + folio_unlock(folio); + folio_put(folio); + ret = filemap_write_and_wait_range(mapping, from, to); + if (ret < 0) + goto error_folio_unlock; + continue; + } + + if (mapping_writably_mapped(mapping)) + flush_dcache_folio(folio); + + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + + flush_dcache_folio(folio); + + /* Deal with a (partially) failed copy */ + if (copied == 0) { + ret = -EFAULT; + goto error_folio_unlock; + } + + trace = (enum netfs_folio_trace)howto; + switch (howto) { + case NETFS_FOLIO_IS_UPTODATE: + case NETFS_JUST_PREFETCH: + netfs_set_group(folio, netfs_group); + break; + case NETFS_MODIFY_AND_CLEAR: + zero_user_segment(&folio->page, offset + copied, flen); + netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + break; + case NETFS_WHOLE_FOLIO_MODIFY: + if (unlikely(copied < part)) { + maybe_trouble = true; + iov_iter_revert(iter, copied); + copied = 0; + goto retry; + } + netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + break; + case NETFS_STREAMING_WRITE: + if (offset == 0 && copied == flen) { + netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + trace = netfs_streaming_filled_page; + break; + } + finfo = kzalloc(sizeof(*finfo), GFP_KERNEL); + if (!finfo) { + iov_iter_revert(iter, copied); + ret = -ENOMEM; + goto error_folio_unlock; + } + finfo->netfs_group = netfs_get_group(netfs_group); + finfo->dirty_offset = offset; + finfo->dirty_len = copied; + folio_attach_private(folio, (void *)((unsigned long)finfo | + NETFS_FOLIO_INFO)); + break; + case NETFS_STREAMING_WRITE_CONT: + finfo = netfs_folio_info(folio); + finfo->dirty_len += copied; + if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) { + if (finfo->netfs_group) + folio_change_private(folio, finfo->netfs_group); + else + folio_detach_private(folio); + folio_mark_uptodate(folio); + kfree(finfo); + trace = netfs_streaming_cont_filled_page; + } + break; + default: + WARN(true, "Unexpected modify type %u ix=%lx\n", + howto, folio->index); + ret = -EIO; + goto error_folio_unlock; + } + + trace_netfs_folio(folio, trace); + + /* Update the inode size if we moved the EOF marker */ + i_size = i_size_read(inode); + pos += copied; + if (pos > i_size) { + if (ctx->ops->update_i_size) { + ctx->ops->update_i_size(inode, pos); + } else { + i_size_write(inode, pos); +#if IS_ENABLED(CONFIG_FSCACHE) + fscache_update_cookie(ctx->cache, NULL, &pos); +#endif + } + } + written += copied; + + if (likely(!wreq)) { + folio_mark_dirty(folio); + } else { + if (folio_test_dirty(folio)) + /* Sigh. mmap. */ + folio_clear_dirty_for_io(folio); + /* We make multiple writes to the folio... */ + if (!folio_test_writeback(folio)) { + folio_wait_fscache(folio); + folio_start_writeback(folio); + folio_start_fscache(folio); + if (wreq->iter.count == 0) + trace_netfs_folio(folio, netfs_folio_trace_wthru); + else + trace_netfs_folio(folio, netfs_folio_trace_wthru_plus); + } + netfs_advance_writethrough(wreq, copied, + offset + copied == flen); + } + retry: + folio_unlock(folio); + folio_put(folio); + folio = NULL; + + cond_resched(); + } while (iov_iter_count(iter)); + +out: + if (unlikely(wreq)) { + ret = netfs_end_writethrough(wreq, iocb); + wbc_detach_inode(&wbc); + if (ret == -EIOCBQUEUED) + return ret; + } + + iocb->ki_pos += written; + _leave(" = %zd [%zd]", written, ret); + return written ? written : ret; + +error_folio_unlock: + folio_unlock(folio); + folio_put(folio); + goto out; +} +EXPORT_SYMBOL(netfs_perform_write); + +/** + * netfs_buffered_write_iter_locked - write data to a file + * @iocb: IO state structure (file, offset, etc.) + * @from: iov_iter with data to write + * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * + * This function does all the work needed for actually writing data to a + * file. It does all basic checks, removes SUID from the file, updates + * modification times and calls proper subroutines depending on whether we + * do direct IO or a standard buffered write. + * + * The caller must hold appropriate locks around this function and have called + * generic_write_checks() already. The caller is also responsible for doing + * any necessary syncing afterwards. + * + * This function does *not* take care of syncing data in case of O_SYNC write. + * A caller has to handle it. This is mainly due to the fact that we want to + * avoid syncing under i_rwsem. + * + * Return: + * * number of bytes written, even for truncated writes + * * negative error code if no data has been written at all + */ +ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from, + struct netfs_group *netfs_group) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + trace_netfs_write_iter(iocb, from); + + ret = file_remove_privs(file); + if (ret) + return ret; + + ret = file_update_time(file); + if (ret) + return ret; + + return netfs_perform_write(iocb, from, netfs_group); +} +EXPORT_SYMBOL(netfs_buffered_write_iter_locked); + +/** + * netfs_file_write_iter - write data to a file + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * Perform a write to a file, writing into the pagecache if possible and doing + * an unbuffered write instead if not. + * + * Return: + * * Negative error code if no data has been written at all of + * vfs_fsync_range() failed for a synchronous write + * * Number of bytes written, even for truncated writes + */ +ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct netfs_inode *ictx = netfs_inode(inode); + ssize_t ret; + + _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + + if ((iocb->ki_flags & IOCB_DIRECT) || + test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)) + return netfs_unbuffered_write_iter(iocb, from); + + ret = netfs_start_io_write(inode); + if (ret < 0) + return ret; + + ret = generic_write_checks(iocb, from); + if (ret > 0) + ret = netfs_buffered_write_iter_locked(iocb, from, NULL); + netfs_end_io_write(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} +EXPORT_SYMBOL(netfs_file_write_iter); + +/* + * Notification that a previously read-only page is about to become writable. + * Note that the caller indicates a single page of a multipage folio. + */ +vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group) +{ + struct folio *folio = page_folio(vmf->page); + struct file *file = vmf->vma->vm_file; + struct inode *inode = file_inode(file); + vm_fault_t ret = VM_FAULT_RETRY; + int err; + + _enter("%lx", folio->index); + + sb_start_pagefault(inode->i_sb); + + if (folio_wait_writeback_killable(folio)) + goto out; + + if (folio_lock_killable(folio) < 0) + goto out; + + /* Can we see a streaming write here? */ + if (WARN_ON(!folio_test_uptodate(folio))) { + ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED; + goto out; + } + + if (netfs_folio_group(folio) != netfs_group) { + folio_unlock(folio); + err = filemap_fdatawait_range(inode->i_mapping, + folio_pos(folio), + folio_pos(folio) + folio_size(folio)); + switch (err) { + case 0: + ret = VM_FAULT_RETRY; + goto out; + case -ENOMEM: + ret = VM_FAULT_OOM; + goto out; + default: + ret = VM_FAULT_SIGBUS; + goto out; + } + } + + if (folio_test_dirty(folio)) + trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus); + else + trace_netfs_folio(folio, netfs_folio_trace_mkwrite); + netfs_set_group(folio, netfs_group); + file_update_time(file); + ret = VM_FAULT_LOCKED; +out: + sb_end_pagefault(inode->i_sb); + return ret; +} +EXPORT_SYMBOL(netfs_page_mkwrite); + +/* + * Kill all the pages in the given range + */ +static void netfs_kill_pages(struct address_space *mapping, + loff_t start, loff_t len) +{ + struct folio *folio; + pgoff_t index = start / PAGE_SIZE; + pgoff_t last = (start + len - 1) / PAGE_SIZE, next; + + _enter("%llx-%llx", start, start + len - 1); + + do { + _debug("kill %lx (to %lx)", index, last); + + folio = filemap_get_folio(mapping, index); + if (IS_ERR(folio)) { + next = index + 1; + continue; + } + + next = folio_next_index(folio); + + trace_netfs_folio(folio, netfs_folio_trace_kill); + folio_clear_uptodate(folio); + if (folio_test_fscache(folio)) + folio_end_fscache(folio); + folio_end_writeback(folio); + folio_lock(folio); + generic_error_remove_folio(mapping, folio); + folio_unlock(folio); + folio_put(folio); + + } while (index = next, index <= last); + + _leave(""); +} + +/* + * Redirty all the pages in a given range. + */ +static void netfs_redirty_pages(struct address_space *mapping, + loff_t start, loff_t len) +{ + struct folio *folio; + pgoff_t index = start / PAGE_SIZE; + pgoff_t last = (start + len - 1) / PAGE_SIZE, next; + + _enter("%llx-%llx", start, start + len - 1); + + do { + _debug("redirty %llx @%llx", len, start); + + folio = filemap_get_folio(mapping, index); + if (IS_ERR(folio)) { + next = index + 1; + continue; + } + + next = folio_next_index(folio); + trace_netfs_folio(folio, netfs_folio_trace_redirty); + filemap_dirty_folio(mapping, folio); + if (folio_test_fscache(folio)) + folio_end_fscache(folio); + folio_end_writeback(folio); + folio_put(folio); + } while (index = next, index <= last); + + balance_dirty_pages_ratelimited(mapping); + + _leave(""); +} + +/* + * Completion of write to server + */ +static void netfs_pages_written_back(struct netfs_io_request *wreq) +{ + struct address_space *mapping = wreq->mapping; + struct netfs_folio *finfo; + struct netfs_group *group = NULL; + struct folio *folio; + pgoff_t last; + int gcount = 0; + + XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE); + + _enter("%llx-%llx", wreq->start, wreq->start + wreq->len); + + rcu_read_lock(); + + last = (wreq->start + wreq->len - 1) / PAGE_SIZE; + xas_for_each(&xas, folio, last) { + WARN(!folio_test_writeback(folio), + "bad %zx @%llx page %lx %lx\n", + wreq->len, wreq->start, folio->index, last); + + if ((finfo = netfs_folio_info(folio))) { + /* Streaming writes cannot be redirtied whilst under + * writeback, so discard the streaming record. + */ + folio_detach_private(folio); + group = finfo->netfs_group; + gcount++; + trace_netfs_folio(folio, netfs_folio_trace_clear_s); + kfree(finfo); + } else if ((group = netfs_folio_group(folio))) { + /* Need to detach the group pointer if the page didn't + * get redirtied. If it has been redirtied, then it + * must be within the same group. + */ + if (folio_test_dirty(folio)) { + trace_netfs_folio(folio, netfs_folio_trace_redirtied); + goto end_wb; + } + if (folio_trylock(folio)) { + if (!folio_test_dirty(folio)) { + folio_detach_private(folio); + gcount++; + trace_netfs_folio(folio, netfs_folio_trace_clear_g); + } else { + trace_netfs_folio(folio, netfs_folio_trace_redirtied); + } + folio_unlock(folio); + goto end_wb; + } + + xas_pause(&xas); + rcu_read_unlock(); + folio_lock(folio); + if (!folio_test_dirty(folio)) { + folio_detach_private(folio); + gcount++; + trace_netfs_folio(folio, netfs_folio_trace_clear_g); + } else { + trace_netfs_folio(folio, netfs_folio_trace_redirtied); + } + folio_unlock(folio); + rcu_read_lock(); + } else { + trace_netfs_folio(folio, netfs_folio_trace_clear); + } + end_wb: + if (folio_test_fscache(folio)) + folio_end_fscache(folio); + xas_advance(&xas, folio_next_index(folio) - 1); + folio_end_writeback(folio); + } + + rcu_read_unlock(); + netfs_put_group_many(group, gcount); + _leave(""); +} + +/* + * Deal with the disposition of the folios that are under writeback to close + * out the operation. + */ +static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq) +{ + struct address_space *mapping = wreq->mapping; + + _enter(""); + + switch (wreq->error) { + case 0: + netfs_pages_written_back(wreq); + break; + + default: + pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error); + fallthrough; + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + case -ENETRESET: + case -EDQUOT: + case -ENOSPC: + netfs_redirty_pages(mapping, wreq->start, wreq->len); + break; + + case -EROFS: + case -EIO: + case -EREMOTEIO: + case -EFBIG: + case -ENOENT: + case -ENOMEDIUM: + case -ENXIO: + netfs_kill_pages(mapping, wreq->start, wreq->len); + break; + } + + if (wreq->error) + mapping_set_error(mapping, wreq->error); + if (wreq->netfs_ops->done) + wreq->netfs_ops->done(wreq); +} + +/* + * Extend the region to be written back to include subsequent contiguously + * dirty pages if possible, but don't sleep while doing so. + * + * If this page holds new content, then we can include filler zeros in the + * writeback. + */ +static void netfs_extend_writeback(struct address_space *mapping, + struct netfs_group *group, + struct xa_state *xas, + long *_count, + loff_t start, + loff_t max_len, + bool caching, + size_t *_len, + size_t *_top) +{ + struct netfs_folio *finfo; + struct folio_batch fbatch; + struct folio *folio; + unsigned int i; + pgoff_t index = (start + *_len) / PAGE_SIZE; + size_t len; + void *priv; + bool stop = true; + + folio_batch_init(&fbatch); + + do { + /* Firstly, we gather up a batch of contiguous dirty pages + * under the RCU read lock - but we can't clear the dirty flags + * there if any of those pages are mapped. + */ + rcu_read_lock(); + + xas_for_each(xas, folio, ULONG_MAX) { + stop = true; + if (xas_retry(xas, folio)) + continue; + if (xa_is_value(folio)) + break; + if (folio->index != index) { + xas_reset(xas); + break; + } + + if (!folio_try_get_rcu(folio)) { + xas_reset(xas); + continue; + } + + /* Has the folio moved or been split? */ + if (unlikely(folio != xas_reload(xas))) { + folio_put(folio); + xas_reset(xas); + break; + } + + if (!folio_trylock(folio)) { + folio_put(folio); + xas_reset(xas); + break; + } + if (!folio_test_dirty(folio) || + folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + folio_put(folio); + xas_reset(xas); + break; + } + + stop = false; + len = folio_size(folio); + priv = folio_get_private(folio); + if ((const struct netfs_group *)priv != group) { + stop = true; + finfo = netfs_folio_info(folio); + if (finfo->netfs_group != group || + finfo->dirty_offset > 0) { + folio_unlock(folio); + folio_put(folio); + xas_reset(xas); + break; + } + len = finfo->dirty_len; + } + + *_top += folio_size(folio); + index += folio_nr_pages(folio); + *_count -= folio_nr_pages(folio); + *_len += len; + if (*_len >= max_len || *_count <= 0) + stop = true; + + if (!folio_batch_add(&fbatch, folio)) + break; + if (stop) + break; + } + + xas_pause(xas); + rcu_read_unlock(); + + /* Now, if we obtained any folios, we can shift them to being + * writable and mark them for caching. + */ + if (!folio_batch_count(&fbatch)) + break; + + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; + trace_netfs_folio(folio, netfs_folio_trace_store_plus); + + if (!folio_clear_dirty_for_io(folio)) + BUG(); + folio_start_writeback(folio); + netfs_folio_start_fscache(caching, folio); + folio_unlock(folio); + } + + folio_batch_release(&fbatch); + cond_resched(); + } while (!stop); +} + +/* + * Synchronously write back the locked page and any subsequent non-locked dirty + * pages. + */ +static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping, + struct writeback_control *wbc, + struct netfs_group *group, + struct xa_state *xas, + struct folio *folio, + unsigned long long start, + unsigned long long end) +{ + struct netfs_io_request *wreq; + struct netfs_folio *finfo; + struct netfs_inode *ctx = netfs_inode(mapping->host); + unsigned long long i_size = i_size_read(&ctx->inode); + size_t len, max_len; + bool caching = netfs_is_cache_enabled(ctx); + long count = wbc->nr_to_write; + int ret; + + _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching); + + wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio), + NETFS_WRITEBACK); + if (IS_ERR(wreq)) { + folio_unlock(folio); + return PTR_ERR(wreq); + } + + if (!folio_clear_dirty_for_io(folio)) + BUG(); + folio_start_writeback(folio); + netfs_folio_start_fscache(caching, folio); + + count -= folio_nr_pages(folio); + + /* Find all consecutive lockable dirty pages that have contiguous + * written regions, stopping when we find a page that is not + * immediately lockable, is not dirty or is missing, or we reach the + * end of the range. + */ + trace_netfs_folio(folio, netfs_folio_trace_store); + + len = wreq->len; + finfo = netfs_folio_info(folio); + if (finfo) { + start += finfo->dirty_offset; + if (finfo->dirty_offset + finfo->dirty_len != len) { + len = finfo->dirty_len; + goto cant_expand; + } + len = finfo->dirty_len; + } + + if (start < i_size) { + /* Trim the write to the EOF; the extra data is ignored. Also + * put an upper limit on the size of a single storedata op. + */ + max_len = 65536 * 4096; + max_len = min_t(unsigned long long, max_len, end - start + 1); + max_len = min_t(unsigned long long, max_len, i_size - start); + + if (len < max_len) + netfs_extend_writeback(mapping, group, xas, &count, start, + max_len, caching, &len, &wreq->upper_len); + } + +cant_expand: + len = min_t(unsigned long long, len, i_size - start); + + /* We now have a contiguous set of dirty pages, each with writeback + * set; the first page is still locked at this point, but all the rest + * have been unlocked. + */ + folio_unlock(folio); + wreq->start = start; + wreq->len = len; + + if (start < i_size) { + _debug("write back %zx @%llx [%llx]", len, start, i_size); + + /* Speculatively write to the cache. We have to fix this up + * later if the store fails. + */ + wreq->cleanup = netfs_cleanup_buffered_write; + + iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start, + wreq->upper_len); + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback); + if (ret == 0 || ret == -EIOCBQUEUED) + wbc->nr_to_write -= len / PAGE_SIZE; + } else { + _debug("write discard %zx @%llx [%llx]", len, start, i_size); + + /* The dirty region was entirely beyond the EOF. */ + fscache_clear_page_bits(mapping, start, len, caching); + netfs_pages_written_back(wreq); + ret = 0; + } + + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + _leave(" = 1"); + return 1; +} + +/* + * Write a region of pages back to the server + */ +static ssize_t netfs_writepages_begin(struct address_space *mapping, + struct writeback_control *wbc, + struct netfs_group *group, + struct xa_state *xas, + unsigned long long *_start, + unsigned long long end) +{ + const struct netfs_folio *finfo; + struct folio *folio; + unsigned long long start = *_start; + ssize_t ret; + void *priv; + int skips = 0; + + _enter("%llx,%llx,", start, end); + +search_again: + /* Find the first dirty page in the group. */ + rcu_read_lock(); + + for (;;) { + folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY); + if (xas_retry(xas, folio) || xa_is_value(folio)) + continue; + if (!folio) + break; + + if (!folio_try_get_rcu(folio)) { + xas_reset(xas); + continue; + } + + if (unlikely(folio != xas_reload(xas))) { + folio_put(folio); + xas_reset(xas); + continue; + } + + /* Skip any dirty folio that's not in the group of interest. */ + priv = folio_get_private(folio); + if ((const struct netfs_group *)priv != group) { + finfo = netfs_folio_info(folio); + if (finfo->netfs_group != group) { + folio_put(folio); + continue; + } + } + + xas_pause(xas); + break; + } + rcu_read_unlock(); + if (!folio) + return 0; + + start = folio_pos(folio); /* May regress with THPs */ + + _debug("wback %lx", folio->index); + + /* At this point we hold neither the i_pages lock nor the page lock: + * the page may be truncated or invalidated (changing page->mapping to + * NULL), or even swizzled back from swapper_space to tmpfs file + * mapping + */ +lock_again: + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = folio_lock_killable(folio); + if (ret < 0) + return ret; + } else { + if (!folio_trylock(folio)) + goto search_again; + } + + if (folio->mapping != mapping || + !folio_test_dirty(folio)) { + start += folio_size(folio); + folio_unlock(folio); + goto search_again; + } + + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + if (wbc->sync_mode != WB_SYNC_NONE) { + folio_wait_writeback(folio); +#ifdef CONFIG_FSCACHE + folio_wait_fscache(folio); +#endif + goto lock_again; + } + + start += folio_size(folio); + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) { + ret = 0; + goto out; + } + skips++; + } + goto search_again; + } + + ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas, + folio, start, end); +out: + if (ret > 0) + *_start = start + ret; + _leave(" = %zd [%llx]", ret, *_start); + return ret; +} + +/* + * Write a region of pages back to the server + */ +static int netfs_writepages_region(struct address_space *mapping, + struct writeback_control *wbc, + struct netfs_group *group, + unsigned long long *_start, + unsigned long long end) +{ + ssize_t ret; + + XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE); + + do { + ret = netfs_writepages_begin(mapping, wbc, group, &xas, + _start, end); + if (ret > 0 && wbc->nr_to_write > 0) + cond_resched(); + } while (ret > 0 && wbc->nr_to_write > 0); + + return ret > 0 ? 0 : ret; +} + +/* + * write some of the pending data back to the server + */ +int netfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct netfs_group *group = NULL; + loff_t start, end; + int ret; + + _enter(""); + + /* We have to be careful as we can end up racing with setattr() + * truncating the pagecache since the caller doesn't take a lock here + * to prevent it. + */ + + if (wbc->range_cyclic && mapping->writeback_index) { + start = mapping->writeback_index * PAGE_SIZE; + ret = netfs_writepages_region(mapping, wbc, group, + &start, LLONG_MAX); + if (ret < 0) + goto out; + + if (wbc->nr_to_write <= 0) { + mapping->writeback_index = start / PAGE_SIZE; + goto out; + } + + start = 0; + end = mapping->writeback_index * PAGE_SIZE; + mapping->writeback_index = 0; + ret = netfs_writepages_region(mapping, wbc, group, &start, end); + if (ret == 0) + mapping->writeback_index = start / PAGE_SIZE; + } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { + start = 0; + ret = netfs_writepages_region(mapping, wbc, group, + &start, LLONG_MAX); + if (wbc->nr_to_write > 0 && ret == 0) + mapping->writeback_index = start / PAGE_SIZE; + } else { + start = wbc->range_start; + ret = netfs_writepages_region(mapping, wbc, group, + &start, wbc->range_end); + } + +out: + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(netfs_writepages); + +/* + * Deal with the disposition of a laundered folio. + */ +static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq) +{ + if (wreq->error) { + pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error); + mapping_set_error(wreq->mapping, wreq->error); + } +} + +/** + * netfs_launder_folio - Clean up a dirty folio that's being invalidated + * @folio: The folio to clean + * + * This is called to write back a folio that's being invalidated when an inode + * is getting torn down. Ideally, writepages would be used instead. + */ +int netfs_launder_folio(struct folio *folio) +{ + struct netfs_io_request *wreq; + struct address_space *mapping = folio->mapping; + struct netfs_folio *finfo = netfs_folio_info(folio); + struct netfs_group *group = netfs_folio_group(folio); + struct bio_vec bvec; + unsigned long long i_size = i_size_read(mapping->host); + unsigned long long start = folio_pos(folio); + size_t offset = 0, len; + int ret = 0; + + if (finfo) { + offset = finfo->dirty_offset; + start += offset; + len = finfo->dirty_len; + } else { + len = folio_size(folio); + } + len = min_t(unsigned long long, len, i_size - start); + + wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE); + if (IS_ERR(wreq)) { + ret = PTR_ERR(wreq); + goto out; + } + + if (!folio_clear_dirty_for_io(folio)) + goto out_put; + + trace_netfs_folio(folio, netfs_folio_trace_launder); + + _debug("launder %llx-%llx", start, start + len - 1); + + /* Speculatively write to the cache. We have to fix this up later if + * the store fails. + */ + wreq->cleanup = netfs_cleanup_launder_folio; + + bvec_set_folio(&bvec, folio, len, offset); + iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len); + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + ret = netfs_begin_write(wreq, true, netfs_write_trace_launder); + +out_put: + folio_detach_private(folio); + netfs_put_group(group); + kfree(finfo); + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); +out: + folio_wait_fscache(folio); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(netfs_launder_folio); diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c new file mode 100644 index 000000000000..ad4370b3935d --- /dev/null +++ b/fs/netfs/direct_read.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Direct I/O support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/uio.h> +#include <linux/sched/mm.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/netfs.h> +#include "internal.h" + +/** + * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read + * @iocb: The I/O control descriptor describing the read + * @iter: The output buffer (also specifies read length) + * + * Perform an unbuffered I/O or direct I/O from the file in @iocb to the + * output buffer. No use is made of the pagecache. + * + * The caller must hold any appropriate locks. + */ +static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter) +{ + struct netfs_io_request *rreq; + ssize_t ret; + size_t orig_count = iov_iter_count(iter); + bool async = !is_sync_kiocb(iocb); + + _enter(""); + + if (!orig_count) + return 0; /* Don't update atime */ + + ret = kiocb_write_and_wait(iocb, orig_count); + if (ret < 0) + return ret; + file_accessed(iocb->ki_filp); + + rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, + iocb->ki_pos, orig_count, + NETFS_DIO_READ); + if (IS_ERR(rreq)) + return PTR_ERR(rreq); + + netfs_stat(&netfs_n_rh_dio_read); + trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read); + + /* If this is an async op, we have to keep track of the destination + * buffer for ourselves as the caller's iterator will be trashed when + * we return. + * + * In such a case, extract an iterator to represent as much of the the + * output buffer as we can manage. Note that the extraction might not + * be able to allocate a sufficiently large bvec array and may shorten + * the request. + */ + if (user_backed_iter(iter)) { + ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0); + if (ret < 0) + goto out; + rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec; + rreq->direct_bv_count = ret; + rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); + rreq->len = iov_iter_count(&rreq->iter); + } else { + rreq->iter = *iter; + rreq->len = orig_count; + rreq->direct_bv_unpin = false; + iov_iter_advance(iter, orig_count); + } + + // TODO: Set up bounce buffer if needed + + if (async) + rreq->iocb = iocb; + + ret = netfs_begin_read(rreq, is_sync_kiocb(iocb)); + if (ret < 0) + goto out; /* May be -EIOCBQUEUED */ + if (!async) { + // TODO: Copy from bounce buffer + iocb->ki_pos += rreq->transferred; + ret = rreq->transferred; + } + +out: + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + if (ret > 0) + orig_count -= ret; + if (ret != -EIOCBQUEUED) + iov_iter_revert(iter, orig_count - iov_iter_count(iter)); + return ret; +} + +/** + * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read + * @iocb: The I/O control descriptor describing the read + * @iter: The output buffer (also specifies read length) + * + * Perform an unbuffered I/O or direct I/O from the file in @iocb to the + * output buffer. No use is made of the pagecache. + */ +ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (!iter->count) + return 0; /* Don't update atime */ + + ret = netfs_start_io_direct(inode); + if (ret == 0) { + ret = netfs_unbuffered_read_iter_locked(iocb, iter); + netfs_end_io_direct(inode); + } + return ret; +} +EXPORT_SYMBOL(netfs_unbuffered_read_iter); diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c new file mode 100644 index 000000000000..60a40d293c87 --- /dev/null +++ b/fs/netfs/direct_write.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Unbuffered and direct write support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/uio.h> +#include "internal.h" + +static void netfs_cleanup_dio_write(struct netfs_io_request *wreq) +{ + struct inode *inode = wreq->inode; + unsigned long long end = wreq->start + wreq->len; + + if (!wreq->error && + i_size_read(inode) < end) { + if (wreq->netfs_ops->update_i_size) + wreq->netfs_ops->update_i_size(inode, end); + else + i_size_write(inode, end); + } +} + +/* + * Perform an unbuffered write where we may have to do an RMW operation on an + * encrypted file. This can also be used for direct I/O writes. + */ +static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, + struct netfs_group *netfs_group) +{ + struct netfs_io_request *wreq; + unsigned long long start = iocb->ki_pos; + unsigned long long end = start + iov_iter_count(iter); + ssize_t ret, n; + bool async = !is_sync_kiocb(iocb); + + _enter(""); + + /* We're going to need a bounce buffer if what we transmit is going to + * be different in some way to the source buffer, e.g. because it gets + * encrypted/compressed or because it needs expanding to a block size. + */ + // TODO + + _debug("uw %llx-%llx", start, end); + + wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, + start, end - start, + iocb->ki_flags & IOCB_DIRECT ? + NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE); + if (IS_ERR(wreq)) + return PTR_ERR(wreq); + + { + /* If this is an async op and we're not using a bounce buffer, + * we have to save the source buffer as the iterator is only + * good until we return. In such a case, extract an iterator + * to represent as much of the the output buffer as we can + * manage. Note that the extraction might not be able to + * allocate a sufficiently large bvec array and may shorten the + * request. + */ + if (async || user_backed_iter(iter)) { + n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0); + if (n < 0) { + ret = n; + goto out; + } + wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec; + wreq->direct_bv_count = n; + wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); + wreq->len = iov_iter_count(&wreq->iter); + } else { + wreq->iter = *iter; + } + + wreq->io_iter = wreq->iter; + } + + /* Copy the data into the bounce buffer and encrypt it. */ + // TODO + + /* Dispatch the write. */ + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + if (async) + wreq->iocb = iocb; + wreq->cleanup = netfs_cleanup_dio_write; + ret = netfs_begin_write(wreq, is_sync_kiocb(iocb), + iocb->ki_flags & IOCB_DIRECT ? + netfs_write_trace_dio_write : + netfs_write_trace_unbuffered_write); + if (ret < 0) { + _debug("begin = %zd", ret); + goto out; + } + + if (!async) { + trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + + ret = wreq->error; + _debug("waited = %zd", ret); + if (ret == 0) { + ret = wreq->transferred; + iocb->ki_pos += ret; + } + } else { + ret = -EIOCBQUEUED; + } + +out: + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + return ret; +} + +/** + * netfs_unbuffered_write_iter - Unbuffered write to a file + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * Do an unbuffered write to a file, writing the data directly to the server + * and not lodging the data in the pagecache. + * + * Return: + * * Negative error code if no data has been written at all of + * vfs_fsync_range() failed for a synchronous write + * * Number of bytes written, even for truncated writes + */ +ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct netfs_inode *ictx = netfs_inode(inode); + unsigned long long end; + ssize_t ret; + + _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + + trace_netfs_write_iter(iocb, from); + netfs_stat(&netfs_n_rh_dio_write); + + ret = netfs_start_io_direct(inode); + if (ret < 0) + return ret; + ret = generic_write_checks(iocb, from); + if (ret < 0) + goto out; + ret = file_remove_privs(file); + if (ret < 0) + goto out; + ret = file_update_time(file); + if (ret < 0) + goto out; + ret = kiocb_invalidate_pages(iocb, iov_iter_count(from)); + if (ret < 0) + goto out; + end = iocb->ki_pos + iov_iter_count(from); + if (end > ictx->zero_point) + ictx->zero_point = end; + + fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode), + FSCACHE_INVAL_DIO_WRITE); + ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL); +out: + netfs_end_io_direct(inode); + return ret; +} +EXPORT_SYMBOL(netfs_unbuffered_write_iter); diff --git a/fs/fscache/cache.c b/fs/netfs/fscache_cache.c index d645f8b302a2..9397ed39b0b4 100644 --- a/fs/fscache/cache.c +++ b/fs/netfs/fscache_cache.c @@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache); void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where) { - unsigned int debug_id = cache->debug_id; + unsigned int debug_id; bool zero; int ref; if (IS_ERR_OR_NULL(cache)) return; + debug_id = cache->debug_id; zero = __refcount_dec_and_test(&cache->ref, &ref); trace_fscache_cache(debug_id, ref - 1, where); diff --git a/fs/fscache/cookie.c b/fs/netfs/fscache_cookie.c index bce2492186d0..bce2492186d0 100644 --- a/fs/fscache/cookie.c +++ b/fs/netfs/fscache_cookie.c diff --git a/fs/netfs/fscache_internal.h b/fs/netfs/fscache_internal.h new file mode 100644 index 000000000000..a09b948fcef2 --- /dev/null +++ b/fs/netfs/fscache_internal.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Internal definitions for FS-Cache + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include "internal.h" + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "FS-Cache: " fmt diff --git a/fs/fscache/io.c b/fs/netfs/fscache_io.c index 0d2b8dec8f82..ad572f7ee897 100644 --- a/fs/fscache/io.c +++ b/fs/netfs/fscache_io.c @@ -158,46 +158,6 @@ int __fscache_begin_write_operation(struct netfs_cache_resources *cres, } EXPORT_SYMBOL(__fscache_begin_write_operation); -/** - * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback - * @mapping: The mapping the folio belongs to. - * @folio: The folio being dirtied. - * @cookie: The cookie referring to the cache object - * - * Set the dirty flag on a folio and pin an in-use cache object in memory - * so that writeback can later write to it. This is intended - * to be called from the filesystem's ->dirty_folio() method. - * - * Return: true if the dirty flag was set on the folio, false otherwise. - */ -bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio, - struct fscache_cookie *cookie) -{ - struct inode *inode = mapping->host; - bool need_use = false; - - _enter(""); - - if (!filemap_dirty_folio(mapping, folio)) - return false; - if (!fscache_cookie_valid(cookie)) - return true; - - if (!(inode->i_state & I_PINNING_FSCACHE_WB)) { - spin_lock(&inode->i_lock); - if (!(inode->i_state & I_PINNING_FSCACHE_WB)) { - inode->i_state |= I_PINNING_FSCACHE_WB; - need_use = true; - } - spin_unlock(&inode->i_lock); - - if (need_use) - fscache_use_cookie(cookie, true); - } - return true; -} -EXPORT_SYMBOL(fscache_dirty_folio); - struct fscache_write_request { struct netfs_cache_resources cache_resources; struct address_space *mapping; @@ -277,7 +237,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, fscache_access_io_write) < 0) goto abandon_free; - ret = cres->ops->prepare_write(cres, &start, &len, i_size, false); + ret = cres->ops->prepare_write(cres, &start, &len, len, i_size, false); if (ret < 0) goto abandon_end; diff --git a/fs/fscache/main.c b/fs/netfs/fscache_main.c index dad85fd84f6f..42e98bb523e3 100644 --- a/fs/fscache/main.c +++ b/fs/netfs/fscache_main.c @@ -8,18 +8,9 @@ #define FSCACHE_DEBUG_LEVEL CACHE #include <linux/module.h> #include <linux/init.h> -#define CREATE_TRACE_POINTS #include "internal.h" - -MODULE_DESCRIPTION("FS Cache Manager"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - -unsigned fscache_debug; -module_param_named(debug, fscache_debug, uint, - S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(fscache_debug, - "FS-Cache debugging mask"); +#define CREATE_TRACE_POINTS +#include <trace/events/fscache.h> EXPORT_TRACEPOINT_SYMBOL(fscache_access_cache); EXPORT_TRACEPOINT_SYMBOL(fscache_access_volume); @@ -71,7 +62,7 @@ unsigned int fscache_hash(unsigned int salt, const void *data, size_t len) /* * initialise the fs caching module */ -static int __init fscache_init(void) +int __init fscache_init(void) { int ret = -ENOMEM; @@ -92,7 +83,7 @@ static int __init fscache_init(void) goto error_cookie_jar; } - pr_notice("Loaded\n"); + pr_notice("FS-Cache loaded\n"); return 0; error_cookie_jar: @@ -103,19 +94,15 @@ error_wq: return ret; } -fs_initcall(fscache_init); - /* * clean up on module removal */ -static void __exit fscache_exit(void) +void __exit fscache_exit(void) { _enter(""); kmem_cache_destroy(fscache_cookie_jar); fscache_proc_cleanup(); destroy_workqueue(fscache_wq); - pr_notice("Unloaded\n"); + pr_notice("FS-Cache unloaded\n"); } - -module_exit(fscache_exit); diff --git a/fs/fscache/proc.c b/fs/netfs/fscache_proc.c index dc3b0e9c8cce..874d951bc390 100644 --- a/fs/fscache/proc.c +++ b/fs/netfs/fscache_proc.c @@ -12,41 +12,34 @@ #include "internal.h" /* - * initialise the /proc/fs/fscache/ directory + * Add files to /proc/fs/netfs/. */ int __init fscache_proc_init(void) { - if (!proc_mkdir("fs/fscache", NULL)) - goto error_dir; + if (!proc_symlink("fs/fscache", NULL, "netfs")) + goto error_sym; - if (!proc_create_seq("fs/fscache/caches", S_IFREG | 0444, NULL, + if (!proc_create_seq("fs/netfs/caches", S_IFREG | 0444, NULL, &fscache_caches_seq_ops)) goto error; - if (!proc_create_seq("fs/fscache/volumes", S_IFREG | 0444, NULL, + if (!proc_create_seq("fs/netfs/volumes", S_IFREG | 0444, NULL, &fscache_volumes_seq_ops)) goto error; - if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL, + if (!proc_create_seq("fs/netfs/cookies", S_IFREG | 0444, NULL, &fscache_cookies_seq_ops)) goto error; - -#ifdef CONFIG_FSCACHE_STATS - if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL, - fscache_stats_show)) - goto error; -#endif - return 0; error: remove_proc_entry("fs/fscache", NULL); -error_dir: +error_sym: return -ENOMEM; } /* - * clean up the /proc/fs/fscache/ directory + * Clean up the /proc/fs/fscache symlink. */ void fscache_proc_cleanup(void) { diff --git a/fs/fscache/stats.c b/fs/netfs/fscache_stats.c index fc94e5e79f1c..add21abdf713 100644 --- a/fs/fscache/stats.c +++ b/fs/netfs/fscache_stats.c @@ -48,13 +48,15 @@ atomic_t fscache_n_no_create_space; EXPORT_SYMBOL(fscache_n_no_create_space); atomic_t fscache_n_culled; EXPORT_SYMBOL(fscache_n_culled); +atomic_t fscache_n_dio_misfit; +EXPORT_SYMBOL(fscache_n_dio_misfit); /* * display the general statistics */ -int fscache_stats_show(struct seq_file *m, void *v) +int fscache_stats_show(struct seq_file *m) { - seq_puts(m, "FS-Cache statistics\n"); + seq_puts(m, "-- FS-Cache statistics --\n"); seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n", atomic_read(&fscache_n_cookies), atomic_read(&fscache_n_volumes), @@ -93,10 +95,9 @@ int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_no_create_space), atomic_read(&fscache_n_culled)); - seq_printf(m, "IO : rd=%u wr=%u\n", + seq_printf(m, "IO : rd=%u wr=%u mis=%u\n", atomic_read(&fscache_n_read), - atomic_read(&fscache_n_write)); - - netfs_stats_show(m); + atomic_read(&fscache_n_write), + atomic_read(&fscache_n_dio_misfit)); return 0; } diff --git a/fs/fscache/volume.c b/fs/netfs/fscache_volume.c index cdf991bdd9de..cdf991bdd9de 100644 --- a/fs/fscache/volume.c +++ b/fs/netfs/fscache_volume.c diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 43fac1b14e40..ec7045d24400 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -5,9 +5,13 @@ * Written by David Howells (dhowells@redhat.com) */ +#include <linux/slab.h> +#include <linux/seq_file.h> #include <linux/netfs.h> #include <linux/fscache.h> +#include <linux/fscache-cache.h> #include <trace/events/netfs.h> +#include <trace/events/fscache.h> #ifdef pr_fmt #undef pr_fmt @@ -19,6 +23,8 @@ * buffered_read.c */ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq); +int netfs_prefetch_for_write(struct file *file, struct folio *folio, + size_t offset, size_t len); /* * io.c @@ -29,6 +35,41 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync); * main.c */ extern unsigned int netfs_debug; +extern struct list_head netfs_io_requests; +extern spinlock_t netfs_proc_lock; + +#ifdef CONFIG_PROC_FS +static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) +{ + spin_lock(&netfs_proc_lock); + list_add_tail_rcu(&rreq->proc_link, &netfs_io_requests); + spin_unlock(&netfs_proc_lock); +} +static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) +{ + if (!list_empty(&rreq->proc_link)) { + spin_lock(&netfs_proc_lock); + list_del_rcu(&rreq->proc_link); + spin_unlock(&netfs_proc_lock); + } +} +#else +static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) {} +static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} +#endif + +/* + * misc.c + */ +#define NETFS_FLAG_PUT_MARK BIT(0) +#define NETFS_FLAG_PAGECACHE_MARK BIT(1) +int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, + struct folio *folio, unsigned int flags, + gfp_t gfp_mask); +int netfs_add_folios_to_buffer(struct xarray *buffer, + struct address_space *mapping, + pgoff_t index, pgoff_t to, gfp_t gfp_mask); +void netfs_clear_buffer(struct xarray *buffer); /* * objects.c @@ -50,9 +91,20 @@ static inline void netfs_see_request(struct netfs_io_request *rreq, } /* + * output.c + */ +int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, + enum netfs_write_trace what); +struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len); +int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end); +int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb); + +/* * stats.c */ #ifdef CONFIG_NETFS_STATS +extern atomic_t netfs_n_rh_dio_read; +extern atomic_t netfs_n_rh_dio_write; extern atomic_t netfs_n_rh_readahead; extern atomic_t netfs_n_rh_readpage; extern atomic_t netfs_n_rh_rreq; @@ -71,7 +123,15 @@ extern atomic_t netfs_n_rh_write_begin; extern atomic_t netfs_n_rh_write_done; extern atomic_t netfs_n_rh_write_failed; extern atomic_t netfs_n_rh_write_zskip; +extern atomic_t netfs_n_wh_wstream_conflict; +extern atomic_t netfs_n_wh_upload; +extern atomic_t netfs_n_wh_upload_done; +extern atomic_t netfs_n_wh_upload_failed; +extern atomic_t netfs_n_wh_write; +extern atomic_t netfs_n_wh_write_done; +extern atomic_t netfs_n_wh_write_failed; +int netfs_stats_show(struct seq_file *m, void *v); static inline void netfs_stat(atomic_t *stat) { @@ -103,6 +163,176 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx) #endif } +/* + * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap). + */ +static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group) +{ + if (netfs_group) + refcount_inc(&netfs_group->ref); + return netfs_group; +} + +/* + * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap). + */ +static inline void netfs_put_group(struct netfs_group *netfs_group) +{ + if (netfs_group && refcount_dec_and_test(&netfs_group->ref)) + netfs_group->free(netfs_group); +} + +/* + * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap). + */ +static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr) +{ + if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref)) + netfs_group->free(netfs_group); +} + +/* + * fscache-cache.c + */ +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_caches_seq_ops; +#endif +bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); +void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); +struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache); +void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where); + +static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache) +{ + return smp_load_acquire(&cache->state); +} + +static inline bool fscache_cache_is_live(const struct fscache_cache *cache) +{ + return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE; +} + +static inline void fscache_set_cache_state(struct fscache_cache *cache, + enum fscache_cache_state new_state) +{ + smp_store_release(&cache->state, new_state); + +} + +static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache, + enum fscache_cache_state old_state, + enum fscache_cache_state new_state) +{ + return try_cmpxchg_release(&cache->state, &old_state, new_state); +} + +/* + * fscache-cookie.c + */ +extern struct kmem_cache *fscache_cookie_jar; +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_cookies_seq_ops; +#endif +extern struct timer_list fscache_cookie_lru_timer; + +extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix); +extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie, + enum fscache_access_trace why); + +static inline void fscache_see_cookie(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) +{ + trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref), + where); +} + +/* + * fscache-main.c + */ +extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len); +#ifdef CONFIG_FSCACHE +int __init fscache_init(void); +void __exit fscache_exit(void); +#else +static inline int fscache_init(void) { return 0; } +static inline void fscache_exit(void) {} +#endif + +/* + * fscache-proc.c + */ +#ifdef CONFIG_PROC_FS +extern int __init fscache_proc_init(void); +extern void fscache_proc_cleanup(void); +#else +#define fscache_proc_init() (0) +#define fscache_proc_cleanup() do {} while (0) +#endif + +/* + * fscache-stats.c + */ +#ifdef CONFIG_FSCACHE_STATS +extern atomic_t fscache_n_volumes; +extern atomic_t fscache_n_volumes_collision; +extern atomic_t fscache_n_volumes_nomem; +extern atomic_t fscache_n_cookies; +extern atomic_t fscache_n_cookies_lru; +extern atomic_t fscache_n_cookies_lru_expired; +extern atomic_t fscache_n_cookies_lru_removed; +extern atomic_t fscache_n_cookies_lru_dropped; + +extern atomic_t fscache_n_acquires; +extern atomic_t fscache_n_acquires_ok; +extern atomic_t fscache_n_acquires_oom; + +extern atomic_t fscache_n_invalidates; + +extern atomic_t fscache_n_relinquishes; +extern atomic_t fscache_n_relinquishes_retire; +extern atomic_t fscache_n_relinquishes_dropped; + +extern atomic_t fscache_n_resizes; +extern atomic_t fscache_n_resizes_null; + +static inline void fscache_stat(atomic_t *stat) +{ + atomic_inc(stat); +} + +static inline void fscache_stat_d(atomic_t *stat) +{ + atomic_dec(stat); +} + +#define __fscache_stat(stat) (stat) + +int fscache_stats_show(struct seq_file *m); +#else + +#define __fscache_stat(stat) (NULL) +#define fscache_stat(stat) do {} while (0) +#define fscache_stat_d(stat) do {} while (0) + +static inline int fscache_stats_show(struct seq_file *m) { return 0; } +#endif + +/* + * fscache-volume.c + */ +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_volumes_seq_ops; +#endif + +struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +void fscache_put_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +bool fscache_begin_volume_access(struct fscache_volume *volume, + struct fscache_cookie *cookie, + enum fscache_access_trace why); +void fscache_create_volume(struct fscache_volume *volume, bool wait); + /*****************************************************************************/ /* * debug tracing @@ -143,3 +373,57 @@ do { \ #define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) #define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) #endif + +/* + * assertions + */ +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif /* assert or not */ diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 7f753380e047..e8ff1e61ce79 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -21,12 +21,7 @@ */ static void netfs_clear_unread(struct netfs_io_subrequest *subreq) { - struct iov_iter iter; - - iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - subreq->transferred); - iov_iter_zero(iov_iter_count(&iter), &iter); + iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter); } static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, @@ -46,14 +41,9 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq, enum netfs_read_from_hole read_hole) { struct netfs_cache_resources *cres = &rreq->cache_resources; - struct iov_iter iter; netfs_stat(&netfs_n_rh_read); - iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - subreq->transferred); - - cres->ops->read(cres, subreq->start, &iter, read_hole, + cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole, netfs_cache_read_terminated, subreq); } @@ -88,6 +78,13 @@ static void netfs_read_from_server(struct netfs_io_request *rreq, struct netfs_io_subrequest *subreq) { netfs_stat(&netfs_n_rh_download); + + if (rreq->origin != NETFS_DIO_READ && + iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) + pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n", + rreq->debug_id, subreq->debug_index, + iov_iter_count(&subreq->io_iter), subreq->len, + subreq->transferred, subreq->flags); rreq->netfs_ops->issue_read(subreq); } @@ -127,9 +124,10 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, /* We might have multiple writes from the same huge * folio, but we mustn't unlock a folio more than once. */ - if (have_unlocked && folio_index(folio) <= unlocked) + if (have_unlocked && folio->index <= unlocked) continue; - unlocked = folio_index(folio); + unlocked = folio_next_index(folio) - 1; + trace_netfs_folio(folio, netfs_folio_trace_end_copy); folio_end_fscache(folio); have_unlocked = true; } @@ -201,7 +199,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) } ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, - rreq->i_size, true); + subreq->len, rreq->i_size, true); if (ret < 0) { trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); @@ -260,6 +258,30 @@ static void netfs_rreq_short_read(struct netfs_io_request *rreq, } /* + * Reset the subrequest iterator prior to resubmission. + */ +static void netfs_reset_subreq_iter(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + size_t remaining = subreq->len - subreq->transferred; + size_t count = iov_iter_count(&subreq->io_iter); + + if (count == remaining) + return; + + _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", + rreq->debug_id, subreq->debug_index, + iov_iter_count(&subreq->io_iter), subreq->transferred, + subreq->len, rreq->i_size, + subreq->io_iter.iter_type); + + if (count < remaining) + iov_iter_revert(&subreq->io_iter, remaining - count); + else + iov_iter_advance(&subreq->io_iter, count - remaining); +} + +/* * Resubmit any short or failed operations. Returns true if we got the rreq * ref back. */ @@ -287,6 +309,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); atomic_inc(&rreq->nr_outstanding); + netfs_reset_subreq_iter(rreq, subreq); netfs_read_from_server(rreq, subreq); } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { netfs_rreq_short_read(rreq, subreq); @@ -321,6 +344,43 @@ static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq) } /* + * Determine how much we can admit to having read from a DIO read. + */ +static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + unsigned int i; + size_t transferred = 0; + + for (i = 0; i < rreq->direct_bv_count; i++) + flush_dcache_page(rreq->direct_bv[i].bv_page); + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + if (subreq->error || subreq->transferred == 0) + break; + transferred += subreq->transferred; + if (subreq->transferred < subreq->len) + break; + } + + for (i = 0; i < rreq->direct_bv_count; i++) + flush_dcache_page(rreq->direct_bv[i].bv_page); + + rreq->transferred = transferred; + task_io_account_read(transferred); + + if (rreq->iocb) { + rreq->iocb->ki_pos += transferred; + if (rreq->iocb->ki_complete) + rreq->iocb->ki_complete( + rreq->iocb, rreq->error ? rreq->error : transferred); + } + if (rreq->netfs_ops->done) + rreq->netfs_ops->done(rreq); + inode_dio_end(rreq->inode); +} + +/* * Assess the state of a read request and decide what to do next. * * Note that we could be in an ordinary kernel thread, on a workqueue or in @@ -340,8 +400,12 @@ again: return; } - netfs_rreq_unlock_folios(rreq); + if (rreq->origin != NETFS_DIO_READ) + netfs_rreq_unlock_folios(rreq); + else + netfs_rreq_assess_dio(rreq); + trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); @@ -399,9 +463,9 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, struct netfs_io_request *rreq = subreq->rreq; int u; - _enter("[%u]{%llx,%lx},%zd", - subreq->debug_index, subreq->start, subreq->flags, - transferred_or_error); + _enter("R=%x[%x]{%llx,%lx},%zd", + rreq->debug_id, subreq->debug_index, + subreq->start, subreq->flags, transferred_or_error); switch (subreq->source) { case NETFS_READ_FROM_CACHE: @@ -501,15 +565,20 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest */ static enum netfs_io_source netfs_rreq_prepare_read(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) + struct netfs_io_subrequest *subreq, + struct iov_iter *io_iter) { - enum netfs_io_source source; + enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; + struct netfs_inode *ictx = netfs_inode(rreq->inode); + size_t lsize; _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); - source = netfs_cache_prepare_read(subreq, rreq->i_size); - if (source == NETFS_INVALID_READ) - goto out; + if (rreq->origin != NETFS_DIO_READ) { + source = netfs_cache_prepare_read(subreq, rreq->i_size); + if (source == NETFS_INVALID_READ) + goto out; + } if (source == NETFS_DOWNLOAD_FROM_SERVER) { /* Call out to the netfs to let it shrink the request to fit @@ -518,19 +587,52 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, * to make serial calls, it can indicate a short read and then * we will call it again. */ + if (rreq->origin != NETFS_DIO_READ) { + if (subreq->start >= ictx->zero_point) { + source = NETFS_FILL_WITH_ZEROES; + goto set; + } + if (subreq->len > ictx->zero_point - subreq->start) + subreq->len = ictx->zero_point - subreq->start; + } if (subreq->len > rreq->i_size - subreq->start) subreq->len = rreq->i_size - subreq->start; + if (rreq->rsize && subreq->len > rreq->rsize) + subreq->len = rreq->rsize; if (rreq->netfs_ops->clamp_length && !rreq->netfs_ops->clamp_length(subreq)) { source = NETFS_INVALID_READ; goto out; } + + if (subreq->max_nr_segs) { + lsize = netfs_limit_iter(io_iter, 0, subreq->len, + subreq->max_nr_segs); + if (subreq->len > lsize) { + subreq->len = lsize; + trace_netfs_sreq(subreq, netfs_sreq_trace_limited); + } + } } - if (WARN_ON(subreq->len == 0)) +set: + if (subreq->len > rreq->len) + pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n", + rreq->debug_id, subreq->debug_index, + subreq->len, rreq->len); + + if (WARN_ON(subreq->len == 0)) { source = NETFS_INVALID_READ; + goto out; + } + subreq->source = source; + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + + subreq->io_iter = *io_iter; + iov_iter_truncate(&subreq->io_iter, subreq->len); + iov_iter_advance(io_iter, subreq->len); out: subreq->source = source; trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); @@ -541,6 +643,7 @@ out: * Slice off a piece of a read request and submit an I/O request for it. */ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, + struct iov_iter *io_iter, unsigned int *_debug_index) { struct netfs_io_subrequest *subreq; @@ -552,7 +655,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, subreq->debug_index = (*_debug_index)++; subreq->start = rreq->start + rreq->submitted; - subreq->len = rreq->len - rreq->submitted; + subreq->len = io_iter->count; _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted); list_add_tail(&subreq->rreq_link, &rreq->subrequests); @@ -565,7 +668,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, * (the starts must coincide), in which case, we go around the loop * again and ask it to download the next piece. */ - source = netfs_rreq_prepare_read(rreq, subreq); + source = netfs_rreq_prepare_read(rreq, subreq, io_iter); if (source == NETFS_INVALID_READ) goto subreq_failed; @@ -603,6 +706,7 @@ subreq_failed: */ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) { + struct iov_iter io_iter; unsigned int debug_index = 0; int ret; @@ -611,50 +715,71 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) if (rreq->len == 0) { pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); - netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len); return -EIO; } - INIT_WORK(&rreq->work, netfs_rreq_work); + if (rreq->origin == NETFS_DIO_READ) + inode_dio_begin(rreq->inode); - if (sync) - netfs_get_request(rreq, netfs_rreq_trace_get_hold); + // TODO: Use bounce buffer if requested + rreq->io_iter = rreq->iter; + + INIT_WORK(&rreq->work, netfs_rreq_work); /* Chop the read into slices according to what the cache and the netfs * want and submit each one. */ + netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding); atomic_set(&rreq->nr_outstanding, 1); + io_iter = rreq->io_iter; do { - if (!netfs_rreq_submit_slice(rreq, &debug_index)) + _debug("submit %llx + %zx >= %llx", + rreq->start, rreq->submitted, rreq->i_size); + if (rreq->origin == NETFS_DIO_READ && + rreq->start + rreq->submitted >= rreq->i_size) + break; + if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index)) + break; + if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && + test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) break; } while (rreq->submitted < rreq->len); + if (!rreq->submitted) { + netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); + ret = 0; + goto out; + } + if (sync) { - /* Keep nr_outstanding incremented so that the ref always belongs to - * us, and the service code isn't punted off to a random thread pool to - * process. + /* Keep nr_outstanding incremented so that the ref always + * belongs to us, and the service code isn't punted off to a + * random thread pool to process. Note that this might start + * further work, such as writing to the cache. */ - for (;;) { - wait_var_event(&rreq->nr_outstanding, - atomic_read(&rreq->nr_outstanding) == 1); + wait_var_event(&rreq->nr_outstanding, + atomic_read(&rreq->nr_outstanding) == 1); + if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_assess(rreq, false); - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) - break; - cond_resched(); - } + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); + wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len) { + if (ret == 0 && rreq->submitted < rreq->len && + rreq->origin != NETFS_DIO_READ) { trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); ret = -EIO; } - netfs_put_request(rreq, false, netfs_rreq_trace_put_hold); } else { /* If we decrement nr_outstanding to 0, the ref belongs to us. */ if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_assess(rreq, false); - ret = 0; + ret = -EIOCBQUEUED; } + +out: return ret; } diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c index 2ff07ba655a0..b781bbbf1d8d 100644 --- a/fs/netfs/iterator.c +++ b/fs/netfs/iterator.c @@ -101,3 +101,100 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, return npages; } EXPORT_SYMBOL_GPL(netfs_extract_user_iter); + +/* + * Select the span of a bvec iterator we're going to use. Limit it by both maximum + * size and maximum number of segments. Returns the size of the span in bytes. + */ +static size_t netfs_limit_bvec(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + const struct bio_vec *bvecs = iter->bvec; + unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0; + size_t len, span = 0, n = iter->count; + size_t skip = iter->iov_offset + start_offset; + + if (WARN_ON(!iov_iter_is_bvec(iter)) || + WARN_ON(start_offset > n) || + n == 0) + return 0; + + while (n && ix < nbv && skip) { + len = bvecs[ix].bv_len; + if (skip < len) + break; + skip -= len; + n -= len; + ix++; + } + + while (n && ix < nbv) { + len = min3(n, bvecs[ix].bv_len - skip, max_size); + span += len; + nsegs++; + ix++; + if (span >= max_size || nsegs >= max_segs) + break; + skip = 0; + n -= len; + } + + return min(span, max_size); +} + +/* + * Select the span of an xarray iterator we're going to use. Limit it by both + * maximum size and maximum number of segments. It is assumed that segments + * can be larger than a page in size, provided they're physically contiguous. + * Returns the size of the span in bytes. + */ +static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + struct folio *folio; + unsigned int nsegs = 0; + loff_t pos = iter->xarray_start + iter->iov_offset; + pgoff_t index = pos / PAGE_SIZE; + size_t span = 0, n = iter->count; + + XA_STATE(xas, iter->xarray, index); + + if (WARN_ON(!iov_iter_is_xarray(iter)) || + WARN_ON(start_offset > n) || + n == 0) + return 0; + max_size = min(max_size, n - start_offset); + + rcu_read_lock(); + xas_for_each(&xas, folio, ULONG_MAX) { + size_t offset, flen, len; + if (xas_retry(&xas, folio)) + continue; + if (WARN_ON(xa_is_value(folio))) + break; + if (WARN_ON(folio_test_hugetlb(folio))) + break; + + flen = folio_size(folio); + offset = offset_in_folio(folio, pos); + len = min(max_size, flen - offset); + span += len; + nsegs++; + if (span >= max_size || nsegs >= max_segs) + break; + } + + rcu_read_unlock(); + return min(span, max_size); +} + +size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + if (iov_iter_is_bvec(iter)) + return netfs_limit_bvec(iter, start_offset, max_size, max_segs); + if (iov_iter_is_xarray(iter)) + return netfs_limit_xarray(iter, start_offset, max_size, max_segs); + BUG(); +} +EXPORT_SYMBOL(netfs_limit_iter); diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c new file mode 100644 index 000000000000..75dc52a49b3a --- /dev/null +++ b/fs/netfs/locking.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * I/O and data path helper functionality. + * + * Borrowed from NFS Copyright (c) 2016 Trond Myklebust + */ + +#include <linux/kernel.h> +#include <linux/netfs.h> +#include "internal.h" + +/* + * inode_dio_wait_interruptible - wait for outstanding DIO requests to finish + * @inode: inode to wait for + * + * Waits for all pending direct I/O requests to finish so that we can + * proceed with a truncate or equivalent operation. + * + * Must be called under a lock that serializes taking new references + * to i_dio_count, usually by inode->i_mutex. + */ +static int inode_dio_wait_interruptible(struct inode *inode) +{ + if (!atomic_read(&inode->i_dio_count)) + return 0; + + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); + DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); + + for (;;) { + prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE); + if (!atomic_read(&inode->i_dio_count)) + break; + if (signal_pending(current)) + break; + schedule(); + } + finish_wait(wq, &q.wq_entry); + + return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0; +} + +/* Call with exclusively locked inode->i_rwsem */ +static int netfs_block_o_direct(struct netfs_inode *ictx) +{ + if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) + return 0; + clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags); + return inode_dio_wait_interruptible(&ictx->inode); +} + +/** + * netfs_start_io_read - declare the file is being used for buffered reads + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * NETFS_ICTX_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +int netfs_start_io_read(struct inode *inode) + __acquires(inode->i_rwsem) +{ + struct netfs_inode *ictx = netfs_inode(inode); + + /* Be an optimist! */ + if (down_read_interruptible(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) == 0) + return 0; + up_read(&inode->i_rwsem); + + /* Slow path.... */ + if (down_write_killable(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (netfs_block_o_direct(ictx) < 0) { + up_write(&inode->i_rwsem); + return -ERESTARTSYS; + } + downgrade_write(&inode->i_rwsem); + return 0; +} +EXPORT_SYMBOL(netfs_start_io_read); + +/** + * netfs_end_io_read - declare that the buffered read operation is done + * @inode: file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void netfs_end_io_read(struct inode *inode) + __releases(inode->i_rwsem) +{ + up_read(&inode->i_rwsem); +} +EXPORT_SYMBOL(netfs_end_io_read); + +/** + * netfs_start_io_write - declare the file is being used for buffered writes + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + */ +int netfs_start_io_write(struct inode *inode) + __acquires(inode->i_rwsem) +{ + struct netfs_inode *ictx = netfs_inode(inode); + + if (down_write_killable(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (netfs_block_o_direct(ictx) < 0) { + up_write(&inode->i_rwsem); + return -ERESTARTSYS; + } + return 0; +} +EXPORT_SYMBOL(netfs_start_io_write); + +/** + * netfs_end_io_write - declare that the buffered write operation is done + * @inode: file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void netfs_end_io_write(struct inode *inode) + __releases(inode->i_rwsem) +{ + up_write(&inode->i_rwsem); +} +EXPORT_SYMBOL(netfs_end_io_write); + +/* Call with exclusively locked inode->i_rwsem */ +static int netfs_block_buffered(struct inode *inode) +{ + struct netfs_inode *ictx = netfs_inode(inode); + int ret; + + if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) { + set_bit(NETFS_ICTX_ODIRECT, &ictx->flags); + if (inode->i_mapping->nrpages != 0) { + unmap_mapping_range(inode->i_mapping, 0, 0, 0); + ret = filemap_fdatawait(inode->i_mapping); + if (ret < 0) { + clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags); + return ret; + } + } + } + return 0; +} + +/** + * netfs_start_io_direct - declare the file is being used for direct i/o + * @inode: file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * NETFS_ICTX_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +int netfs_start_io_direct(struct inode *inode) + __acquires(inode->i_rwsem) +{ + struct netfs_inode *ictx = netfs_inode(inode); + int ret; + + /* Be an optimist! */ + if (down_read_interruptible(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) != 0) + return 0; + up_read(&inode->i_rwsem); + + /* Slow path.... */ + if (down_write_killable(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + ret = netfs_block_buffered(inode); + if (ret < 0) { + up_write(&inode->i_rwsem); + return ret; + } + downgrade_write(&inode->i_rwsem); + return 0; +} +EXPORT_SYMBOL(netfs_start_io_direct); + +/** + * netfs_end_io_direct - declare that the direct i/o operation is done + * @inode: file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void netfs_end_io_direct(struct inode *inode) + __releases(inode->i_rwsem) +{ + up_read(&inode->i_rwsem); +} +EXPORT_SYMBOL(netfs_end_io_direct); diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 068568702957..5e77618a7940 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -7,6 +7,8 @@ #include <linux/module.h> #include <linux/export.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/netfs.h> @@ -15,6 +17,113 @@ MODULE_DESCRIPTION("Network fs support"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); +EXPORT_TRACEPOINT_SYMBOL(netfs_sreq); + unsigned netfs_debug; module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); + +#ifdef CONFIG_PROC_FS +LIST_HEAD(netfs_io_requests); +DEFINE_SPINLOCK(netfs_proc_lock); + +static const char *netfs_origins[nr__netfs_io_origin] = { + [NETFS_READAHEAD] = "RA", + [NETFS_READPAGE] = "RP", + [NETFS_READ_FOR_WRITE] = "RW", + [NETFS_WRITEBACK] = "WB", + [NETFS_WRITETHROUGH] = "WT", + [NETFS_LAUNDER_WRITE] = "LW", + [NETFS_UNBUFFERED_WRITE] = "UW", + [NETFS_DIO_READ] = "DR", + [NETFS_DIO_WRITE] = "DW", +}; + +/* + * Generate a list of I/O requests in /proc/fs/netfs/requests + */ +static int netfs_requests_seq_show(struct seq_file *m, void *v) +{ + struct netfs_io_request *rreq; + + if (v == &netfs_io_requests) { + seq_puts(m, + "REQUEST OR REF FL ERR OPS COVERAGE\n" + "======== == === == ==== === =========\n" + ); + return 0; + } + + rreq = list_entry(v, struct netfs_io_request, proc_link); + seq_printf(m, + "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx", + rreq->debug_id, + netfs_origins[rreq->origin], + refcount_read(&rreq->ref), + rreq->flags, + rreq->error, + atomic_read(&rreq->nr_outstanding), + rreq->start, rreq->submitted, rreq->len); + seq_putc(m, '\n'); + return 0; +} + +static void *netfs_requests_seq_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) +{ + rcu_read_lock(); + return seq_list_start_head(&netfs_io_requests, *_pos); +} + +static void *netfs_requests_seq_next(struct seq_file *m, void *v, loff_t *_pos) +{ + return seq_list_next(v, &netfs_io_requests, _pos); +} + +static void netfs_requests_seq_stop(struct seq_file *m, void *v) + __releases(rcu) +{ + rcu_read_unlock(); +} + +static const struct seq_operations netfs_requests_seq_ops = { + .start = netfs_requests_seq_start, + .next = netfs_requests_seq_next, + .stop = netfs_requests_seq_stop, + .show = netfs_requests_seq_show, +}; +#endif /* CONFIG_PROC_FS */ + +static int __init netfs_init(void) +{ + int ret = -ENOMEM; + + if (!proc_mkdir("fs/netfs", NULL)) + goto error; + if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL, + &netfs_requests_seq_ops)) + goto error_proc; +#ifdef CONFIG_FSCACHE_STATS + if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL, + netfs_stats_show)) + goto error_proc; +#endif + + ret = fscache_init(); + if (ret < 0) + goto error_proc; + return 0; + +error_proc: + remove_proc_entry("fs/netfs", NULL); +error: + return ret; +} +fs_initcall(netfs_init); + +static void __exit netfs_exit(void) +{ + fscache_exit(); + remove_proc_entry("fs/netfs", NULL); +} +module_exit(netfs_exit); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c new file mode 100644 index 000000000000..90051ced8e2a --- /dev/null +++ b/fs/netfs/misc.c @@ -0,0 +1,260 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Miscellaneous routines. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/swap.h> +#include "internal.h" + +/* + * Attach a folio to the buffer and maybe set marks on it to say that we need + * to put the folio later and twiddle the pagecache flags. + */ +int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, + struct folio *folio, unsigned int flags, + gfp_t gfp_mask) +{ + XA_STATE_ORDER(xas, xa, index, folio_order(folio)); + +retry: + xas_lock(&xas); + for (;;) { + xas_store(&xas, folio); + if (!xas_error(&xas)) + break; + xas_unlock(&xas); + if (!xas_nomem(&xas, gfp_mask)) + return xas_error(&xas); + goto retry; + } + + if (flags & NETFS_FLAG_PUT_MARK) + xas_set_mark(&xas, NETFS_BUF_PUT_MARK); + if (flags & NETFS_FLAG_PAGECACHE_MARK) + xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK); + xas_unlock(&xas); + return xas_error(&xas); +} + +/* + * Create the specified range of folios in the buffer attached to the read + * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that + * these need freeing later. + */ +int netfs_add_folios_to_buffer(struct xarray *buffer, + struct address_space *mapping, + pgoff_t index, pgoff_t to, gfp_t gfp_mask) +{ + struct folio *folio; + int ret; + + if (to + 1 == index) /* Page range is inclusive */ + return 0; + + do { + /* TODO: Figure out what order folio can be allocated here */ + folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0); + if (!folio) + return -ENOMEM; + folio->index = index; + ret = netfs_xa_store_and_mark(buffer, index, folio, + NETFS_FLAG_PUT_MARK, gfp_mask); + if (ret < 0) { + folio_put(folio); + return ret; + } + + index += folio_nr_pages(folio); + } while (index <= to && index != 0); + + return 0; +} + +/* + * Clear an xarray buffer, putting a ref on the folios that have + * NETFS_BUF_PUT_MARK set. + */ +void netfs_clear_buffer(struct xarray *buffer) +{ + struct folio *folio; + XA_STATE(xas, buffer, 0); + + rcu_read_lock(); + xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) { + folio_put(folio); + } + rcu_read_unlock(); + xa_destroy(buffer); +} + +/** + * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback + * @mapping: The mapping the folio belongs to. + * @folio: The folio being dirtied. + * + * Set the dirty flag on a folio and pin an in-use cache object in memory so + * that writeback can later write to it. This is intended to be called from + * the filesystem's ->dirty_folio() method. + * + * Return: true if the dirty flag was set on the folio, false otherwise. + */ +bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + struct inode *inode = mapping->host; + struct netfs_inode *ictx = netfs_inode(inode); + struct fscache_cookie *cookie = netfs_i_cookie(ictx); + bool need_use = false; + + _enter(""); + + if (!filemap_dirty_folio(mapping, folio)) + return false; + if (!fscache_cookie_valid(cookie)) + return true; + + if (!(inode->i_state & I_PINNING_NETFS_WB)) { + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_PINNING_NETFS_WB)) { + inode->i_state |= I_PINNING_NETFS_WB; + need_use = true; + } + spin_unlock(&inode->i_lock); + + if (need_use) + fscache_use_cookie(cookie, true); + } + return true; +} +EXPORT_SYMBOL(netfs_dirty_folio); + +/** + * netfs_unpin_writeback - Unpin writeback resources + * @inode: The inode on which the cookie resides + * @wbc: The writeback control + * + * Unpin the writeback resources pinned by netfs_dirty_folio(). This is + * intended to be called as/by the netfs's ->write_inode() method. + */ +int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc) +{ + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); + + if (wbc->unpinned_netfs_wb) + fscache_unuse_cookie(cookie, NULL, NULL); + return 0; +} +EXPORT_SYMBOL(netfs_unpin_writeback); + +/** + * netfs_clear_inode_writeback - Clear writeback resources pinned by an inode + * @inode: The inode to clean up + * @aux: Auxiliary data to apply to the inode + * + * Clear any writeback resources held by an inode when the inode is evicted. + * This must be called before clear_inode() is called. + */ +void netfs_clear_inode_writeback(struct inode *inode, const void *aux) +{ + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); + + if (inode->i_state & I_PINNING_NETFS_WB) { + loff_t i_size = i_size_read(inode); + fscache_unuse_cookie(cookie, aux, &i_size); + } +} +EXPORT_SYMBOL(netfs_clear_inode_writeback); + +/** + * netfs_invalidate_folio - Invalidate or partially invalidate a folio + * @folio: Folio proposed for release + * @offset: Offset of the invalidated region + * @length: Length of the invalidated region + * + * Invalidate part or all of a folio for a network filesystem. The folio will + * be removed afterwards if the invalidated region covers the entire folio. + */ +void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) +{ + struct netfs_folio *finfo = NULL; + size_t flen = folio_size(folio); + + _enter("{%lx},%zx,%zx", folio->index, offset, length); + + folio_wait_fscache(folio); + + if (!folio_test_private(folio)) + return; + + finfo = netfs_folio_info(folio); + + if (offset == 0 && length >= flen) + goto erase_completely; + + if (finfo) { + /* We have a partially uptodate page from a streaming write. */ + unsigned int fstart = finfo->dirty_offset; + unsigned int fend = fstart + finfo->dirty_len; + unsigned int end = offset + length; + + if (offset >= fend) + return; + if (end <= fstart) + return; + if (offset <= fstart && end >= fend) + goto erase_completely; + if (offset <= fstart && end > fstart) + goto reduce_len; + if (offset > fstart && end >= fend) + goto move_start; + /* A partial write was split. The caller has already zeroed + * it, so just absorb the hole. + */ + } + return; + +erase_completely: + netfs_put_group(netfs_folio_group(folio)); + folio_detach_private(folio); + folio_clear_uptodate(folio); + kfree(finfo); + return; +reduce_len: + finfo->dirty_len = offset + length - finfo->dirty_offset; + return; +move_start: + finfo->dirty_len -= offset - finfo->dirty_offset; + finfo->dirty_offset = offset; +} +EXPORT_SYMBOL(netfs_invalidate_folio); + +/** + * netfs_release_folio - Try to release a folio + * @folio: Folio proposed for release + * @gfp: Flags qualifying the release + * + * Request release of a folio and clean up its private state if it's not busy. + * Returns true if the folio can now be released, false if not + */ +bool netfs_release_folio(struct folio *folio, gfp_t gfp) +{ + struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); + unsigned long long end; + + end = folio_pos(folio) + folio_size(folio); + if (end > ctx->zero_point) + ctx->zero_point = end; + + if (folio_test_private(folio)) + return false; + if (folio_test_fscache(folio)) { + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + folio_wait_fscache(folio); + } + + fscache_note_page_release(netfs_i_cookie(ctx)); + return true; +} +EXPORT_SYMBOL(netfs_release_folio); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index e17cdf53f6a7..610ceb5bd86c 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -20,14 +20,20 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct inode *inode = file ? file_inode(file) : mapping->host; struct netfs_inode *ctx = netfs_inode(inode); struct netfs_io_request *rreq; + bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE || + origin == NETFS_DIO_READ || + origin == NETFS_DIO_WRITE); + bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx); int ret; - rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL); + rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request), + GFP_KERNEL); if (!rreq) return ERR_PTR(-ENOMEM); rreq->start = start; rreq->len = len; + rreq->upper_len = len; rreq->origin = origin; rreq->netfs_ops = ctx->ops; rreq->mapping = mapping; @@ -35,8 +41,14 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, rreq->i_size = i_size_read(inode); rreq->debug_id = atomic_inc_return(&debug_ids); INIT_LIST_HEAD(&rreq->subrequests); + INIT_WORK(&rreq->work, NULL); refcount_set(&rreq->ref, 1); + __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + if (cached) + __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); + if (file && file->f_flags & O_NONBLOCK) + __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); if (rreq->netfs_ops->init_request) { ret = rreq->netfs_ops->init_request(rreq, file); if (ret < 0) { @@ -45,6 +57,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } } + trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); + netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); return rreq; } @@ -74,33 +88,47 @@ static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); + netfs_proc_del_rreq(rreq); netfs_clear_subrequests(rreq, false); if (rreq->netfs_ops->free_request) rreq->netfs_ops->free_request(rreq); if (rreq->cache_resources.ops) rreq->cache_resources.ops->end_operation(&rreq->cache_resources); - kfree(rreq); + if (rreq->direct_bv) { + for (i = 0; i < rreq->direct_bv_count; i++) { + if (rreq->direct_bv[i].bv_page) { + if (rreq->direct_bv_unpin) + unpin_user_page(rreq->direct_bv[i].bv_page); + } + } + kvfree(rreq->direct_bv); + } + kfree_rcu(rreq, rcu); netfs_stat_d(&netfs_n_rh_rreq); } void netfs_put_request(struct netfs_io_request *rreq, bool was_async, enum netfs_rreq_ref_trace what) { - unsigned int debug_id = rreq->debug_id; + unsigned int debug_id; bool dead; int r; - dead = __refcount_dec_and_test(&rreq->ref, &r); - trace_netfs_rreq_ref(debug_id, r - 1, what); - if (dead) { - if (was_async) { - rreq->work.func = netfs_free_request; - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); - } else { - netfs_free_request(&rreq->work); + if (rreq) { + debug_id = rreq->debug_id; + dead = __refcount_dec_and_test(&rreq->ref, &r); + trace_netfs_rreq_ref(debug_id, r - 1, what); + if (dead) { + if (was_async) { + rreq->work.func = netfs_free_request; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); + } else { + netfs_free_request(&rreq->work); + } } } } @@ -112,8 +140,11 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq { struct netfs_io_subrequest *subreq; - subreq = kzalloc(sizeof(struct netfs_io_subrequest), GFP_KERNEL); + subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?: + sizeof(struct netfs_io_subrequest), + GFP_KERNEL); if (subreq) { + INIT_WORK(&subreq->work, NULL); INIT_LIST_HEAD(&subreq->rreq_link); refcount_set(&subreq->ref, 2); subreq->rreq = rreq; @@ -140,6 +171,8 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, struct netfs_io_request *rreq = subreq->rreq; trace_netfs_sreq(subreq, netfs_sreq_trace_free); + if (rreq->netfs_ops->free_subrequest) + rreq->netfs_ops->free_subrequest(subreq); kfree(subreq); netfs_stat_d(&netfs_n_rh_sreq); netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq); diff --git a/fs/netfs/output.c b/fs/netfs/output.c new file mode 100644 index 000000000000..625eb68f3e5a --- /dev/null +++ b/fs/netfs/output.c @@ -0,0 +1,478 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem high-level write support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include "internal.h" + +/** + * netfs_create_write_request - Create a write operation. + * @wreq: The write request this is storing from. + * @dest: The destination type + * @start: Start of the region this write will modify + * @len: Length of the modification + * @worker: The worker function to handle the write(s) + * + * Allocate a write operation, set it up and add it to the list on a write + * request. + */ +struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq, + enum netfs_io_source dest, + loff_t start, size_t len, + work_func_t worker) +{ + struct netfs_io_subrequest *subreq; + + subreq = netfs_alloc_subrequest(wreq); + if (subreq) { + INIT_WORK(&subreq->work, worker); + subreq->source = dest; + subreq->start = start; + subreq->len = len; + subreq->debug_index = wreq->subreq_counter++; + + switch (subreq->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write); + break; + default: + BUG(); + } + + subreq->io_iter = wreq->io_iter; + iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start); + iov_iter_truncate(&subreq->io_iter, subreq->len); + + trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, + refcount_read(&subreq->ref), + netfs_sreq_trace_new); + atomic_inc(&wreq->nr_outstanding); + list_add_tail(&subreq->rreq_link, &wreq->subrequests); + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + } + + return subreq; +} +EXPORT_SYMBOL(netfs_create_write_request); + +/* + * Process a completed write request once all the component operations have + * been completed. + */ +static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async) +{ + struct netfs_io_subrequest *subreq; + struct netfs_inode *ctx = netfs_inode(wreq->inode); + size_t transferred = 0; + + _enter("R=%x[]", wreq->debug_id); + + trace_netfs_rreq(wreq, netfs_rreq_trace_write_done); + + list_for_each_entry(subreq, &wreq->subrequests, rreq_link) { + if (subreq->error || subreq->transferred == 0) + break; + transferred += subreq->transferred; + if (subreq->transferred < subreq->len) + break; + } + wreq->transferred = transferred; + + list_for_each_entry(subreq, &wreq->subrequests, rreq_link) { + if (!subreq->error) + continue; + switch (subreq->source) { + case NETFS_UPLOAD_TO_SERVER: + /* Depending on the type of failure, this may prevent + * writeback completion unless we're in disconnected + * mode. + */ + if (!wreq->error) + wreq->error = subreq->error; + break; + + case NETFS_WRITE_TO_CACHE: + /* Failure doesn't prevent writeback completion unless + * we're in disconnected mode. + */ + if (subreq->error != -ENOBUFS) + ctx->ops->invalidate_cache(wreq); + break; + + default: + WARN_ON_ONCE(1); + if (!wreq->error) + wreq->error = -EIO; + return; + } + } + + wreq->cleanup(wreq); + + if (wreq->origin == NETFS_DIO_WRITE && + wreq->mapping->nrpages) { + pgoff_t first = wreq->start >> PAGE_SHIFT; + pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT; + invalidate_inode_pages2_range(wreq->mapping, first, last); + } + + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_end(wreq->inode); + + _debug("finished"); + trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); + clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); + wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); + + if (wreq->iocb) { + wreq->iocb->ki_pos += transferred; + if (wreq->iocb->ki_complete) + wreq->iocb->ki_complete( + wreq->iocb, wreq->error ? wreq->error : transferred); + } + + netfs_clear_subrequests(wreq, was_async); + netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete); +} + +/* + * Deal with the completion of writing the data to the cache. + */ +void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_io_subrequest *subreq = _op; + struct netfs_io_request *wreq = subreq->rreq; + unsigned int u; + + _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); + + switch (subreq->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload_done); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write_done); + break; + case NETFS_INVALID_WRITE: + break; + default: + BUG(); + } + + if (IS_ERR_VALUE(transferred_or_error)) { + subreq->error = transferred_or_error; + trace_netfs_failure(wreq, subreq, transferred_or_error, + netfs_fail_write); + goto failed; + } + + if (WARN(transferred_or_error > subreq->len - subreq->transferred, + "Subreq excess write: R%x[%x] %zd > %zu - %zu", + wreq->debug_id, subreq->debug_index, + transferred_or_error, subreq->len, subreq->transferred)) + transferred_or_error = subreq->len - subreq->transferred; + + subreq->error = 0; + subreq->transferred += transferred_or_error; + + if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) + pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n", + wreq->debug_id, subreq->debug_index, + iov_iter_count(&subreq->io_iter), subreq->len, + subreq->transferred, subreq->io_iter.iter_type); + + if (subreq->transferred < subreq->len) + goto incomplete; + + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); +out: + trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); + + /* If we decrement nr_outstanding to 0, the ref belongs to us. */ + u = atomic_dec_return(&wreq->nr_outstanding); + if (u == 0) + netfs_write_terminated(wreq, was_async); + else if (u == 1) + wake_up_var(&wreq->nr_outstanding); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); + return; + +incomplete: + if (transferred_or_error == 0) { + if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + subreq->error = -ENODATA; + goto failed; + } + } else { + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + } + + __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags); + goto out; + +failed: + switch (subreq->source) { + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write_failed); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags); + break; + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload_failed); + set_bit(NETFS_RREQ_FAILED, &wreq->flags); + wreq->error = subreq->error; + break; + default: + break; + } + goto out; +} +EXPORT_SYMBOL(netfs_write_subrequest_terminated); + +static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *wreq = subreq->rreq; + struct netfs_cache_resources *cres = &wreq->cache_resources; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + + cres->ops->write(cres, subreq->start, &subreq->io_iter, + netfs_write_subrequest_terminated, subreq); +} + +static void netfs_write_to_cache_op_worker(struct work_struct *work) +{ + struct netfs_io_subrequest *subreq = + container_of(work, struct netfs_io_subrequest, work); + + netfs_write_to_cache_op(subreq); +} + +/** + * netfs_queue_write_request - Queue a write request for attention + * @subreq: The write request to be queued + * + * Queue the specified write request for processing by a worker thread. We + * pass the caller's ref on the request to the worker thread. + */ +void netfs_queue_write_request(struct netfs_io_subrequest *subreq) +{ + if (!queue_work(system_unbound_wq, &subreq->work)) + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip); +} +EXPORT_SYMBOL(netfs_queue_write_request); + +/* + * Set up a op for writing to the cache. + */ +static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq) +{ + struct netfs_cache_resources *cres = &wreq->cache_resources; + struct netfs_io_subrequest *subreq; + struct netfs_inode *ctx = netfs_inode(wreq->inode); + struct fscache_cookie *cookie = netfs_i_cookie(ctx); + loff_t start = wreq->start; + size_t len = wreq->len; + int ret; + + if (!fscache_cookie_enabled(cookie)) { + clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags); + return; + } + + _debug("write to cache"); + ret = fscache_begin_write_operation(cres, cookie); + if (ret < 0) + return; + + ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len, + i_size_read(wreq->inode), true); + if (ret < 0) + return; + + subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len, + netfs_write_to_cache_op_worker); + if (!subreq) + return; + + netfs_write_to_cache_op(subreq); +} + +/* + * Begin the process of writing out a chunk of data. + * + * We are given a write request that holds a series of dirty regions and + * (partially) covers a sequence of folios, all of which are present. The + * pages must have been marked as writeback as appropriate. + * + * We need to perform the following steps: + * + * (1) If encrypting, create an output buffer and encrypt each block of the + * data into it, otherwise the output buffer will point to the original + * folios. + * + * (2) If the data is to be cached, set up a write op for the entire output + * buffer to the cache, if the cache wants to accept it. + * + * (3) If the data is to be uploaded (ie. not merely cached): + * + * (a) If the data is to be compressed, create a compression buffer and + * compress the data into it. + * + * (b) For each destination we want to upload to, set up write ops to write + * to that destination. We may need multiple writes if the data is not + * contiguous or the span exceeds wsize for a server. + */ +int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, + enum netfs_write_trace what) +{ + struct netfs_inode *ctx = netfs_inode(wreq->inode); + + _enter("R=%x %llx-%llx f=%lx", + wreq->debug_id, wreq->start, wreq->start + wreq->len - 1, + wreq->flags); + + trace_netfs_write(wreq, what); + if (wreq->len == 0 || wreq->iter.count == 0) { + pr_err("Zero-sized write [R=%x]\n", wreq->debug_id); + return -EIO; + } + + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_begin(wreq->inode); + + wreq->io_iter = wreq->iter; + + /* ->outstanding > 0 carries a ref */ + netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding); + atomic_set(&wreq->nr_outstanding, 1); + + /* Start the encryption/compression going. We can do that in the + * background whilst we generate a list of write ops that we want to + * perform. + */ + // TODO: Encrypt or compress the region as appropriate + + /* We need to write all of the region to the cache */ + if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) + netfs_set_up_write_to_cache(wreq); + + /* However, we don't necessarily write all of the region to the server. + * Caching of reads is being managed this way also. + */ + if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) + ctx->ops->create_write_requests(wreq, wreq->start, wreq->len); + + if (atomic_dec_and_test(&wreq->nr_outstanding)) + netfs_write_terminated(wreq, false); + + if (!may_wait) + return -EIOCBQUEUED; + + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + return wreq->error; +} + +/* + * Begin a write operation for writing through the pagecache. + */ +struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len) +{ + struct netfs_io_request *wreq; + struct file *file = iocb->ki_filp; + + wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len, + NETFS_WRITETHROUGH); + if (IS_ERR(wreq)) + return wreq; + + trace_netfs_write(wreq, netfs_write_trace_writethrough); + + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0); + wreq->io_iter = wreq->iter; + + /* ->outstanding > 0 carries a ref */ + netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding); + atomic_set(&wreq->nr_outstanding, 1); + return wreq; +} + +static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final) +{ + struct netfs_inode *ictx = netfs_inode(wreq->inode); + unsigned long long start; + size_t len; + + if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) + return; + + start = wreq->start + wreq->submitted; + len = wreq->iter.count - wreq->submitted; + if (!final) { + len /= wreq->wsize; /* Round to number of maximum packets */ + len *= wreq->wsize; + } + + ictx->ops->create_write_requests(wreq, start, len); + wreq->submitted += len; +} + +/* + * Advance the state of the write operation used when writing through the + * pagecache. Data has been copied into the pagecache that we need to append + * to the request. If we've added more than wsize then we need to create a new + * subrequest. + */ +int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end) +{ + _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u", + wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end); + + wreq->iter.count += copied; + wreq->io_iter.count += copied; + if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize) + netfs_submit_writethrough(wreq, false); + + return wreq->error; +} + +/* + * End a write operation used when writing through the pagecache. + */ +int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb) +{ + int ret = -EIOCBQUEUED; + + _enter("ic=%zu sb=%zu ws=%u", + wreq->iter.count, wreq->submitted, wreq->wsize); + + if (wreq->submitted < wreq->io_iter.count) + netfs_submit_writethrough(wreq, true); + + if (atomic_dec_and_test(&wreq->nr_outstanding)) + netfs_write_terminated(wreq, false); + + if (is_sync_kiocb(iocb)) { + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + ret = wreq->error; + } + + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + return ret; +} diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index 5510a7a14a40..deeba9f9dcf5 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -9,6 +9,8 @@ #include <linux/seq_file.h> #include "internal.h" +atomic_t netfs_n_rh_dio_read; +atomic_t netfs_n_rh_dio_write; atomic_t netfs_n_rh_readahead; atomic_t netfs_n_rh_readpage; atomic_t netfs_n_rh_rreq; @@ -27,32 +29,48 @@ atomic_t netfs_n_rh_write_begin; atomic_t netfs_n_rh_write_done; atomic_t netfs_n_rh_write_failed; atomic_t netfs_n_rh_write_zskip; +atomic_t netfs_n_wh_wstream_conflict; +atomic_t netfs_n_wh_upload; +atomic_t netfs_n_wh_upload_done; +atomic_t netfs_n_wh_upload_failed; +atomic_t netfs_n_wh_write; +atomic_t netfs_n_wh_write_done; +atomic_t netfs_n_wh_write_failed; -void netfs_stats_show(struct seq_file *m) +int netfs_stats_show(struct seq_file *m, void *v) { - seq_printf(m, "RdHelp : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n", + seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n", + atomic_read(&netfs_n_rh_dio_read), + atomic_read(&netfs_n_rh_dio_write), atomic_read(&netfs_n_rh_readahead), atomic_read(&netfs_n_rh_readpage), atomic_read(&netfs_n_rh_write_begin), - atomic_read(&netfs_n_rh_write_zskip), - atomic_read(&netfs_n_rh_rreq), - atomic_read(&netfs_n_rh_sreq)); - seq_printf(m, "RdHelp : ZR=%u sh=%u sk=%u\n", + atomic_read(&netfs_n_rh_write_zskip)); + seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n", atomic_read(&netfs_n_rh_zero), atomic_read(&netfs_n_rh_short_read), atomic_read(&netfs_n_rh_write_zskip)); - seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n", + seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n", atomic_read(&netfs_n_rh_download), atomic_read(&netfs_n_rh_download_done), atomic_read(&netfs_n_rh_download_failed), atomic_read(&netfs_n_rh_download_instead)); - seq_printf(m, "RdHelp : RD=%u rs=%u rf=%u\n", + seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n", atomic_read(&netfs_n_rh_read), atomic_read(&netfs_n_rh_read_done), atomic_read(&netfs_n_rh_read_failed)); - seq_printf(m, "RdHelp : WR=%u ws=%u wf=%u\n", - atomic_read(&netfs_n_rh_write), - atomic_read(&netfs_n_rh_write_done), - atomic_read(&netfs_n_rh_write_failed)); + seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n", + atomic_read(&netfs_n_wh_upload), + atomic_read(&netfs_n_wh_upload_done), + atomic_read(&netfs_n_wh_upload_failed)); + seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n", + atomic_read(&netfs_n_wh_write), + atomic_read(&netfs_n_wh_write_done), + atomic_read(&netfs_n_wh_write_failed)); + seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n", + atomic_read(&netfs_n_rh_rreq), + atomic_read(&netfs_n_rh_sreq), + atomic_read(&netfs_n_wh_wstream_conflict)); + return fscache_stats_show(m); } EXPORT_SYMBOL(netfs_stats_show); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 01ac733a6320..f7e32d76e34d 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -169,8 +169,8 @@ config ROOT_NFS config NFS_FSCACHE bool "Provide NFS client caching support" - depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y - select NETFS_SUPPORT + depends on NFS_FS=m && NETFS_SUPPORT || NFS_FS=y && NETFS_SUPPORT=y + select FSCACHE help Say Y here if you want NFS data to be cached locally on disc through the general filesystem cache manager diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index b05717fe0d4e..2d1bfee225c3 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -274,12 +274,6 @@ static void nfs_netfs_free_request(struct netfs_io_request *rreq) put_nfs_open_context(rreq->netfs_priv); } -static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq) -{ - return fscache_begin_read_operation(&rreq->cache_resources, - netfs_i_cookie(netfs_inode(rreq->inode))); -} - static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq) { struct nfs_netfs_io_data *netfs; @@ -387,7 +381,6 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) const struct netfs_request_ops nfs_netfs_ops = { .init_request = nfs_netfs_init_request, .free_request = nfs_netfs_free_request, - .begin_cache_operation = nfs_netfs_begin_cache_operation, .issue_read = nfs_netfs_issue_read, .clamp_length = nfs_netfs_clamp_length }; diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 5407ab8c8783..e3cb4923316b 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -80,7 +80,7 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) } static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) { - netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops); + netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false); } extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2fa54cfd4882..6dc6340e2852 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -7911,14 +7911,16 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock *fl; int status = false; - struct nfsd_file *nf = find_any_file(fp); + struct nfsd_file *nf; struct inode *inode; struct file_lock_context *flctx; + spin_lock(&fp->fi_lock); + nf = find_any_file_locked(fp); if (!nf) { /* Any valid lock stateid should have some sort of access */ WARN_ON_ONCE(1); - return status; + goto out; } inode = file_inode(nf->nf_file); @@ -7934,7 +7936,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } spin_unlock(&flctx->flc_lock); } - nfsd_file_put(nf); +out: + spin_unlock(&fp->fi_lock); return status; } @@ -7944,10 +7947,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) * @cstate: NFSv4 COMPOUND state * @u: RELEASE_LOCKOWNER arguments * - * The lockowner's so_count is bumped when a lock record is added - * or when copying a conflicting lock. The latter case is brief, - * but can lead to fleeting false positives when looking for - * locks-in-use. + * Check if theree are any locks still held and if not - free the lockowner + * and any lock state that is owned. * * Return values: * %nfs_ok: lockowner released or not found @@ -7983,10 +7984,13 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, spin_unlock(&clp->cl_lock); return nfs_ok; } - if (atomic_read(&lo->lo_owner.so_count) != 2) { - spin_unlock(&clp->cl_lock); - nfs4_put_stateowner(&lo->lo_owner); - return nfserr_locks_held; + + list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { + if (check_for_locks(stp->st_stid.sc_file, lo)) { + spin_unlock(&clp->cl_lock); + nfs4_put_stateowner(&lo->lo_owner); + return nfserr_locks_held; + } } unhash_lockowner_locked(lo); while (!list_empty(&lo->lo_owner.so_stateids)) { diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 984ffdaeed6c..5764f91d283e 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -18,10 +18,11 @@ struct ovl_lookup_data { struct super_block *sb; - struct vfsmount *mnt; + const struct ovl_layer *layer; struct qstr name; bool is_dir; bool opaque; + bool xwhiteouts; bool stop; bool last; char *redirect; @@ -201,17 +202,13 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, return real; } -static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path) -{ - return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE); -} - static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, const char *name, struct dentry *base, int len, bool drop_negative) { - struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->mnt), name, base, len); + struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name, + base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { @@ -232,10 +229,13 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, size_t prelen, const char *post, struct dentry **ret, bool drop_negative) { + struct ovl_fs *ofs = OVL_FS(d->sb); struct dentry *this; struct path path; int err; bool last_element = !post[0]; + bool is_upper = d->layer->idx == 0; + char val; this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative); if (IS_ERR(this)) { @@ -253,8 +253,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, } path.dentry = this; - path.mnt = d->mnt; - if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) { + path.mnt = d->layer->mnt; + if (ovl_path_is_whiteout(ofs, &path)) { d->stop = d->opaque = true; goto put_and_out; } @@ -272,7 +272,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, d->stop = true; goto put_and_out; } - err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path, NULL); + err = ovl_check_metacopy_xattr(ofs, &path, NULL); if (err < 0) goto out_err; @@ -292,7 +292,12 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, if (d->last) goto out; - if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) { + /* overlay.opaque=x means xwhiteouts directory */ + val = ovl_get_opaquedir_val(ofs, &path); + if (last_element && !is_upper && val == 'x') { + d->xwhiteouts = true; + ovl_layer_set_xwhiteouts(ofs, d->layer); + } else if (val == 'y') { d->stop = true; if (last_element) d->opaque = true; @@ -863,7 +868,8 @@ fail: * Returns next layer in stack starting from top. * Returns -1 if this is the last layer. */ -int ovl_path_next(int idx, struct dentry *dentry, struct path *path) +int ovl_path_next(int idx, struct dentry *dentry, struct path *path, + const struct ovl_layer **layer) { struct ovl_entry *oe = OVL_E(dentry); struct ovl_path *lowerstack = ovl_lowerstack(oe); @@ -871,13 +877,16 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) BUG_ON(idx < 0); if (idx == 0) { ovl_path_upper(dentry, path); - if (path->dentry) + if (path->dentry) { + *layer = &OVL_FS(dentry->d_sb)->layers[0]; return ovl_numlower(oe) ? 1 : -1; + } idx++; } BUG_ON(idx > ovl_numlower(oe)); path->dentry = lowerstack[idx - 1].dentry; - path->mnt = lowerstack[idx - 1].layer->mnt; + *layer = lowerstack[idx - 1].layer; + path->mnt = (*layer)->mnt; return (idx < ovl_numlower(oe)) ? idx + 1 : -1; } @@ -1055,7 +1064,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_dentry_upper(dentry->d_parent); if (upperdir) { - d.mnt = ovl_upper_mnt(ofs); + d.layer = &ofs->layers[0]; err = ovl_lookup_layer(upperdir, &d, &upperdentry, true); if (err) goto out; @@ -1111,7 +1120,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, else if (d.is_dir || !ofs->numdatalayer) d.last = lower.layer->idx == ovl_numlower(roe); - d.mnt = lower.layer->mnt; + d.layer = lower.layer; err = ovl_lookup_layer(lower.dentry, &d, &this, false); if (err) goto out_put; @@ -1278,6 +1287,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (upperopaque) ovl_dentry_set_opaque(dentry); + if (d.xwhiteouts) + ovl_dentry_set_xwhiteouts(dentry); if (upperdentry) ovl_dentry_set_upper_alias(dentry); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 5ba11eb43767..ee949f3e7c77 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -50,7 +50,6 @@ enum ovl_xattr { OVL_XATTR_METACOPY, OVL_XATTR_PROTATTR, OVL_XATTR_XWHITEOUT, - OVL_XATTR_XWHITEOUTS, }; enum ovl_inode_flag { @@ -70,6 +69,8 @@ enum ovl_entry_flag { OVL_E_UPPER_ALIAS, OVL_E_OPAQUE, OVL_E_CONNECTED, + /* Lower stack may contain xwhiteout entries */ + OVL_E_XWHITEOUTS, }; enum { @@ -477,6 +478,10 @@ bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); +bool ovl_dentry_has_xwhiteouts(struct dentry *dentry); +void ovl_dentry_set_xwhiteouts(struct dentry *dentry); +void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs, + const struct ovl_layer *layer); bool ovl_dentry_has_upper_alias(struct dentry *dentry); void ovl_dentry_set_upper_alias(struct dentry *dentry); bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags); @@ -494,11 +499,10 @@ struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); bool ovl_already_copied_up(struct dentry *dentry, int flags); -bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, - enum ovl_xattr ox); +char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path, + enum ovl_xattr ox); bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path); -bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, const struct path *upperpath); @@ -573,7 +577,13 @@ static inline bool ovl_is_impuredir(struct super_block *sb, .mnt = ovl_upper_mnt(ofs), }; - return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE); + return ovl_get_dir_xattr_val(ofs, &upperpath, OVL_XATTR_IMPURE) == 'y'; +} + +static inline char ovl_get_opaquedir_val(struct ovl_fs *ofs, + const struct path *path) +{ + return ovl_get_dir_xattr_val(ofs, path, OVL_XATTR_OPAQUE); } static inline bool ovl_redirect_follow(struct ovl_fs *ofs) @@ -680,7 +690,8 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool verify); -int ovl_path_next(int idx, struct dentry *dentry, struct path *path); +int ovl_path_next(int idx, struct dentry *dentry, struct path *path, + const struct ovl_layer **layer); int ovl_verify_lowerdata(struct dentry *dentry); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 5fa9c58af65f..cb449ab310a7 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -40,6 +40,8 @@ struct ovl_layer { int idx; /* One fsid per unique underlying sb (upper fsid == 0) */ int fsid; + /* xwhiteouts were found on this layer */ + bool has_xwhiteouts; }; struct ovl_path { @@ -59,7 +61,7 @@ struct ovl_fs { unsigned int numfs; /* Number of data-only lower layers */ unsigned int numdatalayer; - const struct ovl_layer *layers; + struct ovl_layer *layers; struct ovl_sb *fs; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index e71156baa7bc..0ca8af060b0c 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -305,8 +305,6 @@ static inline int ovl_dir_read(const struct path *realpath, if (IS_ERR(realfile)) return PTR_ERR(realfile); - rdd->in_xwhiteouts_dir = rdd->dentry && - ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath); rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { @@ -359,10 +357,13 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, .is_lowest = false, }; int idx, next; + const struct ovl_layer *layer; for (idx = 0; idx != -1; idx = next) { - next = ovl_path_next(idx, dentry, &realpath); + next = ovl_path_next(idx, dentry, &realpath, &layer); rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; + rdd.in_xwhiteouts_dir = layer->has_xwhiteouts && + ovl_dentry_has_xwhiteouts(dentry); if (next != -1) { err = ovl_dir_read(&realpath, &rdd); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 4ab66e3d4cff..2eef6c70b2ae 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1249,6 +1249,7 @@ static struct dentry *ovl_get_root(struct super_block *sb, struct ovl_entry *oe) { struct dentry *root; + struct ovl_fs *ofs = OVL_FS(sb); struct ovl_path *lowerpath = ovl_lowerstack(oe); unsigned long ino = d_inode(lowerpath->dentry)->i_ino; int fsid = lowerpath->layer->fsid; @@ -1270,6 +1271,20 @@ static struct dentry *ovl_get_root(struct super_block *sb, ovl_set_flag(OVL_IMPURE, d_inode(root)); } + /* Look for xwhiteouts marker except in the lowermost layer */ + for (int i = 0; i < ovl_numlower(oe) - 1; i++, lowerpath++) { + struct path path = { + .mnt = lowerpath->layer->mnt, + .dentry = lowerpath->dentry, + }; + + /* overlay.opaque=x means xwhiteouts directory */ + if (ovl_get_opaquedir_val(ofs, &path) == 'x') { + ovl_layer_set_xwhiteouts(ofs, lowerpath->layer); + ovl_dentry_set_xwhiteouts(root); + } + } + /* Root is always merge -> can have whiteouts */ ovl_set_flag(OVL_WHITEOUTS, d_inode(root)); ovl_dentry_set_flag(OVL_E_CONNECTED, root); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 0217094c23ea..a8e17f14d7a2 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -461,6 +461,33 @@ void ovl_dentry_set_opaque(struct dentry *dentry) ovl_dentry_set_flag(OVL_E_OPAQUE, dentry); } +bool ovl_dentry_has_xwhiteouts(struct dentry *dentry) +{ + return ovl_dentry_test_flag(OVL_E_XWHITEOUTS, dentry); +} + +void ovl_dentry_set_xwhiteouts(struct dentry *dentry) +{ + ovl_dentry_set_flag(OVL_E_XWHITEOUTS, dentry); +} + +/* + * ovl_layer_set_xwhiteouts() is called before adding the overlay dir + * dentry to dcache, while readdir of that same directory happens after + * the overlay dir dentry is in dcache, so if some cpu observes that + * ovl_dentry_is_xwhiteouts(), it will also observe layer->has_xwhiteouts + * for the layers where xwhiteouts marker was found in that merge dir. + */ +void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs, + const struct ovl_layer *layer) +{ + if (layer->has_xwhiteouts) + return; + + /* Write once to read-mostly layer properties */ + ofs->layers[layer->idx].has_xwhiteouts = true; +} + /* * For hard links and decoded file handles, it's possible for ovl_dentry_upper() * to return positive, while there's no actual upper alias for the inode. @@ -739,19 +766,6 @@ bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path) return res >= 0; } -bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path) -{ - struct dentry *dentry = path->dentry; - int res; - - /* xattr.whiteouts must be a directory */ - if (!d_is_dir(dentry)) - return false; - - res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0); - return res >= 0; -} - /* * Load persistent uuid from xattr into s_uuid if found, or store a new * random generated value in s_uuid and in xattr. @@ -811,20 +825,17 @@ fail: return false; } -bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, - enum ovl_xattr ox) +char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path, + enum ovl_xattr ox) { int res; char val; if (!d_is_dir(path->dentry)) - return false; + return 0; res = ovl_path_getxattr(ofs, path, ox, &val, 1); - if (res == 1 && val == 'y') - return true; - - return false; + return res == 1 ? val : 0; } #define OVL_XATTR_OPAQUE_POSTFIX "opaque" @@ -837,7 +848,6 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, #define OVL_XATTR_METACOPY_POSTFIX "metacopy" #define OVL_XATTR_PROTATTR_POSTFIX "protattr" #define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout" -#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts" #define OVL_XATTR_TAB_ENTRY(x) \ [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \ @@ -854,7 +864,6 @@ const char *const ovl_xattr_table[][2] = { OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY), OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR), OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT), - OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS), }; int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index d64a306a414b..971892620504 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -151,7 +151,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, return -EOPNOTSUPP; ses = tcon->ses; - server = ses->server; + server = cifs_pick_channel(ses); cfids = tcon->cfids; if (!server->ops->new_lease_key) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 60027f5aebe8..3e4209f41c18 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -659,6 +659,7 @@ static ssize_t cifs_stats_proc_write(struct file *file, spin_lock(&tcon->stat_lock); tcon->bytes_read = 0; tcon->bytes_written = 0; + tcon->stats_from_time = ktime_get_real_seconds(); spin_unlock(&tcon->stat_lock); if (server->ops->clear_stats) server->ops->clear_stats(tcon); @@ -737,8 +738,9 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) seq_printf(m, "\n%d) %s", i, tcon->tree_name); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); - seq_printf(m, "\nSMBs: %d", - atomic_read(&tcon->num_smbs_sent)); + seq_printf(m, "\nSMBs: %d since %ptTs UTC", + atomic_read(&tcon->num_smbs_sent), + &tcon->stats_from_time); if (server->ops->print_stats) server->ops->print_stats(m, tcon); } diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 99b0ade833aa..e902de4e475a 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -430,7 +430,7 @@ static void cifs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_FSCACHE_WB) + if (inode->i_state & I_PINNING_NETFS_WB) cifs_fscache_unuse_inode_cookie(inode, true); cifs_fscache_release_inode_cookie(inode); clear_inode(inode); @@ -681,6 +681,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rasize=%u", cifs_sb->ctx->rasize); if (tcon->ses->server->min_offload) seq_printf(s, ",esize=%u", tcon->ses->server->min_offload); + if (tcon->ses->server->retrans) + seq_printf(s, ",retrans=%u", tcon->ses->server->retrans); seq_printf(s, ",echo_interval=%lu", tcon->ses->server->echo_interval / HZ); @@ -793,8 +795,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc) { - fscache_unpin_writeback(wbc, cifs_inode_cookie(inode)); - return 0; + return netfs_unpin_writeback(inode, wbc); } static int cifs_drop_inode(struct inode *inode) @@ -1222,7 +1223,7 @@ static int cifs_precopy_set_eof(struct inode *src_inode, struct cifsInodeInfo *s if (rc < 0) goto set_failed; - netfs_resize_file(&src_cifsi->netfs, src_end); + netfs_resize_file(&src_cifsi->netfs, src_end, true); fscache_resize_cookie(cifs_inode_cookie(src_inode), src_end); return 0; @@ -1353,7 +1354,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, smb_file_src, smb_file_target, off, len, destoff); if (rc == 0 && new_size > i_size_read(target_inode)) { truncate_setsize(target_inode, new_size); - netfs_resize_file(&target_cifsi->netfs, new_size); + netfs_resize_file(&target_cifsi->netfs, new_size, true); fscache_resize_cookie(cifs_inode_cookie(target_inode), new_size); } diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 879d5ef8a66e..20036fb16cec 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -204,6 +204,8 @@ struct cifs_open_info_data { }; } reparse; char *symlink_target; + struct cifs_sid posix_owner; + struct cifs_sid posix_group; union { struct smb2_file_all_info fi; struct smb311_posix_qinfo posix_fi; @@ -751,6 +753,7 @@ struct TCP_Server_Info { unsigned int max_read; unsigned int max_write; unsigned int min_offload; + unsigned int retrans; __le16 compress_algorithm; __u16 signing_algorithm; __le16 cipher_type; @@ -1207,6 +1210,7 @@ struct cifs_tcon { __u64 bytes_read; __u64 bytes_written; spinlock_t stat_lock; /* protects the two fields above */ + time64_t stats_from_time; FILE_SYSTEM_DEVICE_INFO fsDevInfo; FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ FILE_SYSTEM_UNIX_INFO fsUnixInfo; diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 3052a208c6ca..bfd568f89710 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1574,6 +1574,9 @@ static int match_server(struct TCP_Server_Info *server, if (server->min_offload != ctx->min_offload) return 0; + if (server->retrans != ctx->retrans) + return 0; + return 1; } @@ -1798,6 +1801,7 @@ smbd_connected: goto out_err_crypto_release; } tcp_ses->min_offload = ctx->min_offload; + tcp_ses->retrans = ctx->retrans; /* * at this point we are the only ones with the pointer * to the struct since the kernel thread not created yet diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 1b4262aff8fa..90da81d0372a 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -87,7 +87,7 @@ void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len continue; if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -120,7 +120,7 @@ void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len continue; if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -151,7 +151,7 @@ void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int le xas_for_each(&xas, folio, end) { if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -2651,7 +2651,7 @@ static void cifs_extend_writeback(struct address_space *mapping, continue; if (xa_is_value(folio)) break; - if (folio_index(folio) != index) + if (folio->index != index) break; if (!folio_try_get_rcu(folio)) { xas_reset(&xas); @@ -2899,7 +2899,7 @@ redo_folio: goto skip_write; } - if (folio_mapping(folio) != mapping || + if (folio->mapping != mapping || !folio_test_dirty(folio)) { start += folio_size(folio); folio_unlock(folio); @@ -5043,27 +5043,13 @@ static void cifs_swap_deactivate(struct file *file) /* do we need to unpin (or unlock) the file */ } -/* - * Mark a page as having been made dirty and thus needing writeback. We also - * need to pin the cache object to write back to. - */ -#ifdef CONFIG_CIFS_FSCACHE -static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - return fscache_dirty_folio(mapping, folio, - cifs_inode_cookie(mapping->host)); -} -#else -#define cifs_dirty_folio filemap_dirty_folio -#endif - const struct address_space_operations cifs_addr_ops = { .read_folio = cifs_read_folio, .readahead = cifs_readahead, .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .dirty_folio = cifs_dirty_folio, + .dirty_folio = netfs_dirty_folio, .release_folio = cifs_release_folio, .direct_IO = cifs_direct_io, .invalidate_folio = cifs_invalidate_folio, @@ -5087,7 +5073,7 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .dirty_folio = cifs_dirty_folio, + .dirty_folio = netfs_dirty_folio, .release_folio = cifs_release_folio, .invalidate_folio = cifs_invalidate_folio, .launder_folio = cifs_launder_folio, diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index a3493da12ad1..52cbef2eeb28 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -139,6 +139,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("dir_mode", Opt_dirmode), fsparam_u32("port", Opt_port), fsparam_u32("min_enc_offload", Opt_min_enc_offload), + fsparam_u32("retrans", Opt_retrans), fsparam_u32("esize", Opt_min_enc_offload), fsparam_u32("bsize", Opt_blocksize), fsparam_u32("rasize", Opt_rasize), @@ -1064,6 +1065,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_min_enc_offload: ctx->min_offload = result.uint_32; break; + case Opt_retrans: + ctx->retrans = result.uint_32; + break; case Opt_blocksize: /* * inode blocksize realistically should never need to be @@ -1619,6 +1623,8 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->backupuid_specified = false; /* no backup intent for a user */ ctx->backupgid_specified = false; /* no backup intent for a group */ + ctx->retrans = 1; + /* * short int override_uid = -1; * short int override_gid = -1; diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index cf46916286d0..182ce11cbe93 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -118,6 +118,7 @@ enum cifs_param { Opt_file_mode, Opt_dirmode, Opt_min_enc_offload, + Opt_retrans, Opt_blocksize, Opt_rasize, Opt_rsize, @@ -245,6 +246,7 @@ struct smb3_fs_context { unsigned int rsize; unsigned int wsize; unsigned int min_offload; + unsigned int retrans; bool sockopt_tcp_nodelay:1; /* attribute cache timemout for files and directories in jiffies */ unsigned long acregmax; diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c index e5cad149f5a2..c4a3cb736881 100644 --- a/fs/smb/client/fscache.c +++ b/fs/smb/client/fscache.c @@ -180,7 +180,7 @@ static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_ if (ret < 0) return ret; - ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), + ret = cres.ops->prepare_write(&cres, &start, &len, len, i_size_read(inode), no_space_allocated_yet); if (ret == 0) ret = fscache_write(&cres, start, &iter, NULL, NULL); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 9f37c1758f73..f0989484f2c6 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -665,8 +665,6 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, /* Fill a cifs_fattr struct with info from POSIX info struct */ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, - struct cifs_sid *owner, - struct cifs_sid *group, struct super_block *sb) { struct smb311_posix_qinfo *info = &data->posix_fi; @@ -722,8 +720,8 @@ out_reparse: fattr->cf_symlink_target = data->symlink_target; data->symlink_target = NULL; } - sid_to_id(cifs_sb, owner, fattr, SIDOWNER); - sid_to_id(cifs_sb, group, fattr, SIDGROUP); + sid_to_id(cifs_sb, &data->posix_owner, fattr, SIDOWNER); + sid_to_id(cifs_sb, &data->posix_group, fattr, SIDGROUP); cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n", fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); @@ -1070,9 +1068,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, - struct cifs_fattr *fattr, - struct cifs_sid *owner, - struct cifs_sid *group) + struct cifs_fattr *fattr) { struct TCP_Server_Info *server = tcon->ses->server; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -1117,7 +1113,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, } if (tcon->posix_extensions) - smb311_posix_info_to_fattr(fattr, data, owner, group, sb); + smb311_posix_info_to_fattr(fattr, data, sb); else cifs_open_info_to_fattr(fattr, data, sb); out: @@ -1171,8 +1167,7 @@ static int cifs_get_fattr(struct cifs_open_info_data *data, */ if (cifs_open_data_reparse(data)) { rc = reparse_info_to_fattr(data, sb, xid, tcon, - full_path, fattr, - NULL, NULL); + full_path, fattr); } else { cifs_open_info_to_fattr(fattr, data, sb); } @@ -1317,10 +1312,10 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data, const unsigned int xid) { struct cifs_open_info_data tmp_data = {}; + struct TCP_Server_Info *server; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon; struct tcon_link *tlink; - struct cifs_sid owner, group; int tmprc; int rc = 0; @@ -1328,14 +1323,14 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data, if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; /* * 1. Fetch file metadata if not provided (data) */ if (!data) { - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, - full_path, &tmp_data, - &owner, &group); + rc = server->ops->query_path_info(xid, tcon, cifs_sb, + full_path, &tmp_data); data = &tmp_data; } @@ -1347,11 +1342,9 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data, case 0: if (cifs_open_data_reparse(data)) { rc = reparse_info_to_fattr(data, sb, xid, tcon, - full_path, fattr, - &owner, &group); + full_path, fattr); } else { - smb311_posix_info_to_fattr(fattr, data, - &owner, &group, sb); + smb311_posix_info_to_fattr(fattr, data, sb); } break; case -EREMOTE: diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index c2137ea3c253..0748d7b757b9 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -140,6 +140,7 @@ tcon_info_alloc(bool dir_leases_enabled) spin_lock_init(&ret_buf->stat_lock); atomic_set(&ret_buf->num_local_opens, 0); atomic_set(&ret_buf->num_remote_opens, 0); + ret_buf->stats_from_time = ktime_get_real_seconds(); #ifdef CONFIG_CIFS_DFS_UPCALL INIT_LIST_HEAD(&ret_buf->dfs_ses_list); #endif diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 056cae1ddcce..94255401b38d 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -133,14 +133,14 @@ retry: * Query dir responses don't provide enough * information about reparse points other than * their reparse tags. Save an invalidation by - * not clobbering the existing mode, size and - * symlink target (if any) when reparse tag and - * ctime haven't changed. + * not clobbering some existing attributes when + * reparse tag and ctime haven't changed. */ rc = 0; if (fattr->cf_cifsattrs & ATTR_REPARSE) { if (likely(reparse_inode_match(inode, fattr))) { fattr->cf_mode = inode->i_mode; + fattr->cf_rdev = inode->i_rdev; fattr->cf_eof = CIFS_I(inode)->server_eof; fattr->cf_symlink_target = NULL; } else { @@ -645,10 +645,10 @@ static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode) static int is_dir_changed(struct file *file) { struct inode *inode = file_inode(file); - struct cifsInodeInfo *cifsInfo = CIFS_I(inode); + struct cifsInodeInfo *cifs_inode_info = CIFS_I(inode); - if (cifsInfo->time == 0) - return 1; /* directory was changed, perhaps due to unlink */ + if (cifs_inode_info->time == 0) + return 1; /* directory was changed, e.g. unlink or new file */ else return 0; diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 5053a5550abe..a652200540c8 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -56,6 +56,35 @@ static inline __u32 file_create_options(struct dentry *dentry) return 0; } +/* Parse owner and group from SMB3.1.1 POSIX query info */ +static int parse_posix_sids(struct cifs_open_info_data *data, + struct kvec *rsp_iov) +{ + struct smb2_query_info_rsp *qi = rsp_iov->iov_base; + unsigned int out_len = le32_to_cpu(qi->OutputBufferLength); + unsigned int qi_len = sizeof(data->posix_fi); + int owner_len, group_len; + u8 *sidsbuf, *sidsbuf_end; + + if (out_len <= qi_len) + return -EINVAL; + + sidsbuf = (u8 *)qi + le16_to_cpu(qi->OutputBufferOffset) + qi_len; + sidsbuf_end = sidsbuf + out_len - qi_len; + + owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end); + if (owner_len == -1) + return -EINVAL; + + memcpy(&data->posix_owner, sidsbuf, owner_len); + group_len = posix_info_sid_size(sidsbuf + owner_len, sidsbuf_end); + if (group_len == -1) + return -EINVAL; + + memcpy(&data->posix_group, sidsbuf + owner_len, group_len); + return 0; +} + /* * note: If cfile is passed, the reference to it is dropped here. * So make sure that you do not reuse cfile after return from this func. @@ -69,7 +98,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, __u32 desired_access, __u32 create_disposition, __u32 create_options, umode_t mode, struct kvec *in_iov, int *cmds, int num_cmds, struct cifsFileInfo *cfile, - __u8 **extbuf, size_t *extbuflen, struct kvec *out_iov, int *out_buftype) { @@ -494,21 +522,9 @@ finished: &rsp_iov[i + 1], sizeof(idata->posix_fi) /* add SIDs */, (char *)&idata->posix_fi); } - if (rc == 0) { - unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength); - - if (length > sizeof(idata->posix_fi)) { - char *base = (char *)rsp_iov[i + 1].iov_base + - le16_to_cpu(qi_rsp->OutputBufferOffset) + - sizeof(idata->posix_fi); - *extbuflen = length - sizeof(idata->posix_fi); - *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL); - if (!*extbuf) - rc = -ENOMEM; - } else { - rc = -EINVAL; - } - } + if (rc == 0) + rc = parse_posix_sids(idata, &rsp_iov[i + 1]); + SMB2_query_info_free(&rqst[num_rqst++]); if (rc) trace_smb3_posix_query_info_compound_err(xid, ses->Suid, @@ -662,7 +678,7 @@ int smb2_query_path_info(const unsigned int xid, struct smb2_hdr *hdr; struct kvec in_iov[2], out_iov[3] = {}; int out_buftype[3] = {}; - int cmds[2] = { SMB2_OP_QUERY_INFO, }; + int cmds[2]; bool islink; int i, num_cmds; int rc, rc2; @@ -670,20 +686,36 @@ int smb2_query_path_info(const unsigned int xid, data->adjust_tz = false; data->reparse_point = false; - if (strcmp(full_path, "")) - rc = -ENOENT; - else - rc = open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid); - /* If it is a root and its handle is cached then use it */ - if (!rc) { - if (cfid->file_all_info_is_valid) { - memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi)); + /* + * BB TODO: Add support for using cached root handle in SMB3.1.1 POSIX. + * Create SMB2_query_posix_info worker function to do non-compounded + * query when we already have an open file handle for this. For now this + * is fast enough (always using the compounded version). + */ + if (!tcon->posix_extensions) { + if (*full_path) { + rc = -ENOENT; } else { - rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid, - cfid->fid.volatile_fid, &data->fi); + rc = open_cached_dir(xid, tcon, full_path, + cifs_sb, false, &cfid); + } + /* If it is a root and its handle is cached then use it */ + if (!rc) { + if (cfid->file_all_info_is_valid) { + memcpy(&data->fi, &cfid->file_all_info, + sizeof(data->fi)); + } else { + rc = SMB2_query_info(xid, tcon, + cfid->fid.persistent_fid, + cfid->fid.volatile_fid, + &data->fi); + } + close_cached_dir(cfid); + return rc; } - close_cached_dir(cfid); - return rc; + cmds[0] = SMB2_OP_QUERY_INFO; + } else { + cmds[0] = SMB2_OP_POSIX_QUERY_INFO; } in_iov[0].iov_base = data; @@ -693,9 +725,8 @@ int smb2_query_path_info(const unsigned int xid, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, - in_iov, cmds, 1, cfile, - NULL, NULL, out_iov, out_buftype); + create_options, ACL_NO_MODE, in_iov, + cmds, 1, cfile, out_iov, out_buftype); hdr = out_iov[0].iov_base; /* * If first iov is unset, then SMB session was dropped or we've got a @@ -707,6 +738,10 @@ int smb2_query_path_info(const unsigned int xid, switch (rc) { case 0: case -EOPNOTSUPP: + /* + * BB TODO: When support for special files added to Samba + * re-verify this path. + */ rc = parse_create_response(data, cifs_sb, &out_iov[0]); if (rc || !data->reparse_point) goto out; @@ -722,8 +757,8 @@ int smb2_query_path_info(const unsigned int xid, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, in_iov, cmds, - num_cmds, cfile, NULL, NULL, NULL, NULL); + create_options, ACL_NO_MODE, in_iov, + cmds, num_cmds, cfile, NULL, NULL); break; case -EREMOTE: break; @@ -746,101 +781,6 @@ out: return rc; } -int smb311_posix_query_path_info(const unsigned int xid, - struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - const char *full_path, - struct cifs_open_info_data *data, - struct cifs_sid *owner, - struct cifs_sid *group) -{ - int rc; - __u32 create_options = 0; - struct cifsFileInfo *cfile; - struct kvec in_iov[2], out_iov[3] = {}; - int out_buftype[3] = {}; - __u8 *sidsbuf = NULL; - __u8 *sidsbuf_end = NULL; - size_t sidsbuflen = 0; - size_t owner_len, group_len; - int cmds[2] = { SMB2_OP_POSIX_QUERY_INFO, }; - int i, num_cmds; - - data->adjust_tz = false; - data->reparse_point = false; - - /* - * BB TODO: Add support for using the cached root handle. - * Create SMB2_query_posix_info worker function to do non-compounded query - * when we already have an open file handle for this. For now this is fast enough - * (always using the compounded version). - */ - in_iov[0].iov_base = data; - in_iov[0].iov_len = sizeof(*data); - in_iov[1] = in_iov[0]; - - cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, in_iov, cmds, 1, - cfile, &sidsbuf, &sidsbuflen, out_iov, out_buftype); - /* - * If first iov is unset, then SMB session was dropped or we've got a - * cached open file (@cfile). - */ - if (!out_iov[0].iov_base || out_buftype[0] == CIFS_NO_BUFFER) - goto out; - - switch (rc) { - case 0: - case -EOPNOTSUPP: - /* BB TODO: When support for special files added to Samba re-verify this path */ - rc = parse_create_response(data, cifs_sb, &out_iov[0]); - if (rc || !data->reparse_point) - goto out; - - if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) { - /* symlink already parsed in create response */ - num_cmds = 1; - } else { - cmds[1] = SMB2_OP_GET_REPARSE; - num_cmds = 2; - } - create_options |= OPEN_REPARSE_POINT; - cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, in_iov, cmds, - num_cmds, cfile, &sidsbuf, &sidsbuflen, NULL, NULL); - break; - } - -out: - if (rc == 0) { - sidsbuf_end = sidsbuf + sidsbuflen; - - owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end); - if (owner_len == -1) { - rc = -EINVAL; - goto out; - } - memcpy(owner, sidsbuf, owner_len); - - group_len = posix_info_sid_size( - sidsbuf + owner_len, sidsbuf_end); - if (group_len == -1) { - rc = -EINVAL; - goto out; - } - memcpy(group, sidsbuf + owner_len, group_len); - } - - kfree(sidsbuf); - for (i = 0; i < ARRAY_SIZE(out_buftype); i++) - free_rsp_buf(out_buftype[i], out_iov[i].iov_base); - return rc; -} - int smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, struct cifs_tcon *tcon, const char *name, @@ -848,9 +788,9 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, { return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, mode, NULL, - &(int){SMB2_OP_MKDIR}, 1, - NULL, NULL, NULL, NULL, NULL); + CREATE_NOT_FILE, mode, + NULL, &(int){SMB2_OP_MKDIR}, 1, + NULL, NULL, NULL); } void @@ -875,7 +815,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE, &in_iov, &(int){SMB2_OP_SET_INFO}, 1, - cfile, NULL, NULL, NULL, NULL); + cfile, NULL, NULL); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -887,8 +827,9 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, drop_cached_dir_by_name(xid, tcon, name, cifs_sb); return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_NOT_FILE, - ACL_NO_MODE, NULL, &(int){SMB2_OP_RMDIR}, 1, - NULL, NULL, NULL, NULL, NULL); + ACL_NO_MODE, NULL, + &(int){SMB2_OP_RMDIR}, 1, + NULL, NULL, NULL); } int @@ -897,8 +838,9 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE, NULL, &(int){SMB2_OP_DELETE}, 1, - NULL, NULL, NULL, NULL, NULL); + ACL_NO_MODE, NULL, + &(int){SMB2_OP_DELETE}, 1, + NULL, NULL, NULL); } static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, @@ -919,8 +861,8 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, in_iov.iov_base = smb2_to_name; in_iov.iov_len = 2 * UniStrnlen((wchar_t *)smb2_to_name, PATH_MAX); rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, create_options, ACL_NO_MODE, &in_iov, - &command, 1, cfile, NULL, NULL, NULL, NULL); + FILE_OPEN, create_options, ACL_NO_MODE, + &in_iov, &command, 1, cfile, NULL, NULL); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -971,7 +913,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, &in_iov, &(int){SMB2_OP_SET_EOF}, 1, - cfile, NULL, NULL, NULL, NULL); + cfile, NULL, NULL); } int @@ -999,8 +941,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path, rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, ACL_NO_MODE, &in_iov, - &(int){SMB2_OP_SET_INFO}, 1, cfile, - NULL, NULL, NULL, NULL); + &(int){SMB2_OP_SET_INFO}, 1, + cfile, NULL, NULL); cifs_put_tlink(tlink); return rc; } @@ -1035,7 +977,7 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, da, cd, co, ACL_NO_MODE, in_iov, - cmds, 2, cfile, NULL, NULL, NULL, NULL); + cmds, 2, cfile, NULL, NULL); if (!rc) { rc = smb311_posix_get_inode_info(&new, full_path, data, sb, xid); @@ -1045,7 +987,7 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, da, cd, co, ACL_NO_MODE, in_iov, - cmds, 2, cfile, NULL, NULL, NULL, NULL); + cmds, 2, cfile, NULL, NULL); if (!rc) { rc = cifs_get_inode_info(&new, full_path, data, sb, xid, NULL); @@ -1072,8 +1014,8 @@ int smb2_query_reparse_point(const unsigned int xid, rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, OPEN_REPARSE_POINT, ACL_NO_MODE, &in_iov, - &(int){SMB2_OP_GET_REPARSE}, 1, cfile, - NULL, NULL, NULL, NULL); + &(int){SMB2_OP_GET_REPARSE}, 1, + cfile, NULL, NULL); if (rc) goto out; diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c index 1a90dd78b238..ac1895358908 100644 --- a/fs/smb/client/smb2maperror.c +++ b/fs/smb/client/smb2maperror.c @@ -1210,6 +1210,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_INVALID_TASK_INDEX, -EIO, "STATUS_INVALID_TASK_INDEX"}, {STATUS_THREAD_ALREADY_IN_TASK, -EIO, "STATUS_THREAD_ALREADY_IN_TASK"}, {STATUS_CALLBACK_BYPASS, -EIO, "STATUS_CALLBACK_BYPASS"}, + {STATUS_SERVER_UNAVAILABLE, -EAGAIN, "STATUS_SERVER_UNAVAILABLE"}, + {STATUS_FILE_NOT_AVAILABLE, -EAGAIN, "STATUS_FILE_NOT_AVAILABLE"}, {STATUS_PORT_CLOSED, -EIO, "STATUS_PORT_CLOSED"}, {STATUS_MESSAGE_LOST, -EIO, "STATUS_MESSAGE_LOST"}, {STATUS_INVALID_MESSAGE, -EIO, "STATUS_INVALID_MESSAGE"}, diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 01a5bd7e6a30..d9553c2556a2 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -614,7 +614,8 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, "multichannel not available\n" "Empty network interface list returned by server %s\n", ses->server->hostname); - rc = -EINVAL; + rc = -EOPNOTSUPP; + ses->iface_last_update = jiffies; goto out; } @@ -712,7 +713,6 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, ses->iface_count++; spin_unlock(&ses->iface_lock); - ses->iface_last_update = jiffies; next_iface: nb_iface++; next = le32_to_cpu(p->Next); @@ -734,11 +734,7 @@ next_iface: if ((bytes_left > 8) || p->Next) cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); - - if (!ses->iface_count) { - rc = -EINVAL; - goto out; - } + ses->iface_last_update = jiffies; out: /* diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index bd25c34dc398..288199f0b987 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -156,6 +156,57 @@ out: return; } +/* helper function for code reuse */ +static int +cifs_chan_skip_or_disable(struct cifs_ses *ses, + struct TCP_Server_Info *server, + bool from_reconnect) +{ + struct TCP_Server_Info *pserver; + unsigned int chan_index; + + if (SERVER_IS_CHAN(server)) { + cifs_dbg(VFS, + "server %s does not support multichannel anymore. Skip secondary channel\n", + ses->server->hostname); + + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) { + spin_unlock(&ses->chan_lock); + goto skip_terminate; + } + + ses->chans[chan_index].server = NULL; + spin_unlock(&ses->chan_lock); + + /* + * the above reference of server by channel + * needs to be dropped without holding chan_lock + * as cifs_put_tcp_session takes a higher lock + * i.e. cifs_tcp_ses_lock + */ + cifs_put_tcp_session(server, from_reconnect); + + server->terminate = true; + cifs_signal_cifsd_for_reconnect(server, false); + + /* mark primary server as needing reconnect */ + pserver = server->primary_server; + cifs_signal_cifsd_for_reconnect(pserver, false); +skip_terminate: + mutex_unlock(&ses->session_mutex); + return -EHOSTDOWN; + } + + cifs_server_dbg(VFS, + "server does not support multichannel anymore. Disable all other channels\n"); + cifs_disable_secondary_channels(ses); + + + return 0; +} + static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct TCP_Server_Info *server, bool from_reconnect) @@ -164,8 +215,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct nls_table *nls_codepage = NULL; struct cifs_ses *ses; int xid; - struct TCP_Server_Info *pserver; - unsigned int chan_index; /* * SMB2s NegProt, SessSetup, Logoff do not have tcon yet so @@ -310,44 +359,11 @@ again: */ if (ses->chan_count > 1 && !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { - if (SERVER_IS_CHAN(server)) { - cifs_dbg(VFS, "server %s does not support " \ - "multichannel anymore. skipping secondary channel\n", - ses->server->hostname); - - spin_lock(&ses->chan_lock); - chan_index = cifs_ses_get_chan_index(ses, server); - if (chan_index == CIFS_INVAL_CHAN_INDEX) { - spin_unlock(&ses->chan_lock); - goto skip_terminate; - } - - ses->chans[chan_index].server = NULL; - spin_unlock(&ses->chan_lock); - - /* - * the above reference of server by channel - * needs to be dropped without holding chan_lock - * as cifs_put_tcp_session takes a higher lock - * i.e. cifs_tcp_ses_lock - */ - cifs_put_tcp_session(server, from_reconnect); - - server->terminate = true; - cifs_signal_cifsd_for_reconnect(server, false); - - /* mark primary server as needing reconnect */ - pserver = server->primary_server; - cifs_signal_cifsd_for_reconnect(pserver, false); - -skip_terminate: + rc = cifs_chan_skip_or_disable(ses, server, + from_reconnect); + if (rc) { mutex_unlock(&ses->session_mutex); - rc = -EHOSTDOWN; goto out; - } else { - cifs_server_dbg(VFS, "does not support " \ - "multichannel anymore. disabling all other channels\n"); - cifs_disable_secondary_channels(ses); } } @@ -395,20 +411,35 @@ skip_sess_setup: rc = SMB3_request_interfaces(xid, tcon, false); free_xid(xid); - if (rc) + if (rc == -EOPNOTSUPP) { + /* + * some servers like Azure SMB server do not advertise + * that multichannel has been disabled with server + * capabilities, rather return STATUS_NOT_IMPLEMENTED. + * treat this as server not supporting multichannel + */ + + rc = cifs_chan_skip_or_disable(ses, server, + from_reconnect); + goto skip_add_channels; + } else if (rc) cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", __func__, rc); if (ses->chan_max > ses->chan_count && + ses->iface_count && !SERVER_IS_CHAN(server)) { if (ses->chan_count == 1) cifs_server_dbg(VFS, "supports multichannel now\n"); cifs_try_adding_channels(ses); + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); } } else { mutex_unlock(&ses->session_mutex); } +skip_add_channels: if (smb2_command != SMB2_INTERNAL_CMD) mod_delayed_work(cifsiod_wq, &server->reconnect, 0); @@ -1958,10 +1989,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, __le16 *unc_path = NULL; int flags = 0; unsigned int total_len; - struct TCP_Server_Info *server; - - /* always use master channel */ - server = ses->server; + struct TCP_Server_Info *server = cifs_pick_channel(ses); cifs_dbg(FYI, "TCON\n"); @@ -2094,6 +2122,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) struct smb2_tree_disconnect_req *req; /* response is trivial */ int rc = 0; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); int flags = 0; unsigned int total_len; struct kvec iov[1]; @@ -2116,7 +2145,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) invalidate_all_cached_dirs(tcon); - rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server, + rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, server, (void **) &req, &total_len); if (rc) @@ -2134,7 +2163,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) rqst.rq_iov = iov; rqst.rq_nvec = 1; - rc = cifs_send_recv(xid, ses, ses->server, + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { @@ -2279,7 +2308,7 @@ int smb2_parse_contexts(struct TCP_Server_Info *server, noff = le16_to_cpu(cc->NameOffset); nlen = le16_to_cpu(cc->NameLength); - if (noff + nlen >= doff) + if (noff + nlen > doff) return -EINVAL; name = (char *)cc + noff; @@ -3918,7 +3947,7 @@ void smb2_reconnect_server(struct work_struct *work) struct cifs_ses *ses, *ses2; struct cifs_tcon *tcon, *tcon2; struct list_head tmp_list, tmp_ses_list; - bool tcon_exist = false, ses_exist = false; + bool ses_exist = false; bool tcon_selected = false; int rc; bool resched = false; @@ -3964,7 +3993,7 @@ void smb2_reconnect_server(struct work_struct *work) if (tcon->need_reconnect || tcon->need_reopen_files) { tcon->tc_count++; list_add_tail(&tcon->rlist, &tmp_list); - tcon_selected = tcon_exist = true; + tcon_selected = true; } } /* @@ -3973,7 +4002,7 @@ void smb2_reconnect_server(struct work_struct *work) */ if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) { list_add_tail(&ses->tcon_ipc->rlist, &tmp_list); - tcon_selected = tcon_exist = true; + tcon_selected = true; cifs_smb_ses_inc_refcount(ses); } /* diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 343ada691e76..0034b537b0b3 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -299,9 +299,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, - struct cifs_sid *owner, - struct cifs_sid *group); + struct cifs_open_info_data *data); int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); diff --git a/fs/smb/client/smb2status.h b/fs/smb/client/smb2status.h index a9e958166fc5..9c6d79b0bd49 100644 --- a/fs/smb/client/smb2status.h +++ b/fs/smb/client/smb2status.h @@ -982,6 +982,8 @@ struct ntstatus { #define STATUS_INVALID_TASK_INDEX cpu_to_le32(0xC0000501) #define STATUS_THREAD_ALREADY_IN_TASK cpu_to_le32(0xC0000502) #define STATUS_CALLBACK_BYPASS cpu_to_le32(0xC0000503) +#define STATUS_SERVER_UNAVAILABLE cpu_to_le32(0xC0000466) +#define STATUS_FILE_NOT_AVAILABLE cpu_to_le32(0xC0000467) #define STATUS_PORT_CLOSED cpu_to_le32(0xC0000700) #define STATUS_MESSAGE_LOST cpu_to_le32(0xC0000701) #define STATUS_INVALID_MESSAGE cpu_to_le32(0xC0000702) diff --git a/fs/smb/server/asn1.c b/fs/smb/server/asn1.c index 4a4b2b03ff33..b931a99ab9c8 100644 --- a/fs/smb/server/asn1.c +++ b/fs/smb/server/asn1.c @@ -214,10 +214,15 @@ static int ksmbd_neg_token_alloc(void *context, size_t hdrlen, { struct ksmbd_conn *conn = context; + if (!vlen) + return -EINVAL; + conn->mechToken = kmemdup_nul(value, vlen, GFP_KERNEL); if (!conn->mechToken) return -ENOMEM; + conn->mechTokenLen = (unsigned int)vlen; + return 0; } diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index d311c2ee10bd..09e1e7771592 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -416,13 +416,7 @@ static void stop_sessions(void) again: down_read(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) { - struct task_struct *task; - t = conn->transport; - task = t->handler; - if (task) - ksmbd_debug(CONN, "Stop session handler %s/%d\n", - task->comm, task_pid_nr(task)); ksmbd_conn_set_exiting(conn); if (t->ops->shutdown) { up_read(&conn_list_lock); diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 3c005246a32e..0e04cf8b1d89 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -88,6 +88,7 @@ struct ksmbd_conn { __u16 dialect; char *mechToken; + unsigned int mechTokenLen; struct ksmbd_conn_ops *conn_ops; @@ -134,7 +135,6 @@ struct ksmbd_transport_ops { struct ksmbd_transport { struct ksmbd_conn *conn; struct ksmbd_transport_ops *ops; - struct task_struct *handler; }; #define KSMBD_TCP_RECV_TIMEOUT (7 * HZ) diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 001926d3b348..53dfaac425c6 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -1197,6 +1197,12 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid, bool prev_op_has_lease; __le32 prev_op_state = 0; + /* Only v2 leases handle the directory */ + if (S_ISDIR(file_inode(fp->filp)->i_mode)) { + if (!lctx || lctx->version != 2) + return 0; + } + opinfo = alloc_opinfo(work, pid, tid); if (!opinfo) return -ENOMEM; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 3143819935dc..ba7a72a6a4f4 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1414,7 +1414,10 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn, char *name; unsigned int name_off, name_len, secbuf_len; - secbuf_len = le16_to_cpu(req->SecurityBufferLength); + if (conn->use_spnego && conn->mechToken) + secbuf_len = conn->mechTokenLen; + else + secbuf_len = le16_to_cpu(req->SecurityBufferLength); if (secbuf_len < sizeof(struct authenticate_message)) { ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len); return NULL; @@ -1505,7 +1508,10 @@ static int ntlm_authenticate(struct ksmbd_work *work, struct authenticate_message *authblob; authblob = user_authblob(conn, req); - sz = le16_to_cpu(req->SecurityBufferLength); + if (conn->use_spnego && conn->mechToken) + sz = conn->mechTokenLen; + else + sz = le16_to_cpu(req->SecurityBufferLength); rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, conn, sess); if (rc) { set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD); @@ -1778,8 +1784,7 @@ int smb2_sess_setup(struct ksmbd_work *work) negblob_off = le16_to_cpu(req->SecurityBufferOffset); negblob_len = le16_to_cpu(req->SecurityBufferLength); - if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer) || - negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) { + if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer)) { rc = -EINVAL; goto out_err; } @@ -1788,8 +1793,15 @@ int smb2_sess_setup(struct ksmbd_work *work) negblob_off); if (decode_negotiation_token(conn, negblob, negblob_len) == 0) { - if (conn->mechToken) + if (conn->mechToken) { negblob = (struct negotiate_message *)conn->mechToken; + negblob_len = conn->mechTokenLen; + } + } + + if (negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) { + rc = -EINVAL; + goto out_err; } if (server_conf.auth_mechs & conn->auth_mechs) { diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index c5629a68c8b7..8faa25c6e129 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -2039,6 +2039,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) { struct smb_direct_transport *t; + struct task_struct *handler; int ret; if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { @@ -2056,11 +2057,11 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) if (ret) goto out_err; - KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, - KSMBD_TRANS(t)->conn, "ksmbd:r%u", - smb_direct_port); - if (IS_ERR(KSMBD_TRANS(t)->handler)) { - ret = PTR_ERR(KSMBD_TRANS(t)->handler); + handler = kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, "ksmbd:r%u", + smb_direct_port); + if (IS_ERR(handler)) { + ret = PTR_ERR(handler); pr_err("Can't start thread\n"); goto out_err; } diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index eff7a1d793f0..9d4222154dcc 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -185,6 +185,7 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) struct sockaddr *csin; int rc = 0; struct tcp_transport *t; + struct task_struct *handler; t = alloc_transport(client_sk); if (!t) { @@ -199,13 +200,13 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) goto out_error; } - KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, - KSMBD_TRANS(t)->conn, - "ksmbd:%u", - ksmbd_tcp_get_port(csin)); - if (IS_ERR(KSMBD_TRANS(t)->handler)) { + handler = kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, + "ksmbd:%u", + ksmbd_tcp_get_port(csin)); + if (IS_ERR(handler)) { pr_err("cannot start conn thread\n"); - rc = PTR_ERR(KSMBD_TRANS(t)->handler); + rc = PTR_ERR(handler); free_transport(t); } return rc; diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 6795fda2af19..6b211522a13e 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -34,7 +34,15 @@ static DEFINE_MUTEX(eventfs_mutex); /* Choose something "unique" ;-) */ #define EVENTFS_FILE_INODE_INO 0x12c4e37 -#define EVENTFS_DIR_INODE_INO 0x134b2f5 + +/* Just try to make something consistent and unique */ +static int eventfs_dir_ino(struct eventfs_inode *ei) +{ + if (!ei->ino) + ei->ino = get_next_ino(); + + return ei->ino; +} /* * The eventfs_inode (ei) itself is protected by SRCU. It is released from @@ -396,7 +404,7 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent inode->i_fop = &eventfs_file_operations; /* All directories will have the same inode number */ - inode->i_ino = EVENTFS_DIR_INODE_INO; + inode->i_ino = eventfs_dir_ino(ei); ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; @@ -802,7 +810,7 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) name = ei_child->name; - ino = EVENTFS_DIR_INODE_INO; + ino = eventfs_dir_ino(ei_child); if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR)) goto out_dec; diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 12b7d0150ae9..45397df9bb65 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -55,6 +55,10 @@ struct eventfs_inode { struct eventfs_attr *entry_attrs; struct eventfs_attr attr; void *data; + unsigned int is_freed:1; + unsigned int is_events:1; + unsigned int nr_entries:30; + unsigned int ino; /* * Union - used for deletion * @llist: for calling dput() if needed after RCU @@ -64,9 +68,6 @@ struct eventfs_inode { struct llist_node llist; struct rcu_head rcu; }; - unsigned int is_freed:1; - unsigned int is_events:1; - unsigned int nr_entries:30; }; static inline struct tracefs_inode *get_tracefs(const struct inode *inode) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 98aaca933bdd..f362345467fa 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3277,7 +3277,7 @@ xfs_bmap_alloc_account( struct xfs_bmalloca *ap) { bool isrt = XFS_IS_REALTIME_INODE(ap->ip) && - (ap->flags & XFS_BMAPI_ATTRFORK); + !(ap->flags & XFS_BMAPI_ATTRFORK); uint fld; if (ap->flags & XFS_BMAPI_COWFORK) { |