diff options
Diffstat (limited to 'fs')
479 files changed, 11922 insertions, 36938 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 731e3d14b67d..0e8418066a48 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -42,6 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_free_inode(struct inode *inode); struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev); +void v9fs_set_netfs_context(struct inode *inode); int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t rdev); void v9fs_evict_inode(struct inode *inode); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 8a635999a7d6..047855033d32 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -19,12 +19,45 @@ #include <linux/netfs.h> #include <net/9p/9p.h> #include <net/9p/client.h> +#include <trace/events/netfs.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "cache.h" #include "fid.h" +static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq) +{ + struct p9_fid *fid = subreq->rreq->netfs_priv; + int err, len; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err); + netfs_write_subrequest_terminated(subreq, len ?: err, false); +} + +static void v9fs_upload_to_server_worker(struct work_struct *work) +{ + struct netfs_io_subrequest *subreq = + container_of(work, struct netfs_io_subrequest, work); + + v9fs_upload_to_server(subreq); +} + +/* + * Set up write requests for a writeback slice. We need to add a write request + * for each write we want to make. 
+ */ +static void v9fs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len) +{ + struct netfs_io_subrequest *subreq; + + subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER, + start, len, v9fs_upload_to_server_worker); + if (subreq) + netfs_queue_write_request(subreq); +} + /** * v9fs_issue_read - Issue a read from 9P * @subreq: The read to make @@ -33,14 +66,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct p9_fid *fid = rreq->netfs_priv; - struct iov_iter to; - loff_t pos = subreq->start + subreq->transferred; - size_t len = subreq->len - subreq->transferred; int total, err; - iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len); - - total = p9_client_read(fid, pos, &to, &err); + total = p9_client_read(fid, subreq->start + subreq->transferred, + &subreq->io_iter, &err); /* if we just extended the file size, any portion not in * cache won't be on server and is zeroes */ @@ -50,25 +79,42 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) } /** - * v9fs_init_request - Initialise a read request + * v9fs_init_request - Initialise a request * @rreq: The read request * @file: The file being read from */ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { - struct p9_fid *fid = file->private_data; - - BUG_ON(!fid); + struct p9_fid *fid; + bool writing = (rreq->origin == NETFS_READ_FOR_WRITE || + rreq->origin == NETFS_WRITEBACK || + rreq->origin == NETFS_WRITETHROUGH || + rreq->origin == NETFS_LAUNDER_WRITE || + rreq->origin == NETFS_UNBUFFERED_WRITE || + rreq->origin == NETFS_DIO_WRITE); + + if (file) { + fid = file->private_data; + if (!fid) + goto no_fid; + p9_fid_get(fid); + } else { + fid = v9fs_fid_find_inode(rreq->inode, writing, INVALID_UID, true); + if (!fid) + goto no_fid; + } /* we might need to read from a fid that was opened write-only * for read-modify-write of page cache, use the 
writeback fid * for that */ - WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && - !(fid->mode & P9_ORDWR)); - - p9_fid_get(fid); + WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && !(fid->mode & P9_ORDWR)); rreq->netfs_priv = fid; return 0; + +no_fid: + WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n", + rreq->inode->i_ino); + return -EINVAL; } /** @@ -82,281 +128,20 @@ static void v9fs_free_request(struct netfs_io_request *rreq) p9_fid_put(fid); } -/** - * v9fs_begin_cache_operation - Begin a cache operation for a read - * @rreq: The read request - */ -static int v9fs_begin_cache_operation(struct netfs_io_request *rreq) -{ -#ifdef CONFIG_9P_FSCACHE - struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode)); - - return fscache_begin_read_operation(&rreq->cache_resources, cookie); -#else - return -ENOBUFS; -#endif -} - const struct netfs_request_ops v9fs_req_ops = { .init_request = v9fs_init_request, .free_request = v9fs_free_request, - .begin_cache_operation = v9fs_begin_cache_operation, .issue_read = v9fs_issue_read, + .create_write_requests = v9fs_create_write_requests, }; -/** - * v9fs_release_folio - release the private state associated with a folio - * @folio: The folio to be released - * @gfp: The caller's allocation restrictions - * - * Returns true if the page can be released, false otherwise. 
- */ - -static bool v9fs_release_folio(struct folio *folio, gfp_t gfp) -{ - if (folio_test_private(folio)) - return false; -#ifdef CONFIG_9P_FSCACHE - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - fscache_note_page_release(v9fs_inode_cookie(V9FS_I(folio_inode(folio)))); -#endif - return true; -} - -static void v9fs_invalidate_folio(struct folio *folio, size_t offset, - size_t length) -{ - folio_wait_fscache(folio); -} - -#ifdef CONFIG_9P_FSCACHE -static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct v9fs_inode *v9inode = priv; - __le32 version; - - if (IS_ERR_VALUE(transferred_or_error) && - transferred_or_error != -ENOBUFS) { - version = cpu_to_le32(v9inode->qid.version); - fscache_invalidate(v9fs_inode_cookie(v9inode), &version, - i_size_read(&v9inode->netfs.inode), 0); - } -} -#endif - -static int v9fs_vfs_write_folio_locked(struct folio *folio) -{ - struct inode *inode = folio_inode(folio); - loff_t start = folio_pos(folio); - loff_t i_size = i_size_read(inode); - struct iov_iter from; - size_t len = folio_size(folio); - struct p9_fid *writeback_fid; - int err; - struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode); - struct fscache_cookie __maybe_unused *cookie = v9fs_inode_cookie(v9inode); - - if (start >= i_size) - return 0; /* Simultaneous truncation occurred */ - - len = min_t(loff_t, i_size - start, len); - - iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len); - - writeback_fid = v9fs_fid_find_inode(inode, true, INVALID_UID, true); - if (!writeback_fid) { - WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n", - inode->i_private); - return -EINVAL; - } - - folio_wait_fscache(folio); - folio_start_writeback(folio); - - p9_client_write(writeback_fid, start, &from, &err); - -#ifdef CONFIG_9P_FSCACHE - if (err == 0 && - fscache_cookie_enabled(cookie) && - 
test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) { - folio_start_fscache(folio); - fscache_write_to_cache(v9fs_inode_cookie(v9inode), - folio_mapping(folio), start, len, i_size, - v9fs_write_to_cache_done, v9inode, - true); - } -#endif - - folio_end_writeback(folio); - p9_fid_put(writeback_fid); - - return err; -} - -static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - int retval; - - p9_debug(P9_DEBUG_VFS, "folio %p\n", folio); - - retval = v9fs_vfs_write_folio_locked(folio); - if (retval < 0) { - if (retval == -EAGAIN) { - folio_redirty_for_writepage(wbc, folio); - retval = 0; - } else { - mapping_set_error(folio_mapping(folio), retval); - } - } else - retval = 0; - - folio_unlock(folio); - return retval; -} - -static int v9fs_launder_folio(struct folio *folio) -{ - int retval; - - if (folio_clear_dirty_for_io(folio)) { - retval = v9fs_vfs_write_folio_locked(folio); - if (retval) - return retval; - } - folio_wait_fscache(folio); - return 0; -} - -/** - * v9fs_direct_IO - 9P address space operation for direct I/O - * @iocb: target I/O control block - * @iter: The data/buffer to use - * - * The presence of v9fs_direct_IO() in the address space ops vector - * allowes open() O_DIRECT flags which would have failed otherwise. - * - * In the non-cached mode, we shunt off direct read and write requests before - * the VFS gets them, so this method should never be called. - * - * Direct IO is not 'yet' supported in the cached mode. Hence when - * this routine is called through generic_file_aio_read(), the read/write fails - * with an error. 
- * - */ -static ssize_t -v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - loff_t pos = iocb->ki_pos; - ssize_t n; - int err = 0; - - if (iov_iter_rw(iter) == WRITE) { - n = p9_client_write(file->private_data, pos, iter, &err); - if (n) { - struct inode *inode = file_inode(file); - loff_t i_size = i_size_read(inode); - - if (pos + n > i_size) - inode_add_bytes(inode, pos + n - i_size); - } - } else { - n = p9_client_read(file->private_data, pos, iter, &err); - } - return n ? n : err; -} - -static int v9fs_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned int len, - struct page **subpagep, void **fsdata) -{ - int retval; - struct folio *folio; - struct v9fs_inode *v9inode = V9FS_I(mapping->host); - - p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - - /* Prefetch area to be written into the cache if we're caching this - * file. We need to do this before we get a lock on the page in case - * there's more than one writer competing for the same cache block. - */ - retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata); - if (retval < 0) - return retval; - - *subpagep = &folio->page; - return retval; -} - -static int v9fs_write_end(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int copied, - struct page *subpage, void *fsdata) -{ - loff_t last_pos = pos + copied; - struct folio *folio = page_folio(subpage); - struct inode *inode = mapping->host; - - p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - - if (!folio_test_uptodate(folio)) { - if (unlikely(copied < len)) { - copied = 0; - goto out; - } - - folio_mark_uptodate(folio); - } - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold the i_mutex. 
- */ - if (last_pos > inode->i_size) { - inode_add_bytes(inode, last_pos - inode->i_size); - i_size_write(inode, last_pos); -#ifdef CONFIG_9P_FSCACHE - fscache_update_cookie(v9fs_inode_cookie(V9FS_I(inode)), NULL, - &last_pos); -#endif - } - folio_mark_dirty(folio); -out: - folio_unlock(folio); - folio_put(folio); - - return copied; -} - -#ifdef CONFIG_9P_FSCACHE -/* - * Mark a page as having been made dirty and thus needing writeback. We also - * need to pin the cache object to write back to. - */ -static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - struct v9fs_inode *v9inode = V9FS_I(mapping->host); - - return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode)); -} -#else -#define v9fs_dirty_folio filemap_dirty_folio -#endif - const struct address_space_operations v9fs_addr_operations = { - .read_folio = netfs_read_folio, - .readahead = netfs_readahead, - .dirty_folio = v9fs_dirty_folio, - .writepage = v9fs_vfs_writepage, - .write_begin = v9fs_write_begin, - .write_end = v9fs_write_end, - .release_folio = v9fs_release_folio, - .invalidate_folio = v9fs_invalidate_folio, - .launder_folio = v9fs_launder_folio, - .direct_IO = v9fs_direct_IO, + .read_folio = netfs_read_folio, + .readahead = netfs_readahead, + .dirty_folio = netfs_dirty_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, + .launder_folio = netfs_launder_folio, + .direct_IO = noop_direct_IO, + .writepages = netfs_writepages, }; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 11cd8d23f6f2..abdbbaee5184 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -107,7 +107,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); - if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); 
invalidate_mapping_pages(&inode->i_data, 0, -1); } @@ -121,13 +121,12 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) struct p9_fid *fid; uint8_t status = P9_LOCK_ERROR; int res = 0; - unsigned char fl_type; struct v9fs_session_info *v9ses; fid = filp->private_data; BUG_ON(fid == NULL); - BUG_ON((fl->fl_flags & FL_POSIX) != FL_POSIX); + BUG_ON((fl->c.flc_flags & FL_POSIX) != FL_POSIX); res = locks_lock_file_wait(filp, fl); if (res < 0) @@ -136,7 +135,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) /* convert posix lock to p9 tlock args */ memset(&flock, 0, sizeof(flock)); /* map the lock type */ - switch (fl->fl_type) { + switch (fl->c.flc_type) { case F_RDLCK: flock.type = P9_LOCK_TYPE_RDLCK; break; @@ -152,7 +151,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) flock.length = 0; else flock.length = fl->fl_end - fl->fl_start + 1; - flock.proc_id = fl->fl_pid; + flock.proc_id = fl->c.flc_pid; flock.client_id = fid->clnt->name; if (IS_SETLKW(cmd)) flock.flags = P9_LOCK_FLAGS_BLOCK; @@ -207,12 +206,13 @@ out_unlock: * incase server returned error for lock request, revert * it locally */ - if (res < 0 && fl->fl_type != F_UNLCK) { - fl_type = fl->fl_type; - fl->fl_type = F_UNLCK; + if (res < 0 && fl->c.flc_type != F_UNLCK) { + unsigned char type = fl->c.flc_type; + + fl->c.flc_type = F_UNLCK; /* Even if this fails we want to return the remote error */ locks_lock_file_wait(filp, fl); - fl->fl_type = fl_type; + fl->c.flc_type = type; } if (flock.client_id != fid->clnt->name) kfree(flock.client_id); @@ -234,7 +234,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) * if we have a conflicting lock locally, no need to validate * with server */ - if (fl->fl_type != F_UNLCK) + if (fl->c.flc_type != F_UNLCK) return res; /* convert posix lock to p9 tgetlock args */ @@ -245,7 +245,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock 
*fl) glock.length = 0; else glock.length = fl->fl_end - fl->fl_start + 1; - glock.proc_id = fl->fl_pid; + glock.proc_id = fl->c.flc_pid; glock.client_id = fid->clnt->name; res = p9_client_getlock_dotl(fid, &glock); @@ -254,13 +254,13 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) /* map 9p lock type to os lock type */ switch (glock.type) { case P9_LOCK_TYPE_RDLCK: - fl->fl_type = F_RDLCK; + fl->c.flc_type = F_RDLCK; break; case P9_LOCK_TYPE_WRLCK: - fl->fl_type = F_WRLCK; + fl->c.flc_type = F_WRLCK; break; case P9_LOCK_TYPE_UNLCK: - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; break; } if (glock.type != P9_LOCK_TYPE_UNLCK) { @@ -269,7 +269,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) fl->fl_end = OFFSET_MAX; else fl->fl_end = glock.start + glock.length - 1; - fl->fl_pid = -glock.proc_id; + fl->c.flc_pid = -glock.proc_id; } out: if (glock.client_id != fid->clnt->name) @@ -293,7 +293,7 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", filp, cmd, fl, filp); - if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); } @@ -324,16 +324,16 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd, p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", filp, cmd, fl, filp); - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) goto out_err; - if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); } /* Convert flock to posix lock */ - fl->fl_flags |= FL_POSIX; - fl->fl_flags ^= FL_FLOCK; + fl->c.flc_flags |= FL_POSIX; + fl->c.flc_flags ^= FL_FLOCK; 
if (IS_SETLK(cmd) | IS_SETLKW(cmd)) ret = v9fs_file_do_lock(filp, cmd, fl); @@ -353,25 +353,15 @@ static ssize_t v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct p9_fid *fid = iocb->ki_filp->private_data; - int ret, err = 0; p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n", fid->fid, iov_iter_count(to), iocb->ki_pos); - if (!(fid->mode & P9L_DIRECT)) { - p9_debug(P9_DEBUG_VFS, "(cached)\n"); - return generic_file_read_iter(iocb, to); - } - - if (iocb->ki_filp->f_flags & O_NONBLOCK) - ret = p9_client_read_once(fid, iocb->ki_pos, to, &err); - else - ret = p9_client_read(fid, iocb->ki_pos, to, &err); - if (!ret) - return err; + if (fid->mode & P9L_DIRECT) + return netfs_unbuffered_read_iter(iocb, to); - iocb->ki_pos += ret; - return ret; + p9_debug(P9_DEBUG_VFS, "(cached)\n"); + return netfs_file_read_iter(iocb, to); } /* @@ -407,46 +397,14 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct p9_fid *fid = file->private_data; - ssize_t retval; - loff_t origin; - int err = 0; p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid); - if (!(fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))) { - p9_debug(P9_DEBUG_CACHE, "(cached)\n"); - return generic_file_write_iter(iocb, from); - } - - retval = generic_write_checks(iocb, from); - if (retval <= 0) - return retval; + if (fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE)) + return netfs_unbuffered_write_iter(iocb, from); - origin = iocb->ki_pos; - retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err); - if (retval > 0) { - struct inode *inode = file_inode(file); - loff_t i_size; - unsigned long pg_start, pg_end; - - pg_start = origin >> PAGE_SHIFT; - pg_end = (origin + retval - 1) >> PAGE_SHIFT; - if (inode->i_mapping && inode->i_mapping->nrpages) - invalidate_inode_pages2_range(inode->i_mapping, - pg_start, pg_end); - iocb->ki_pos += retval; - i_size = i_size_read(inode); - if (iocb->ki_pos > i_size) { - inode_add_bytes(inode, iocb->ki_pos - 
i_size); - /* - * Need to serialize against i_size_write() in - * v9fs_stat2inode() - */ - v9fs_i_size_write(inode, iocb->ki_pos); - } - return retval; - } - return err; + p9_debug(P9_DEBUG_CACHE, "(cached)\n"); + return netfs_file_write_iter(iocb, from); } static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, @@ -519,36 +477,7 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) static vm_fault_t v9fs_vm_page_mkwrite(struct vm_fault *vmf) { - struct folio *folio = page_folio(vmf->page); - struct file *filp = vmf->vma->vm_file; - struct inode *inode = file_inode(filp); - - - p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n", - folio, (unsigned long)filp->private_data); - - /* Wait for the page to be written to the cache before we allow it to - * be modified. We then assume the entire page will need writing back. - */ -#ifdef CONFIG_9P_FSCACHE - if (folio_test_fscache(folio) && - folio_wait_fscache_killable(folio) < 0) - return VM_FAULT_NOPAGE; -#endif - - /* Update file times before taking page lock */ - file_update_time(filp); - - if (folio_lock_killable(folio) < 0) - return VM_FAULT_RETRY; - if (folio_mapping(folio) != inode->i_mapping) - goto out_unlock; - folio_wait_stable(folio); - - return VM_FAULT_LOCKED; -out_unlock: - folio_unlock(folio); - return VM_FAULT_NOPAGE; + return netfs_page_mkwrite(vmf, NULL); } static void v9fs_mmap_vm_close(struct vm_area_struct *vma) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b845ee18a80b..32572982f72e 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -246,10 +246,10 @@ void v9fs_free_inode(struct inode *inode) /* * Set parameters for the netfs library */ -static void v9fs_set_netfs_context(struct inode *inode) +void v9fs_set_netfs_context(struct inode *inode) { struct v9fs_inode *v9inode = V9FS_I(inode); - netfs_inode_init(&v9inode->netfs, &v9fs_req_ops); + netfs_inode_init(&v9inode->netfs, &v9fs_req_ops, true); } int v9fs_init_inode(struct v9fs_session_info *v9ses, @@ -326,8 
+326,6 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, err = -EINVAL; goto error; } - - v9fs_set_netfs_context(inode); error: return err; @@ -359,6 +357,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) iput(inode); return ERR_PTR(err); } + v9fs_set_netfs_context(inode); return inode; } @@ -374,11 +373,8 @@ void v9fs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); -#ifdef CONFIG_9P_FSCACHE version = cpu_to_le32(v9inode->qid.version); - fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode, - &version); -#endif + netfs_clear_inode_writeback(inode, &version); clear_inode(inode); filemap_fdatawrite(&inode->i_data); @@ -464,6 +460,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, goto error; v9fs_stat2inode(st, inode, sb, 0); + v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); unlock_new_inode(inode); return inode; @@ -1113,7 +1110,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap, if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); - truncate_pagecache(inode, iattr->ia_size); + netfs_resize_file(netfs_inode(inode), iattr->ia_size, true); #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) { @@ -1181,6 +1178,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; + v9inode->netfs.remote_i_size = stat->length; if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) v9fs_i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... 
*/ diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index c7319af2f471..3505227e1704 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -128,6 +128,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, goto error; v9fs_stat2inode_dotl(st, inode, 0); + v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); if (retval) @@ -598,7 +599,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); - truncate_pagecache(inode, iattr->ia_size); + netfs_resize_file(netfs_inode(inode), iattr->ia_size, true); #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) @@ -655,6 +656,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; + v9inode->netfs.remote_i_size = stat->st_size; if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) v9fs_i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; @@ -683,8 +685,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, inode->i_mode = mode; } if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && - stat->st_result_mask & P9_STATS_SIZE) + stat->st_result_mask & P9_STATS_SIZE) { + v9inode->netfs.remote_i_size = stat->st_size; v9fs_i_size_write(inode, stat->st_size); + } if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 73db55c050bf..941f7d0e0bfa 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -289,31 +289,21 @@ static int v9fs_drop_inode(struct inode *inode) static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct v9fs_inode *v9inode; - /* * send an fsync request to server irrespective of * wbc->sync_mode. 
*/ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); - - v9inode = V9FS_I(inode); - fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); - - return 0; + return netfs_unpin_writeback(inode, wbc); } static int v9fs_write_inode_dotl(struct inode *inode, struct writeback_control *wbc) { - struct v9fs_inode *v9inode; - v9inode = V9FS_I(inode); p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); - fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); - - return 0; + return netfs_unpin_writeback(inode, wbc); } static const struct super_operations v9fs_super_ops = { diff --git a/fs/Kconfig b/fs/Kconfig index a3159831ba98..4bc7dd420874 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -144,7 +144,6 @@ source "fs/overlayfs/Kconfig" menu "Caches" source "fs/netfs/Kconfig" -source "fs/fscache/Kconfig" source "fs/cachefiles/Kconfig" endmenu @@ -163,7 +162,6 @@ menu "DOS/FAT/EXFAT/NT Filesystems" source "fs/fat/Kconfig" source "fs/exfat/Kconfig" -source "fs/ntfs/Kconfig" source "fs/ntfs3/Kconfig" endmenu @@ -175,6 +173,13 @@ source "fs/proc/Kconfig" source "fs/kernfs/Kconfig" source "fs/sysfs/Kconfig" +config FS_PID + bool "Pseudo filesystem for process file descriptors" + depends on 64BIT + default y + help + Pidfs implements advanced features for process file descriptors. 
+ config TMPFS bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM diff --git a/fs/Makefile b/fs/Makefile index a6962c588962..6ecc9b0a53f2 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -15,7 +15,7 @@ obj-y := open.o read_write.o file_table.o super.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ - kernel_read_file.o mnt_idmapping.o remap_range.o + kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o @@ -61,7 +61,6 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_NETFS_SUPPORT) += netfs/ -obj-$(CONFIG_FSCACHE) += fscache/ obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT4_FS) += ext4/ # We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the @@ -92,7 +91,6 @@ obj-y += unicode/ obj-$(CONFIG_SYSV_FS) += sysv/ obj-$(CONFIG_SMBFS) += smb/ obj-$(CONFIG_HPFS_FS) += hpfs/ -obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_NTFS3_FS) += ntfs3/ obj-$(CONFIG_UFS_FS) += ufs/ obj-$(CONFIG_EFS_FS) += efs/ diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 60685ec76d98..2e612834329a 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -105,6 +105,7 @@ struct affs_sb_info { int work_queued; /* non-zero delayed work is queued */ struct delayed_work sb_work; /* superblock flush delayed work */ spinlock_t work_lock; /* protects sb_work and work_queued */ + struct rcu_head rcu; }; #define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. 
*/ diff --git a/fs/affs/super.c b/fs/affs/super.c index 58b391446ae1..b56a95cf414a 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -640,7 +640,7 @@ static void affs_kill_sb(struct super_block *sb) affs_brelse(sbi->s_root_bh); kfree(sbi->s_prefix); mutex_destroy(&sbi->s_bmlock); - kfree(sbi); + kfree_rcu(sbi, rcu); } } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index c14533ef108f..8a67fc427e74 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -124,7 +124,7 @@ static void afs_dir_read_cleanup(struct afs_read *req) if (xas_retry(&xas, folio)) continue; BUG_ON(xa_is_value(folio)); - ASSERTCMP(folio_file_mapping(folio), ==, mapping); + ASSERTCMP(folio->mapping, ==, mapping); folio_put(folio); } @@ -202,12 +202,12 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) if (xas_retry(&xas, folio)) continue; - BUG_ON(folio_file_mapping(folio) != mapping); + BUG_ON(folio->mapping != mapping); size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio)); for (offset = 0; offset < size; offset += sizeof(*block)) { block = kmap_local_folio(folio, offset); - pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block); + pr_warn("[%02lx] %32phN\n", folio->index + offset, block); kunmap_local(block); } } @@ -233,7 +233,7 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) if (xas_retry(&xas, folio)) continue; - BUG_ON(folio_file_mapping(folio) != mapping); + BUG_ON(folio->mapping != mapping); if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) { afs_dir_dump(dvnode, req); @@ -474,6 +474,16 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, continue; } + /* Don't expose silly rename entries to userspace. */ + if (nlen > 6 && + dire->u.name[0] == '.' 
&& + ctx->actor != afs_lookup_filldir && + ctx->actor != afs_lookup_one_filldir && + memcmp(dire->u.name, ".__afs", 6) == 0) { + ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); + continue; + } + /* found the next entry */ if (!dir_emit(ctx, dire->u.name, nlen, ntohl(dire->u.vnode), @@ -708,6 +718,8 @@ static void afs_do_lookup_success(struct afs_operation *op) break; } + if (vp->scb.status.abort_code) + trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code); if (!vp->scb.have_status && !vp->scb.have_error) continue; @@ -897,12 +909,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, afs_begin_vnode_operation(op); afs_wait_for_operation(op); } - inode = ERR_PTR(afs_op_error(op)); out_op: if (!afs_op_error(op)) { - inode = &op->file[1].vnode->netfs.inode; - op->file[1].vnode = NULL; + if (op->file[1].scb.status.abort_code) { + afs_op_accumulate_error(op, -ECONNABORTED, + op->file[1].scb.status.abort_code); + } else { + inode = &op->file[1].vnode->netfs.inode; + op->file[1].vnode = NULL; + } } if (op->file[0].scb.have_status) @@ -2022,7 +2038,7 @@ static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags) { struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio)); + _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index); folio_detach_private(folio); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 2cd40ba601f1..c4d2711e20ad 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) /* there shouldn't be an existing inode */ BUG_ON(!(inode->i_state & I_NEW)); - netfs_inode_init(&vnode->netfs, NULL); + netfs_inode_init(&vnode->netfs, NULL, false); inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { @@ -258,16 +258,7 @@ const struct inode_operations afs_dynroot_inode_operations = { 
.lookup = afs_dynroot_lookup, }; -/* - * Dirs in the dynamic root don't need revalidation. - */ -static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags) -{ - return 1; -} - const struct dentry_operations afs_dynroot_dentry_operations = { - .d_revalidate = afs_dynroot_d_revalidate, .d_delete = always_delete_dentry, .d_release = afs_d_release, .d_automount = afs_d_automount, diff --git a/fs/afs/file.c b/fs/afs/file.c index 30914e0d9cb2..ef2cc8f565d2 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -20,9 +20,6 @@ static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); static int afs_symlink_read_folio(struct file *file, struct folio *folio); -static void afs_invalidate_folio(struct folio *folio, size_t offset, - size_t length); -static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, @@ -37,7 +34,7 @@ const struct file_operations afs_file_operations = { .release = afs_release, .llseek = generic_file_llseek, .read_iter = afs_file_read_iter, - .write_iter = afs_file_write, + .write_iter = netfs_file_write_iter, .mmap = afs_file_mmap, .splice_read = afs_file_splice_read, .splice_write = iter_file_splice_write, @@ -53,22 +50,21 @@ const struct inode_operations afs_file_inode_operations = { }; const struct address_space_operations afs_file_aops = { + .direct_IO = noop_direct_IO, .read_folio = netfs_read_folio, .readahead = netfs_readahead, - .dirty_folio = afs_dirty_folio, - .launder_folio = afs_launder_folio, - .release_folio = afs_release_folio, - .invalidate_folio = afs_invalidate_folio, - .write_begin = afs_write_begin, - .write_end = afs_write_end, - .writepages = afs_writepages, + .dirty_folio = netfs_dirty_folio, + .launder_folio = netfs_launder_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, .migrate_folio = 
filemap_migrate_folio, + .writepages = afs_writepages, }; const struct address_space_operations afs_symlink_aops = { .read_folio = afs_symlink_read_folio, - .release_folio = afs_release_folio, - .invalidate_folio = afs_invalidate_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, .migrate_folio = filemap_migrate_folio, }; @@ -323,11 +319,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) fsreq->len = subreq->len - subreq->transferred; fsreq->key = key_get(subreq->rreq->netfs_priv); fsreq->vnode = vnode; - fsreq->iter = &fsreq->def_iter; - - iov_iter_xarray(&fsreq->def_iter, ITER_DEST, - &fsreq->vnode->netfs.inode.i_mapping->i_pages, - fsreq->pos, fsreq->len); + fsreq->iter = &subreq->io_iter; afs_fetch_data(fsreq->vnode, fsreq); afs_put_read(fsreq); @@ -359,22 +351,13 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio) static int afs_init_request(struct netfs_io_request *rreq, struct file *file) { - rreq->netfs_priv = key_get(afs_file_key(file)); + if (file) + rreq->netfs_priv = key_get(afs_file_key(file)); + rreq->rsize = 256 * 1024; + rreq->wsize = 256 * 1024; return 0; } -static int afs_begin_cache_operation(struct netfs_io_request *rreq) -{ -#ifdef CONFIG_AFS_FSCACHE - struct afs_vnode *vnode = AFS_FS_I(rreq->inode); - - return fscache_begin_read_operation(&rreq->cache_resources, - afs_vnode_cache(vnode)); -#else - return -ENOBUFS; -#endif -} - static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, struct folio **foliop, void **_fsdata) { @@ -388,128 +371,37 @@ static void afs_free_request(struct netfs_io_request *rreq) key_put(rreq->netfs_priv); } -const struct netfs_request_ops afs_req_ops = { - .init_request = afs_init_request, - .free_request = afs_free_request, - .begin_cache_operation = afs_begin_cache_operation, - .check_write_begin = afs_check_write_begin, - .issue_read = afs_issue_read, -}; - -int afs_write_inode(struct inode *inode, struct 
writeback_control *wbc) +static void afs_update_i_size(struct inode *inode, loff_t new_i_size) { - fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode))); - return 0; -} - -/* - * Adjust the dirty region of the page on truncation or full invalidation, - * getting rid of the markers altogether if the region is entirely invalidated. - */ -static void afs_invalidate_dirty(struct folio *folio, size_t offset, - size_t length) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - unsigned long priv; - unsigned int f, t, end = offset + length; - - priv = (unsigned long)folio_get_private(folio); - - /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == folio_size(folio)) - goto full_invalidate; - - /* If the page was dirtied by page_mkwrite(), the PTE stays writable - * and we don't get another notification to tell us to expand it - * again. - */ - if (afs_is_folio_dirty_mmapped(priv)) - return; - - /* We may need to shorten the dirty region */ - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - - if (t <= offset || f >= end) - return; /* Doesn't overlap */ - - if (f < offset && t > end) - return; /* Splits the dirty region - just absorb it */ - - if (f >= offset && t <= end) - goto undirty; + struct afs_vnode *vnode = AFS_FS_I(inode); + loff_t i_size; - if (f < offset) - t = offset; - else - f = end; - if (f == t) - goto undirty; - - priv = afs_folio_dirty(folio, f, t); - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("trunc"), folio); - return; - -undirty: - trace_afs_folio_dirty(vnode, tracepoint_string("undirty"), folio); - folio_clear_dirty_for_io(folio); -full_invalidate: - trace_afs_folio_dirty(vnode, tracepoint_string("inval"), folio); - folio_detach_private(folio); + write_seqlock(&vnode->cb_lock); + i_size = i_size_read(&vnode->netfs.inode); + if (new_i_size > i_size) { + i_size_write(&vnode->netfs.inode, new_i_size); + 
inode_set_bytes(&vnode->netfs.inode, new_i_size); + } + write_sequnlock(&vnode->cb_lock); + fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size); } -/* - * invalidate part or all of a page - * - release a page and clean up its private data if offset is 0 (indicating - * the entire page) - */ -static void afs_invalidate_folio(struct folio *folio, size_t offset, - size_t length) +static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq) { - _enter("{%lu},%zu,%zu", folio->index, offset, length); - - BUG_ON(!folio_test_locked(folio)); + struct afs_vnode *vnode = AFS_FS_I(wreq->inode); - if (folio_get_private(folio)) - afs_invalidate_dirty(folio, offset, length); - - folio_wait_fscache(folio); - _leave(""); + afs_invalidate_cache(vnode, 0); } -/* - * release a page and clean up its private state if it's not busy - * - return true if the page can now be released, false if not - */ -static bool afs_release_folio(struct folio *folio, gfp_t gfp) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - - _enter("{{%llx:%llu}[%lu],%lx},%x", - vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags, - gfp); - - /* deny if folio is being written to the cache and the caller hasn't - * elected to wait */ -#ifdef CONFIG_AFS_FSCACHE - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - fscache_note_page_release(afs_vnode_cache(vnode)); -#endif - - if (folio_test_private(folio)) { - trace_afs_folio_dirty(vnode, tracepoint_string("rel"), folio); - folio_detach_private(folio); - } - - /* Indicate that the folio can be released */ - _leave(" = T"); - return true; -} +const struct netfs_request_ops afs_req_ops = { + .init_request = afs_init_request, + .free_request = afs_free_request, + .check_write_begin = afs_check_write_begin, + .issue_read = afs_issue_read, + .update_i_size = afs_update_i_size, + .invalidate_cache = afs_netfs_invalidate_cache, + 
.create_write_requests = afs_create_write_requests, +}; static void afs_add_open_mmap(struct afs_vnode *vnode) { @@ -525,13 +417,17 @@ static void afs_add_open_mmap(struct afs_vnode *vnode) static void afs_drop_open_mmap(struct afs_vnode *vnode) { - if (!atomic_dec_and_test(&vnode->cb_nr_mmap)) + if (atomic_add_unless(&vnode->cb_nr_mmap, -1, 1)) return; down_write(&vnode->volume->open_mmaps_lock); - if (atomic_read(&vnode->cb_nr_mmap) == 0) + read_seqlock_excl(&vnode->cb_lock); + // the only place where ->cb_nr_mmap may hit 0 + // see __afs_break_callback() for the other side... + if (atomic_dec_and_test(&vnode->cb_nr_mmap)) list_del_init(&vnode->cb_mmap_link); + read_sequnlock_excl(&vnode->cb_lock); up_write(&vnode->volume->open_mmaps_lock); flush_work(&vnode->cb_work); @@ -576,28 +472,39 @@ static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pg static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); + struct inode *inode = file_inode(iocb->ki_filp); + struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = iocb->ki_filp->private_data; - int ret; + ssize_t ret; - ret = afs_validate(vnode, af->key); + if (iocb->ki_flags & IOCB_DIRECT) + return netfs_unbuffered_read_iter(iocb, iter); + + ret = netfs_start_io_read(inode); if (ret < 0) return ret; - - return generic_file_read_iter(iocb, iter); + ret = afs_validate(vnode, af->key); + if (ret == 0) + ret = filemap_read(iocb, iter, 0); + netfs_end_io_read(inode); + return ret; } static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(in)); + struct inode *inode = file_inode(in); + struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = in->private_data; - int ret; + ssize_t ret; - ret = afs_validate(vnode, af->key); + ret = netfs_start_io_read(inode); if (ret < 0) return 
ret; - - return filemap_splice_read(in, ppos, pipe, len, flags); + ret = afs_validate(vnode, af->key); + if (ret == 0) + ret = filemap_splice_read(in, ppos, pipe, len, flags); + netfs_end_io_read(inode); + return ret; } diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 9c6dea3139f5..f0e96a35093f 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -93,13 +93,13 @@ static void afs_grant_locks(struct afs_vnode *vnode) bool exclusive = (vnode->lock_type == AFS_LOCK_WRITE); list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) { - if (!exclusive && p->fl_type == F_WRLCK) + if (!exclusive && lock_is_write(p)) continue; list_move_tail(&p->fl_u.afs.link, &vnode->granted_locks); p->fl_u.afs.state = AFS_LOCK_GRANTED; trace_afs_flock_op(vnode, p, afs_flock_op_grant); - wake_up(&p->fl_wait); + locks_wake_up(p); } } @@ -112,25 +112,24 @@ static void afs_next_locker(struct afs_vnode *vnode, int error) { struct file_lock *p, *_p, *next = NULL; struct key *key = vnode->lock_key; - unsigned int fl_type = F_RDLCK; + unsigned int type = F_RDLCK; _enter(""); if (vnode->lock_type == AFS_LOCK_WRITE) - fl_type = F_WRLCK; + type = F_WRLCK; list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) { if (error && - p->fl_type == fl_type && - afs_file_key(p->fl_file) == key) { + p->c.flc_type == type && + afs_file_key(p->c.flc_file) == key) { list_del_init(&p->fl_u.afs.link); p->fl_u.afs.state = error; - wake_up(&p->fl_wait); + locks_wake_up(p); } /* Select the next locker to hand off to. 
*/ - if (next && - (next->fl_type == F_WRLCK || p->fl_type == F_RDLCK)) + if (next && (lock_is_write(next) || lock_is_read(p))) continue; next = p; } @@ -142,7 +141,7 @@ static void afs_next_locker(struct afs_vnode *vnode, int error) afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING); next->fl_u.afs.state = AFS_LOCK_YOUR_TRY; trace_afs_flock_op(vnode, next, afs_flock_op_wake); - wake_up(&next->fl_wait); + locks_wake_up(next); } else { afs_set_lock_state(vnode, AFS_VNODE_LOCK_NONE); trace_afs_flock_ev(vnode, NULL, afs_flock_no_lockers, 0); @@ -166,7 +165,7 @@ static void afs_kill_lockers_enoent(struct afs_vnode *vnode) struct file_lock, fl_u.afs.link); list_del_init(&p->fl_u.afs.link); p->fl_u.afs.state = -ENOENT; - wake_up(&p->fl_wait); + locks_wake_up(p); } key_put(vnode->lock_key); @@ -464,14 +463,14 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) _enter("{%llx:%llu},%llu-%llu,%u,%u", vnode->fid.vid, vnode->fid.vnode, - fl->fl_start, fl->fl_end, fl->fl_type, mode); + fl->fl_start, fl->fl_end, fl->c.flc_type, mode); fl->fl_ops = &afs_lock_ops; INIT_LIST_HEAD(&fl->fl_u.afs.link); fl->fl_u.afs.state = AFS_LOCK_PENDING; partial = (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX); - type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + type = lock_is_read(fl) ? 
AFS_LOCK_READ : AFS_LOCK_WRITE; if (mode == afs_flock_mode_write && partial) type = AFS_LOCK_WRITE; @@ -524,7 +523,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) } if (vnode->lock_state == AFS_VNODE_LOCK_NONE && - !(fl->fl_flags & FL_SLEEP)) { + !(fl->c.flc_flags & FL_SLEEP)) { ret = -EAGAIN; if (type == AFS_LOCK_READ) { if (vnode->status.lock_count == -1) @@ -621,7 +620,7 @@ skip_server_lock: return 0; lock_is_contended: - if (!(fl->fl_flags & FL_SLEEP)) { + if (!(fl->c.flc_flags & FL_SLEEP)) { list_del_init(&fl->fl_u.afs.link); afs_next_locker(vnode, 0); ret = -EAGAIN; @@ -641,7 +640,7 @@ need_to_wait: spin_unlock(&vnode->lock); trace_afs_flock_ev(vnode, fl, afs_flock_waiting, 0); - ret = wait_event_interruptible(fl->fl_wait, + ret = wait_event_interruptible(fl->c.flc_wait, fl->fl_u.afs.state != AFS_LOCK_PENDING); trace_afs_flock_ev(vnode, fl, afs_flock_waited, ret); @@ -704,7 +703,8 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl) struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); int ret; - _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, + fl->c.flc_type); trace_afs_flock_op(vnode, fl, afs_flock_op_unlock); @@ -730,11 +730,11 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) if (vnode->lock_state == AFS_VNODE_LOCK_DELETED) return -ENOENT; - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; /* check local lock records first */ posix_test_lock(file, fl); - if (fl->fl_type == F_UNLCK) { + if (lock_is_unlock(fl)) { /* no local locks; consult the server */ ret = afs_fetch_status(vnode, key, false, NULL); if (ret < 0) @@ -743,18 +743,18 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) lock_count = READ_ONCE(vnode->status.lock_count); if (lock_count != 0) { if (lock_count > 0) - fl->fl_type = F_RDLCK; + fl->c.flc_type = F_RDLCK; else - fl->fl_type = F_WRLCK; + fl->c.flc_type = F_WRLCK; fl->fl_start = 0; 
fl->fl_end = OFFSET_MAX; - fl->fl_pid = 0; + fl->c.flc_pid = 0; } } ret = 0; error: - _leave(" = %d [%hd]", ret, fl->fl_type); + _leave(" = %d [%hd]", ret, fl->c.flc_type); return ret; } @@ -769,7 +769,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) _enter("{%llx:%llu},%d,{t=%x,fl=%x,r=%Ld:%Ld}", vnode->fid.vid, vnode->fid.vnode, cmd, - fl->fl_type, fl->fl_flags, + fl->c.flc_type, fl->c.flc_flags, (long long) fl->fl_start, (long long) fl->fl_end); if (IS_GETLK(cmd)) @@ -778,7 +778,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id); trace_afs_flock_op(vnode, fl, afs_flock_op_lock); - if (fl->fl_type == F_UNLCK) + if (lock_is_unlock(fl)) ret = afs_do_unlk(file, fl); else ret = afs_do_setlk(file, fl); @@ -804,7 +804,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) _enter("{%llx:%llu},%d,{t=%x,fl=%x}", vnode->fid.vid, vnode->fid.vnode, cmd, - fl->fl_type, fl->fl_flags); + fl->c.flc_type, fl->c.flc_flags); /* * No BSD flocks over NFS allowed. @@ -813,14 +813,14 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) * Not sure whether that would be unique, though, or whether * that would break in other places. 
*/ - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id); trace_afs_flock_op(vnode, fl, afs_flock_op_flock); /* we're simulating flock() locks using posix locks on the server */ - if (fl->fl_type == F_UNLCK) + if (lock_is_unlock(fl)) ret = afs_do_unlk(file, fl); else ret = afs_do_setlk(file, fl); @@ -843,7 +843,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) */ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file)); _enter(""); @@ -861,7 +861,7 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) */ static void afs_fl_release_private(struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file)); _enter(""); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 4f04f6f33f46..94fc049aff58 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren */ static void afs_set_netfs_context(struct afs_vnode *vnode) { - netfs_inode_init(&vnode->netfs, &afs_req_ops); + netfs_inode_init(&vnode->netfs, &afs_req_ops, true); } /* @@ -166,6 +166,7 @@ static void afs_apply_status(struct afs_operation *op, struct inode *inode = &vnode->netfs.inode; struct timespec64 t; umode_t mode; + bool unexpected_jump = false; bool data_changed = false; bool change_size = vp->set_size; @@ -230,6 +231,7 @@ static void afs_apply_status(struct afs_operation *op, } change_size = true; data_changed = true; + unexpected_jump = true; } else if (vnode->status.type == AFS_FTYPE_DIR) { /* Expected directory change is handled elsewhere so * that we can locally edit the directory and save on a @@ -249,8 +251,10 @@ static void 
afs_apply_status(struct afs_operation *op, * what's on the server. */ vnode->netfs.remote_i_size = status->size; - if (change_size) { + if (change_size || status->size > i_size_read(inode)) { afs_set_i_size(vnode, status->size); + if (unexpected_jump) + vnode->netfs.zero_point = status->size; inode_set_ctime_to_ts(inode, t); inode_set_atime_to_ts(inode, t); } @@ -647,7 +651,7 @@ void afs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); afs_set_cache_aux(vnode, &aux); - fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux); + netfs_clear_inode_writeback(inode, &aux); clear_inode(inode); while (!list_empty(&vnode->wb_keys)) { @@ -689,17 +693,17 @@ static void afs_setattr_success(struct afs_operation *op) static void afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; - struct inode *inode = &vp->vnode->netfs.inode; + struct afs_vnode *vnode = vp->vnode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; loff_t i_size = op->setattr.old_i_size; - if (size < i_size) - truncate_pagecache(inode, size); - if (size != i_size) - fscache_resize_cookie(afs_vnode_cache(vp->vnode), - vp->scb.status.size); + if (size != i_size) { + truncate_setsize(&vnode->netfs.inode, size); + netfs_resize_file(&vnode->netfs, size, true); + fscache_resize_cookie(afs_vnode_cache(vnode), size); + } } } @@ -767,11 +771,11 @@ int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) && attr->ia_size < i_size && - attr->ia_size > vnode->status.size) { - truncate_pagecache(inode, attr->ia_size); + attr->ia_size > vnode->netfs.remote_i_size) { + truncate_setsize(inode, attr->ia_size); + netfs_resize_file(&vnode->netfs, size, false); fscache_resize_cookie(afs_vnode_cache(vnode), attr->ia_size); - i_size_write(inode, attr->ia_size); ret = 0; goto out_unlock; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 
55aa0679d8ce..6ce5a612937c 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -321,8 +321,7 @@ struct afs_net { struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ - struct hlist_head fs_addresses4; /* afs_server (by lowest IPv4 addr) */ - struct hlist_head fs_addresses6; /* afs_server (by lowest IPv6 addr) */ + struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */ seqlock_t fs_addr_lock; /* For fs_addresses[46] */ struct work_struct fs_manager; @@ -561,8 +560,7 @@ struct afs_server { struct afs_server __rcu *uuid_next; /* Next server with same UUID */ struct afs_server *uuid_prev; /* Previous server with same UUID */ struct list_head probe_link; /* Link in net->fs_probe_list */ - struct hlist_node addr4_link; /* Link in net->fs_addresses4 */ - struct hlist_node addr6_link; /* Link in net->fs_addresses6 */ + struct hlist_node addr_link; /* Link in net->fs_addresses6 */ struct hlist_node proc_link; /* Link in net->fs_proc */ struct list_head volumes; /* RCU list of afs_server_entry objects */ struct afs_server *gc_next; /* Next server in manager's list */ @@ -985,62 +983,6 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl i_size_read(&vnode->netfs.inode), flags); } -/* - * We use folio->private to hold the amount of the folio that we've written to, - * splitting the field into two parts. However, we need to represent a range - * 0...FOLIO_SIZE, so we reduce the resolution if the size of the folio - * exceeds what we can encode. 
- */ -#ifdef CONFIG_64BIT -#define __AFS_FOLIO_PRIV_MASK 0x7fffffffUL -#define __AFS_FOLIO_PRIV_SHIFT 32 -#define __AFS_FOLIO_PRIV_MMAPPED 0x80000000UL -#else -#define __AFS_FOLIO_PRIV_MASK 0x7fffUL -#define __AFS_FOLIO_PRIV_SHIFT 16 -#define __AFS_FOLIO_PRIV_MMAPPED 0x8000UL -#endif - -static inline unsigned int afs_folio_dirty_resolution(struct folio *folio) -{ - int shift = folio_shift(folio) - (__AFS_FOLIO_PRIV_SHIFT - 1); - return (shift > 0) ? shift : 0; -} - -static inline size_t afs_folio_dirty_from(struct folio *folio, unsigned long priv) -{ - unsigned long x = priv & __AFS_FOLIO_PRIV_MASK; - - /* The lower bound is inclusive */ - return x << afs_folio_dirty_resolution(folio); -} - -static inline size_t afs_folio_dirty_to(struct folio *folio, unsigned long priv) -{ - unsigned long x = (priv >> __AFS_FOLIO_PRIV_SHIFT) & __AFS_FOLIO_PRIV_MASK; - - /* The upper bound is immediately beyond the region */ - return (x + 1) << afs_folio_dirty_resolution(folio); -} - -static inline unsigned long afs_folio_dirty(struct folio *folio, size_t from, size_t to) -{ - unsigned int res = afs_folio_dirty_resolution(folio); - from >>= res; - to = (to - 1) >> res; - return (to << __AFS_FOLIO_PRIV_SHIFT) | from; -} - -static inline unsigned long afs_folio_dirty_mmapped(unsigned long priv) -{ - return priv | __AFS_FOLIO_PRIV_MMAPPED; -} - -static inline bool afs_is_folio_dirty_mmapped(unsigned long priv) -{ - return priv & __AFS_FOLIO_PRIV_MMAPPED; -} - #include <trace/events/afs.h> /*****************************************************************************/ @@ -1167,7 +1109,6 @@ extern int afs_release(struct inode *, struct file *); extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); extern struct afs_read *afs_alloc_read(gfp_t); extern void afs_put_read(struct afs_read *); -extern int afs_write_inode(struct inode *, struct writeback_control *); static inline struct afs_read *afs_get_read(struct afs_read *req) { @@ -1658,24 +1599,11 @@ extern int 
afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ -#ifdef CONFIG_AFS_FSCACHE -bool afs_dirty_folio(struct address_space *, struct folio *); -#else -#define afs_dirty_folio filemap_dirty_folio -#endif -extern int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata); -extern int afs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); -extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); -extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); extern void afs_prune_wb_keys(struct afs_vnode *); -int afs_launder_folio(struct folio *); +void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len); /* * xattr.c diff --git a/fs/afs/main.c b/fs/afs/main.c index 1b3bd21c168a..a14f6013e316 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -90,8 +90,7 @@ static int __net_init afs_net_init(struct net *net_ns) INIT_LIST_HEAD(&net->fs_probe_slow); INIT_HLIST_HEAD(&net->fs_proc); - INIT_HLIST_HEAD(&net->fs_addresses4); - INIT_HLIST_HEAD(&net->fs_addresses6); + INIT_HLIST_HEAD(&net->fs_addresses); seqlock_init(&net->fs_addr_lock); INIT_WORK(&net->fs_manager, afs_manage_servers); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 3bd02571f30d..15eab053af6d 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -166,7 +166,7 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v) if (!preflist) { seq_puts(m, "NO PREFS\n"); - return 0; + goto out; } seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n", @@ -191,7 +191,8 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v) } } - rcu_read_lock(); +out: + rcu_read_unlock(); 
return 0; } diff --git a/fs/afs/server.c b/fs/afs/server.c index e169121f603e..038f9d0ae3af 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -38,7 +38,7 @@ struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer seq++; /* 2 on the 1st/lockless path, otherwise odd */ read_seqbegin_or_lock(&net->fs_addr_lock, &seq); - hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { + hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) { estate = rcu_dereference(server->endpoint_state); alist = estate->addresses; for (i = 0; i < alist->nr_addrs; i++) @@ -177,10 +177,8 @@ added_dup: * bit, but anything we might want to do gets messy and memory * intensive. */ - if (alist->nr_ipv4 > 0) - hlist_add_head_rcu(&server->addr4_link, &net->fs_addresses4); - if (alist->nr_addrs > alist->nr_ipv4) - hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6); + if (alist->nr_addrs > 0) + hlist_add_head_rcu(&server->addr_link, &net->fs_addresses); write_sequnlock(&net->fs_addr_lock); @@ -511,10 +509,8 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) list_del(&server->probe_link); hlist_del_rcu(&server->proc_link); - if (!hlist_unhashed(&server->addr4_link)) - hlist_del_rcu(&server->addr4_link); - if (!hlist_unhashed(&server->addr6_link)) - hlist_del_rcu(&server->addr6_link); + if (!hlist_unhashed(&server->addr_link)) + hlist_del_rcu(&server->addr_link); } write_sequnlock(&net->fs_lock); diff --git a/fs/afs/super.c b/fs/afs/super.c index ae2d66a52add..f3ba1c3e72f5 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -55,7 +55,7 @@ int afs_net_id; static const struct super_operations afs_super_ops = { .statfs = afs_statfs, .alloc_inode = afs_alloc_inode, - .write_inode = afs_write_inode, + .write_inode = netfs_unpin_writeback, .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, .free_inode = afs_free_inode, diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 020ecd45e476..af3a3f57c1b3 100644 --- 
a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -353,7 +353,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) { struct afs_server_list *new, *old, *discard; struct afs_vldb_entry *vldb; - char idbuf[16]; + char idbuf[24]; int ret, idsz; _enter(""); @@ -361,7 +361,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) /* We look up an ID by passing it as a decimal string in the * operation's name parameter. */ - idsz = sprintf(idbuf, "%llu", volume->vid); + idsz = snprintf(idbuf, sizeof(idbuf), "%llu", volume->vid); vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz); if (IS_ERR(vldb)) { diff --git a/fs/afs/write.c b/fs/afs/write.c index 61d34ad2ca7d..74402d95a884 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -12,309 +12,17 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/netfs.h> +#include <trace/events/netfs.h> #include "internal.h" -static int afs_writepages_region(struct address_space *mapping, - struct writeback_control *wbc, - loff_t start, loff_t end, loff_t *_next, - bool max_one_loop); - -static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len, - loff_t i_size, bool caching); - -#ifdef CONFIG_AFS_FSCACHE -/* - * Mark a page as having been made dirty and thus needing writeback. We also - * need to pin the cache object to write back to. - */ -bool afs_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - return fscache_dirty_folio(mapping, folio, - afs_vnode_cache(AFS_FS_I(mapping->host))); -} -static void afs_folio_start_fscache(bool caching, struct folio *folio) -{ - if (caching) - folio_start_fscache(folio); -} -#else -static void afs_folio_start_fscache(bool caching, struct folio *folio) -{ -} -#endif - -/* - * Flush out a conflicting write. This may extend the write to the surrounding - * pages if also dirty and contiguous to the conflicting region.. 
- */ -static int afs_flush_conflicting_write(struct address_space *mapping, - struct folio *folio) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = folio_pos(folio), - .range_end = LLONG_MAX, - }; - loff_t next; - - return afs_writepages_region(mapping, &wbc, folio_pos(folio), LLONG_MAX, - &next, true); -} - -/* - * prepare to perform part of a write to a page - */ -int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **_page, void **fsdata) -{ - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - struct folio *folio; - unsigned long priv; - unsigned f, from; - unsigned t, to; - pgoff_t index; - int ret; - - _enter("{%llx:%llu},%llx,%x", - vnode->fid.vid, vnode->fid.vnode, pos, len); - - /* Prefetch area to be written into the cache if we're caching this - * file. We need to do this before we get a lock on the page in case - * there's more than one writer competing for the same cache block. - */ - ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata); - if (ret < 0) - return ret; - - index = folio_index(folio); - from = pos - index * PAGE_SIZE; - to = from + len; - -try_again: - /* See if this page is already partially written in a way that we can - * merge the new write with. - */ - if (folio_test_private(folio)) { - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - ASSERTCMP(f, <=, t); - - if (folio_test_writeback(folio)) { - trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio); - folio_unlock(folio); - goto wait_for_writeback; - } - /* If the file is being filled locally, allow inter-write - * spaces to be merged into writes. If it's not, only write - * back what the user gives us. 
- */ - if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) && - (to < f || from > t)) - goto flush_conflicting_write; - } - - *_page = folio_file_page(folio, pos / PAGE_SIZE); - _leave(" = 0"); - return 0; - - /* The previous write and this write aren't adjacent or overlapping, so - * flush the page out. - */ -flush_conflicting_write: - trace_afs_folio_dirty(vnode, tracepoint_string("confl"), folio); - folio_unlock(folio); - - ret = afs_flush_conflicting_write(mapping, folio); - if (ret < 0) - goto error; - -wait_for_writeback: - ret = folio_wait_writeback_killable(folio); - if (ret < 0) - goto error; - - ret = folio_lock_killable(folio); - if (ret < 0) - goto error; - goto try_again; - -error: - folio_put(folio); - _leave(" = %d", ret); - return ret; -} - -/* - * finalise part of a write to a page - */ -int afs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *subpage, void *fsdata) -{ - struct folio *folio = page_folio(subpage); - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - unsigned long priv; - unsigned int f, from = offset_in_folio(folio, pos); - unsigned int t, to = from + copied; - loff_t i_size, write_end_pos; - - _enter("{%llx:%llu},{%lx}", - vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); - - if (!folio_test_uptodate(folio)) { - if (copied < len) { - copied = 0; - goto out; - } - - folio_mark_uptodate(folio); - } - - if (copied == 0) - goto out; - - write_end_pos = pos + copied; - - i_size = i_size_read(&vnode->netfs.inode); - if (write_end_pos > i_size) { - write_seqlock(&vnode->cb_lock); - i_size = i_size_read(&vnode->netfs.inode); - if (write_end_pos > i_size) - afs_set_i_size(vnode, write_end_pos); - write_sequnlock(&vnode->cb_lock); - fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos); - } - - if (folio_test_private(folio)) { - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, 
priv); - if (from < f) - f = from; - if (to > t) - t = to; - priv = afs_folio_dirty(folio, f, t); - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("dirty+"), folio); - } else { - priv = afs_folio_dirty(folio, from, to); - folio_attach_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("dirty"), folio); - } - - if (folio_mark_dirty(folio)) - _debug("dirtied %lx", folio_index(folio)); - -out: - folio_unlock(folio); - folio_put(folio); - return copied; -} - -/* - * kill all the pages in the given range - */ -static void afs_kill_pages(struct address_space *mapping, - loff_t start, loff_t len) -{ - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct folio *folio; - pgoff_t index = start / PAGE_SIZE; - pgoff_t last = (start + len - 1) / PAGE_SIZE, next; - - _enter("{%llx:%llu},%llx @%llx", - vnode->fid.vid, vnode->fid.vnode, len, start); - - do { - _debug("kill %lx (to %lx)", index, last); - - folio = filemap_get_folio(mapping, index); - if (IS_ERR(folio)) { - next = index + 1; - continue; - } - - next = folio_next_index(folio); - - folio_clear_uptodate(folio); - folio_end_writeback(folio); - folio_lock(folio); - generic_error_remove_folio(mapping, folio); - folio_unlock(folio); - folio_put(folio); - - } while (index = next, index <= last); - - _leave(""); -} - -/* - * Redirty all the pages in a given range. 
- */ -static void afs_redirty_pages(struct writeback_control *wbc, - struct address_space *mapping, - loff_t start, loff_t len) -{ - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct folio *folio; - pgoff_t index = start / PAGE_SIZE; - pgoff_t last = (start + len - 1) / PAGE_SIZE, next; - - _enter("{%llx:%llu},%llx @%llx", - vnode->fid.vid, vnode->fid.vnode, len, start); - - do { - _debug("redirty %llx @%llx", len, start); - - folio = filemap_get_folio(mapping, index); - if (IS_ERR(folio)) { - next = index + 1; - continue; - } - - next = index + folio_nr_pages(folio); - folio_redirty_for_writepage(wbc, folio); - folio_end_writeback(folio); - folio_put(folio); - } while (index = next, index <= last); - - _leave(""); -} - /* * completion of write to server */ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { - struct address_space *mapping = vnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t end; - - XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); - _enter("{%llx:%llu},{%x @%llx}", vnode->fid.vid, vnode->fid.vnode, len, start); - rcu_read_lock(); - - end = (start + len - 1) / PAGE_SIZE; - xas_for_each(&xas, folio, end) { - if (!folio_test_writeback(folio)) { - kdebug("bad %x @%llx page %lx %lx", - len, start, folio_index(folio), end); - ASSERT(folio_test_writeback(folio)); - } - - trace_afs_folio_dirty(vnode, tracepoint_string("clear"), folio); - folio_detach_private(folio); - folio_end_writeback(folio); - } - - rcu_read_unlock(); - afs_prune_wb_keys(vnode); _leave(""); } @@ -451,363 +159,53 @@ try_next_key: return afs_put_operation(op); } -/* - * Extend the region to be written back to include subsequent contiguously - * dirty pages if possible, but don't sleep while doing so. - * - * If this page holds new content, then we can include filler zeros in the - * writeback. 
- */ -static void afs_extend_writeback(struct address_space *mapping, - struct afs_vnode *vnode, - long *_count, - loff_t start, - loff_t max_len, - bool new_content, - bool caching, - unsigned int *_len) +static void afs_upload_to_server(struct netfs_io_subrequest *subreq) { - struct folio_batch fbatch; - struct folio *folio; - unsigned long priv; - unsigned int psize, filler = 0; - unsigned int f, t; - loff_t len = *_len; - pgoff_t index = (start + len) / PAGE_SIZE; - bool stop = true; - unsigned int i; - - XA_STATE(xas, &mapping->i_pages, index); - folio_batch_init(&fbatch); - - do { - /* Firstly, we gather up a batch of contiguous dirty pages - * under the RCU read lock - but we can't clear the dirty flags - * there if any of those pages are mapped. - */ - rcu_read_lock(); - - xas_for_each(&xas, folio, ULONG_MAX) { - stop = true; - if (xas_retry(&xas, folio)) - continue; - if (xa_is_value(folio)) - break; - if (folio_index(folio) != index) - break; - - if (!folio_try_get_rcu(folio)) { - xas_reset(&xas); - continue; - } - - /* Has the page moved or been split? 
*/ - if (unlikely(folio != xas_reload(&xas))) { - folio_put(folio); - break; - } - - if (!folio_trylock(folio)) { - folio_put(folio); - break; - } - if (!folio_test_dirty(folio) || - folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - folio_put(folio); - break; - } - - psize = folio_size(folio); - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - if (f != 0 && !new_content) { - folio_unlock(folio); - folio_put(folio); - break; - } - - len += filler + t; - filler = psize - t; - if (len >= max_len || *_count <= 0) - stop = true; - else if (t == psize || new_content) - stop = false; - - index += folio_nr_pages(folio); - if (!folio_batch_add(&fbatch, folio)) - break; - if (stop) - break; - } - - if (!stop) - xas_pause(&xas); - rcu_read_unlock(); - - /* Now, if we obtained any folios, we can shift them to being - * writable and mark them for caching. - */ - if (!folio_batch_count(&fbatch)) - break; - - for (i = 0; i < folio_batch_count(&fbatch); i++) { - folio = fbatch.folios[i]; - trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio); - - if (!folio_clear_dirty_for_io(folio)) - BUG(); - folio_start_writeback(folio); - afs_folio_start_fscache(caching, folio); - - *_count -= folio_nr_pages(folio); - folio_unlock(folio); - } + struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); + ssize_t ret; - folio_batch_release(&fbatch); - cond_resched(); - } while (!stop); + _enter("%x[%x],%zx", + subreq->rreq->debug_id, subreq->debug_index, subreq->io_iter.count); - *_len = len; + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + ret = afs_store_data(vnode, &subreq->io_iter, subreq->start, + subreq->rreq->origin == NETFS_LAUNDER_WRITE); + netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, + false); } -/* - * Synchronously write back the locked page and any subsequent non-locked dirty - * pages. 
- */ -static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, - struct writeback_control *wbc, - struct folio *folio, - loff_t start, loff_t end) +static void afs_upload_to_server_worker(struct work_struct *work) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct iov_iter iter; - unsigned long priv; - unsigned int offset, to, len, max_len; - loff_t i_size = i_size_read(&vnode->netfs.inode); - bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode)); - long count = wbc->nr_to_write; - int ret; - - _enter(",%lx,%llx-%llx", folio_index(folio), start, end); - - folio_start_writeback(folio); - afs_folio_start_fscache(caching, folio); - - count -= folio_nr_pages(folio); - - /* Find all consecutive lockable dirty pages that have contiguous - * written regions, stopping when we find a page that is not - * immediately lockable, is not dirty or is missing, or we reach the - * end of the range. - */ - priv = (unsigned long)folio_get_private(folio); - offset = afs_folio_dirty_from(folio, priv); - to = afs_folio_dirty_to(folio, priv); - trace_afs_folio_dirty(vnode, tracepoint_string("store"), folio); - - len = to - offset; - start += offset; - if (start < i_size) { - /* Trim the write to the EOF; the extra data is ignored. Also - * put an upper limit on the size of a single storedata op. - */ - max_len = 65536 * 4096; - max_len = min_t(unsigned long long, max_len, end - start + 1); - max_len = min_t(unsigned long long, max_len, i_size - start); - - if (len < max_len && - (to == folio_size(folio) || new_content)) - afs_extend_writeback(mapping, vnode, &count, - start, max_len, new_content, - caching, &len); - len = min_t(loff_t, len, max_len); - } - - /* We now have a contiguous set of dirty pages, each with writeback - * set; the first page is still locked at this point, but all the rest - * have been unlocked. 
- */ - folio_unlock(folio); - - if (start < i_size) { - _debug("write back %x @%llx [%llx]", len, start, i_size); - - /* Speculatively write to the cache. We have to fix this up - * later if the store fails. - */ - afs_write_to_cache(vnode, start, len, i_size, caching); - - iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); - ret = afs_store_data(vnode, &iter, start, false); - } else { - _debug("write discard %x @%llx [%llx]", len, start, i_size); - - /* The dirty region was entirely beyond the EOF. */ - fscache_clear_page_bits(mapping, start, len, caching); - afs_pages_written_back(vnode, start, len); - ret = 0; - } - - switch (ret) { - case 0: - wbc->nr_to_write = count; - ret = len; - break; + struct netfs_io_subrequest *subreq = + container_of(work, struct netfs_io_subrequest, work); - default: - pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret); - fallthrough; - case -EACCES: - case -EPERM: - case -ENOKEY: - case -EKEYEXPIRED: - case -EKEYREJECTED: - case -EKEYREVOKED: - case -ENETRESET: - afs_redirty_pages(wbc, mapping, start, len); - mapping_set_error(mapping, ret); - break; - - case -EDQUOT: - case -ENOSPC: - afs_redirty_pages(wbc, mapping, start, len); - mapping_set_error(mapping, -ENOSPC); - break; - - case -EROFS: - case -EIO: - case -EREMOTEIO: - case -EFBIG: - case -ENOENT: - case -ENOMEDIUM: - case -ENXIO: - trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail); - afs_kill_pages(mapping, start, len); - mapping_set_error(mapping, ret); - break; - } - - _leave(" = %d", ret); - return ret; + afs_upload_to_server(subreq); } /* - * write a region of pages back to the server + * Set up write requests for a writeback slice. We need to add a write request + * for each write we want to make. 
*/ -static int afs_writepages_region(struct address_space *mapping, - struct writeback_control *wbc, - loff_t start, loff_t end, loff_t *_next, - bool max_one_loop) +void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len) { - struct folio *folio; - struct folio_batch fbatch; - ssize_t ret; - unsigned int i; - int n, skips = 0; - - _enter("%llx,%llx,", start, end); - folio_batch_init(&fbatch); - - do { - pgoff_t index = start / PAGE_SIZE; - - n = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE, - PAGECACHE_TAG_DIRTY, &fbatch); - - if (!n) - break; - for (i = 0; i < n; i++) { - folio = fbatch.folios[i]; - start = folio_pos(folio); /* May regress with THPs */ - - _debug("wback %lx", folio_index(folio)); - - /* At this point we hold neither the i_pages lock nor the - * page lock: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled - * back from swapper_space to tmpfs file mapping - */ -try_again: - if (wbc->sync_mode != WB_SYNC_NONE) { - ret = folio_lock_killable(folio); - if (ret < 0) { - folio_batch_release(&fbatch); - return ret; - } - } else { - if (!folio_trylock(folio)) - continue; - } - - if (folio->mapping != mapping || - !folio_test_dirty(folio)) { - start += folio_size(folio); - folio_unlock(folio); - continue; - } - - if (folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - if (wbc->sync_mode != WB_SYNC_NONE) { - folio_wait_writeback(folio); -#ifdef CONFIG_AFS_FSCACHE - folio_wait_fscache(folio); -#endif - goto try_again; - } - - start += folio_size(folio); - if (wbc->sync_mode == WB_SYNC_NONE) { - if (skips >= 5 || need_resched()) { - *_next = start; - folio_batch_release(&fbatch); - _leave(" = 0 [%llx]", *_next); - return 0; - } - skips++; - } - continue; - } - - if (!folio_clear_dirty_for_io(folio)) - BUG(); - ret = afs_write_back_from_locked_folio(mapping, wbc, - folio, start, end); - if (ret < 0) { - _leave(" = %zd", ret); - 
folio_batch_release(&fbatch); - return ret; - } - - start += ret; - } + struct netfs_io_subrequest *subreq; - folio_batch_release(&fbatch); - cond_resched(); - } while (wbc->nr_to_write > 0); + _enter("%x,%llx-%llx", wreq->debug_id, start, start + len); - *_next = start; - _leave(" = 0 [%llx]", *_next); - return 0; + subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER, + start, len, afs_upload_to_server_worker); + if (subreq) + netfs_queue_write_request(subreq); } /* * write some of the pending data back to the server */ -int afs_writepages(struct address_space *mapping, - struct writeback_control *wbc) +int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); - loff_t start, next; int ret; - _enter(""); - /* We have to be careful as we can end up racing with setattr() * truncating the pagecache since the caller doesn't take a lock here * to prevent it. @@ -817,69 +215,12 @@ int afs_writepages(struct address_space *mapping, else if (!down_read_trylock(&vnode->validate_lock)) return 0; - if (wbc->range_cyclic) { - start = mapping->writeback_index * PAGE_SIZE; - ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, - &next, false); - if (ret == 0) { - mapping->writeback_index = next / PAGE_SIZE; - if (start > 0 && wbc->nr_to_write > 0) { - ret = afs_writepages_region(mapping, wbc, 0, - start, &next, false); - if (ret == 0) - mapping->writeback_index = - next / PAGE_SIZE; - } - } - } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { - ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, - &next, false); - if (wbc->nr_to_write > 0 && ret == 0) - mapping->writeback_index = next / PAGE_SIZE; - } else { - ret = afs_writepages_region(mapping, wbc, - wbc->range_start, wbc->range_end, - &next, false); - } - + ret = netfs_writepages(mapping, wbc); up_read(&vnode->validate_lock); - _leave(" = %d", ret); return ret; } /* - * write to an AFS file - */ -ssize_t 
afs_file_write(struct kiocb *iocb, struct iov_iter *from) -{ - struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); - struct afs_file *af = iocb->ki_filp->private_data; - ssize_t result; - size_t count = iov_iter_count(from); - - _enter("{%llx:%llu},{%zu},", - vnode->fid.vid, vnode->fid.vnode, count); - - if (IS_SWAPFILE(&vnode->netfs.inode)) { - printk(KERN_INFO - "AFS: Attempt to write to active swap file!\n"); - return -EBUSY; - } - - if (!count) - return 0; - - result = afs_validate(vnode, af->key); - if (result < 0) - return result; - - result = generic_file_write_iter(iocb, from); - - _leave(" = %zd", result); - return result; -} - -/* * flush any dirty pages for this process, and check for write errors. * - the return status from this call provides a reliable indication of * whether any write errors occurred for this process. @@ -907,59 +248,11 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { - struct folio *folio = page_folio(vmf->page); struct file *file = vmf->vma->vm_file; - struct inode *inode = file_inode(file); - struct afs_vnode *vnode = AFS_FS_I(inode); - struct afs_file *af = file->private_data; - unsigned long priv; - vm_fault_t ret = VM_FAULT_RETRY; - - _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); - - afs_validate(vnode, af->key); - sb_start_pagefault(inode->i_sb); - - /* Wait for the page to be written to the cache before we allow it to - * be modified. We then assume the entire page will need writing back. 
- */ -#ifdef CONFIG_AFS_FSCACHE - if (folio_test_fscache(folio) && - folio_wait_fscache_killable(folio) < 0) - goto out; -#endif - - if (folio_wait_writeback_killable(folio)) - goto out; - - if (folio_lock_killable(folio) < 0) - goto out; - - /* We mustn't change folio->private until writeback is complete as that - * details the portion of the page we need to write back and we might - * need to redirty the page if there's a problem. - */ - if (folio_wait_writeback_killable(folio) < 0) { - folio_unlock(folio); - goto out; - } - - priv = afs_folio_dirty(folio, 0, folio_size(folio)); - priv = afs_folio_dirty_mmapped(priv); - if (folio_test_private(folio)) { - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite+"), folio); - } else { - folio_attach_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite"), folio); - } - file_update_time(file); - - ret = VM_FAULT_LOCKED; -out: - sb_end_pagefault(inode->i_sb); - return ret; + if (afs_validate(AFS_FS_I(file_inode(file)), afs_file_key(file)) < 0) + return VM_FAULT_SIGBUS; + return netfs_page_mkwrite(vmf, NULL); } /* @@ -989,64 +282,3 @@ void afs_prune_wb_keys(struct afs_vnode *vnode) afs_put_wb_key(wbk); } } - -/* - * Clean up a page during invalidation. 
- */ -int afs_launder_folio(struct folio *folio) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - struct iov_iter iter; - struct bio_vec bv; - unsigned long priv; - unsigned int f, t; - int ret = 0; - - _enter("{%lx}", folio->index); - - priv = (unsigned long)folio_get_private(folio); - if (folio_clear_dirty_for_io(folio)) { - f = 0; - t = folio_size(folio); - if (folio_test_private(folio)) { - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - } - - bvec_set_folio(&bv, folio, t - f, f); - iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, bv.bv_len); - - trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio); - ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true); - } - - trace_afs_folio_dirty(vnode, tracepoint_string("laundered"), folio); - folio_detach_private(folio); - folio_wait_fscache(folio); - return ret; -} - -/* - * Deal with the completion of writing the data to the cache. - */ -static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct afs_vnode *vnode = priv; - - if (IS_ERR_VALUE(transferred_or_error) && - transferred_or_error != -ENOBUFS) - afs_invalidate_cache(vnode, 0); -} - -/* - * Save the write to the cache also. - */ -static void afs_write_to_cache(struct afs_vnode *vnode, - loff_t start, size_t len, loff_t i_size, - bool caching) -{ - fscache_write_to_cache(afs_vnode_cache(vnode), - vnode->netfs.inode.i_mapping, start, len, i_size, - afs_write_to_cache_done, vnode, caching); -} @@ -589,13 +589,24 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) { - struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw); - struct kioctx *ctx = req->ki_ctx; + struct aio_kiocb *req; + struct kioctx *ctx; unsigned long flags; + /* + * kiocb didn't come from aio or is neither a read nor a write, hence + * ignore it. 
+ */ + if (!(iocb->ki_flags & IOCB_AIO_RW)) + return; + + req = container_of(iocb, struct aio_kiocb, rw); + if (WARN_ON_ONCE(!list_empty(&req->ki_list))) return; + ctx = req->ki_ctx; + spin_lock_irqsave(&ctx->ctx_lock, flags); list_add_tail(&req->ki_list, &ctx->active_reqs); req->ki_cancel = cancel; @@ -1509,7 +1520,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; - req->ki_flags = req->ki_filp->f_iocb_flags; + req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW; if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index d26222b7eefe..0496cb5b6eab 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -79,7 +79,7 @@ static struct file *__anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, - bool secure) + bool make_inode) { struct inode *inode; struct file *file; @@ -87,7 +87,7 @@ static struct file *__anon_inode_getfile(const char *name, if (fops->owner && !try_module_get(fops->owner)) return ERR_PTR(-ENOENT); - if (secure) { + if (make_inode) { inode = anon_inode_make_secure_inode(name, context_inode); if (IS_ERR(inode)) { file = ERR_CAST(inode); @@ -149,13 +149,10 @@ struct file *anon_inode_getfile(const char *name, EXPORT_SYMBOL_GPL(anon_inode_getfile); /** - * anon_inode_getfile_secure - Like anon_inode_getfile(), but creates a new + * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new * !S_PRIVATE anon inode rather than reuse the * singleton anon inode and calls the - * inode_init_security_anon() LSM hook. This - * allows for both the inode to have its own - * security context and for the LSM to enforce - * policy on the inode's creation. + * inode_init_security_anon() LSM hook. 
* * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file @@ -164,11 +161,21 @@ EXPORT_SYMBOL_GPL(anon_inode_getfile); * @context_inode: * [in] the logical relationship with the new inode (optional) * + * Create a new anonymous inode and file pair. This can be done for two + * reasons: + * + * - for the inode to have its own security context, so that LSMs can enforce + * policy on the inode's creation; + * + * - if the caller needs a unique inode, for example in order to customize + * the size returned by fstat() + * * The LSM may use @context_inode in inode_init_security_anon(), but a - * reference to it is not held. Returns the newly created file* or an error - * pointer. See the anon_inode_getfile() documentation for more information. + * reference to it is not held. + * + * Returns the newly created file* or an error pointer. */ -struct file *anon_inode_getfile_secure(const char *name, +struct file *anon_inode_create_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) @@ -176,12 +183,13 @@ struct file *anon_inode_getfile_secure(const char *name, return __anon_inode_getfile(name, fops, priv, flags, context_inode, true); } +EXPORT_SYMBOL_GPL(anon_inode_create_getfile); static int __anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, - bool secure) + bool make_inode) { int error, fd; struct file *file; @@ -192,7 +200,7 @@ static int __anon_inode_getfd(const char *name, fd = error; file = __anon_inode_getfile(name, fops, priv, flags, context_inode, - secure); + make_inode); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_put_unused_fd; @@ -231,10 +239,9 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, EXPORT_SYMBOL_GPL(anon_inode_getfd); /** - * anon_inode_getfd_secure - Like anon_inode_getfd(), but creates a new + * anon_inode_create_getfd - Like 
anon_inode_getfd(), but creates a new * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls - * the inode_init_security_anon() LSM hook. This allows the inode to have its - * own security context and for a LSM to reject creation of the inode. + * the inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file @@ -243,16 +250,26 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd); * @context_inode: * [in] the logical relationship with the new inode (optional) * + * Create a new anonymous inode and file pair. This can be done for two + * reasons: + * + * - for the inode to have its own security context, so that LSMs can enforce + * policy on the inode's creation; + * + * - if the caller needs a unique inode, for example in order to customize + * the size returned by fstat() + * * The LSM may use @context_inode in inode_init_security_anon(), but a * reference to it is not held. + * + * Returns a newly created file descriptor or an error code. 
*/ -int anon_inode_getfd_secure(const char *name, const struct file_operations *fops, +int anon_inode_create_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) { return __anon_inode_getfd(name, fops, priv, flags, context_inode, true); } -EXPORT_SYMBOL_GPL(anon_inode_getfd_secure); static int __init anon_inode_init(void) { diff --git a/fs/attr.c b/fs/attr.c index 5a13f0c8495f..49d23b5dbab4 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -352,7 +352,7 @@ int may_setattr(struct mnt_idmap *idmap, struct inode *inode, EXPORT_SYMBOL(may_setattr); /** - * notify_change - modify attributes of a filesytem object + * notify_change - modify attributes of a filesystem object * @idmap: idmap of the mount the inode was found from * @dentry: object affected * @attr: new attributes diff --git a/fs/backing-file.c b/fs/backing-file.c index a681f38d84d8..740185198db3 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -325,9 +325,7 @@ EXPORT_SYMBOL_GPL(backing_file_mmap); static int __init backing_aio_init(void) { - backing_aio_cachep = kmem_cache_create("backing_aio", - sizeof(struct backing_aio), - 0, SLAB_HWCACHE_ALIGN, NULL); + backing_aio_cachep = KMEM_CACHE(backing_aio, SLAB_HWCACHE_ALIGN); if (!backing_aio_cachep) return -ENOMEM; diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 7423a3557c68..1a05cecda7cc 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -27,7 +27,6 @@ bcachefs-y := \ checksum.o \ clock.o \ compress.o \ - counters.o \ darray.o \ debug.o \ dirent.o \ @@ -71,6 +70,7 @@ bcachefs-y := \ reflink.o \ replicas.o \ sb-clean.o \ + sb-counters.o \ sb-downgrade.o \ sb-errors.o \ sb-members.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a09b9d00226a..fd3e175d8342 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -273,7 +273,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, 
bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v), c, err, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", - bch2_data_types[a.v->data_type]); + bch2_data_type_str(a.v->data_type)); break; case BCH_DATA_cached: bkey_fsck_err_on(!a.v->cached_sectors || @@ -321,16 +321,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c { struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - unsigned i; prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "gen %u oldest_gen %u data_type %s", - a->gen, a->oldest_gen, - a->data_type < BCH_DATA_NR - ? bch2_data_types[a->data_type] - : "(invalid data type)"); + prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); + bch2_prt_data_type(out, a->data_type); prt_newline(out); prt_printf(out, "journal_seq %llu", a->journal_seq); prt_newline(out); @@ -353,23 +349,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "fragmentation %llu", a->fragmentation_lru); prt_newline(out); prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); - prt_newline(out); - - if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) { - struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k); - const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v); - - prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v)); - printbuf_indent_add(out, 2); - - for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) { - prt_newline(out); - bch2_backpointer_to_text(out, &bps[i]); - } - - printbuf_indent_sub(out, 2); - } - printbuf_indent_sub(out, 2); } @@ -839,7 +818,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, } } - if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; u64 journal_seq = trans->journal_res.seq; u64 
bucket_journal_seq = new_a->journal_seq; @@ -1625,13 +1604,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) return ret; } +struct discard_buckets_state { + u64 seen; + u64 open; + u64 need_journal_commit; + u64 discarded; + struct bch_dev *ca; + u64 need_journal_commit_this_dev; +}; + +static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) +{ + if (s->ca == ca) + return; + + if (s->ca && s->need_journal_commit_this_dev > + bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) + bch2_journal_flush_async(&c->journal, NULL); + + if (s->ca) + percpu_ref_put(&s->ca->ref); + if (ca) + percpu_ref_get(&ca->ref); + s->ca = ca; + s->need_journal_commit_this_dev = 0; +} + static int bch2_discard_one_bucket(struct btree_trans *trans, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, - u64 *seen, - u64 *open, - u64 *need_journal_commit, - u64 *discarded) + struct discard_buckets_state *s) { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; @@ -1643,20 +1645,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, int ret = 0; ca = bch_dev_bkey_exists(c, pos.inode); + if (!percpu_ref_tryget(&ca->io_ref)) { bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); return 0; } + discard_buckets_next_dev(c, s, ca); + if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { - (*open)++; + s->open++; goto out; } if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, pos.inode, pos.offset)) { - (*need_journal_commit)++; + s->need_journal_commit++; + s->need_journal_commit_this_dev++; goto out; } @@ -1709,7 +1715,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, * This works without any other locks because this is the only * thread that removes items from the need_discard tree */ - bch2_trans_unlock(trans); + bch2_trans_unlock_long(trans); blkdev_issue_discard(ca->disk_sb.bdev, k.k->p.offset * 
ca->mi.bucket_size, ca->mi.bucket_size, @@ -1732,9 +1738,9 @@ write: goto out; count_event(c, bucket_discard); - (*discarded)++; + s->discarded++; out: - (*seen)++; + s->seen++; bch2_trans_iter_exit(trans, &iter); percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); @@ -1744,7 +1750,7 @@ out: static void bch2_do_discards_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, discard_work); - u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; + struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; @@ -1756,19 +1762,14 @@ static void bch2_do_discards_work(struct work_struct *work) ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(trans, &iter, &discard_pos_done, - &seen, - &open, - &need_journal_commit, - &discarded))); - - if (need_journal_commit * 2 > seen) - bch2_journal_flush_async(&c->journal, NULL); + bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + discard_buckets_next_dev(c, &s, NULL); - trace_discard_buckets(c, seen, open, need_journal_commit, discarded, + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h new file mode 100644 index 000000000000..b4ec20be93b8 --- /dev/null +++ b/fs/bcachefs/alloc_background_format.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H +#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H + +struct bch_alloc { + struct bch_val v; + __u8 fields; + __u8 gen; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V1() \ + x(read_time, 16) \ + x(write_time, 16) \ + x(data_type, 8) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(oldest_gen, 
8) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +enum { +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() +#undef x +}; + +struct bch_alloc_v2 { + struct bch_val v; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ + x(dirty_sectors, 32) \ + x(cached_sectors, 32) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +struct bch_alloc_v3 { + struct bch_val v; + __le64 journal_seq; + __le32 flags; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + +struct bch_alloc_v4 { + struct bch_val v; + __u64 journal_seq; + __u32 flags; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 stripe_redundancy; + __u32 dirty_sectors; + __u32 cached_sectors; + __u64 io_time[2]; + __u32 stripe; + __u32 nr_external_backpointers; + __u64 fragmentation_lru; +} __packed __aligned(8); + +#define BCH_ALLOC_V4_U64s_V0 6 +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) + +BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) +BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) +BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) +BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) + +#define KEY_TYPE_BUCKET_GENS_BITS 8 +#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) +#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) + +struct bch_bucket_gens { + struct bch_val v; + u8 gens[KEY_TYPE_BUCKET_GENS_NR]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index b0ff47998a94..633d3223b353 100644 --- 
a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1525,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str unsigned data_type = ob->data_type; barrier(); /* READ_ONCE() doesn't work on bitfields */ - prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u", + prt_printf(out, "%zu ref %u ", ob - c->open_buckets, - atomic_read(&ob->pin), - data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type", + atomic_read(&ob->pin)); + bch2_prt_data_type(out, data_type); + prt_printf(out, " %u:%llu gen %u allocated %u/%u", ob->dev, ob->bucket, ob->gen, ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); if (ob->ec) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index e358a2ffffde..569b97904da4 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -68,9 +68,11 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - prt_str(out, "bucket="); - bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); - prt_str(out, " "); + if (bch2_dev_exists2(c, k.k->p.inode)) { + prt_str(out, "bucket="); + bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); + prt_str(out, " "); + } bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); } @@ -400,13 +402,24 @@ int bch2_check_btree_backpointers(struct bch_fs *c) return ret; } +static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) +{ + return bpos_eq(l.k->p, r.k->p) && + bkey_bytes(l.k) == bkey_bytes(r.k) && + !memcmp(l.v, r.v, bkey_val_bytes(l.k)); +} + +struct extents_to_bp_state { + struct bpos bucket_start; + struct bpos bucket_end; + struct bkey_buf last_flushed; +}; + static int check_bp_exists(struct btree_trans *trans, + struct extents_to_bp_state *s, struct bpos bucket, struct bch_backpointer bp, - struct bkey_s_c orig_k, - struct bpos bucket_start, - struct 
bpos bucket_end, - struct bkey_buf *last_flushed) + struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; struct btree_iter bp_iter = { NULL }; @@ -417,8 +430,8 @@ static int check_bp_exists(struct btree_trans *trans, bch2_bkey_buf_init(&tmp); - if (bpos_lt(bucket, bucket_start) || - bpos_gt(bucket, bucket_end)) + if (bpos_lt(bucket, s->bucket_start) || + bpos_gt(bucket, s->bucket_end)) return 0; if (!bch2_dev_bucket_exists(c, bucket)) @@ -433,11 +446,9 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - if (!bpos_eq(orig_k.k->p, last_flushed->k->k.p) || - bkey_bytes(orig_k.k) != bkey_bytes(&last_flushed->k->k) || - memcmp(orig_k.v, &last_flushed->k->v, bkey_val_bytes(orig_k.k))) { - bch2_bkey_buf_reassemble(&tmp, c, orig_k); + bch2_bkey_buf_reassemble(&tmp, c, orig_k); + if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) { if (bp.level) { bch2_trans_unlock(trans); bch2_btree_interior_updates_flush(c); @@ -447,7 +458,7 @@ static int check_bp_exists(struct btree_trans *trans, if (ret) goto err; - bch2_bkey_buf_copy(last_flushed, c, tmp.k); + bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k); ret = -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } @@ -475,10 +486,8 @@ missing: } static int check_extent_to_backpointers(struct btree_trans *trans, + struct extents_to_bp_state *s, enum btree_id btree, unsigned level, - struct bpos bucket_start, - struct bpos bucket_end, - struct bkey_buf *last_flushed, struct bkey_s_c k) { struct bch_fs *c = trans->c; @@ -498,9 +507,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, bch2_extent_ptr_to_bp(c, btree, level, k, p, &bucket_pos, &bp); - ret = check_bp_exists(trans, bucket_pos, bp, k, - bucket_start, bucket_end, - last_flushed); + ret = check_bp_exists(trans, s, bucket_pos, bp, k); if (ret) return ret; } @@ -509,10 +516,8 @@ static int check_extent_to_backpointers(struct 
btree_trans *trans, } static int check_btree_root_to_backpointers(struct btree_trans *trans, + struct extents_to_bp_state *s, enum btree_id btree_id, - struct bpos bucket_start, - struct bpos bucket_end, - struct bkey_buf *last_flushed, int *level) { struct bch_fs *c = trans->c; @@ -536,9 +541,7 @@ retry: *level = b->c.level; k = bkey_i_to_s_c(&b->key); - ret = check_extent_to_backpointers(trans, btree_id, b->c.level + 1, - bucket_start, bucket_end, - last_flushed, k); + ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -559,7 +562,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c) si_meminfo(&i); mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes >> 1, btree_bytes(c)); + return div_u64(mem_bytes >> 1, c->opts.btree_node_size); } static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, @@ -610,43 +613,35 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, } static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - struct bpos bucket_start, - struct bpos bucket_end) + struct extents_to_bp_state *s) { struct bch_fs *c = trans->c; - struct btree_iter iter; - enum btree_id btree_id; - struct bkey_s_c k; - struct bkey_buf last_flushed; int ret = 0; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { + for (enum btree_id btree_id = 0; + btree_id < btree_id_nr_alive(c); + btree_id++) { int level, depth = btree_type_has_ptrs(btree_id) ? 
0 : 1; ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_btree_root_to_backpointers(trans, btree_id, - bucket_start, bucket_end, - &last_flushed, &level)); + check_btree_root_to_backpointers(trans, s, btree_id, &level)); if (ret) return ret; while (level >= depth) { + struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, BTREE_ITER_PREFETCH); while (1) { bch2_trans_begin(trans); - k = bch2_btree_iter_peek(&iter); + + struct bkey_s_c k = bch2_btree_iter_peek(&iter); if (!k.k) break; ret = bkey_err(k) ?: - check_extent_to_backpointers(trans, btree_id, level, - bucket_start, bucket_end, - &last_flushed, k) ?: + check_extent_to_backpointers(trans, s, btree_id, level, k) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { @@ -668,7 +663,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, } } - bch2_bkey_buf_exit(&last_flushed, c); return 0; } @@ -731,37 +725,43 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct bpos start = POS_MIN, end; + struct extents_to_bp_state s = { .bucket_start = POS_MIN }; int ret; + bch2_bkey_buf_init(&s.last_flushed); + bkey_init(&s.last_flushed.k->k); + while (1) { - ret = bch2_get_alloc_in_memory_pos(trans, start, &end); + ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); if (ret) break; - if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX)) + if ( bpos_eq(s.bucket_start, POS_MIN) && + !bpos_eq(s.bucket_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", __func__, btree_nodes_fit_in_ram(c)); - if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) { + if (!bpos_eq(s.bucket_start, POS_MIN) || + !bpos_eq(s.bucket_end, SPOS_MAX)) { struct printbuf buf 
= PRINTBUF; prt_str(&buf, "check_extents_to_backpointers(): "); - bch2_bpos_to_text(&buf, start); + bch2_bpos_to_text(&buf, s.bucket_start); prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, end); + bch2_bpos_to_text(&buf, s.bucket_end); bch_verbose(c, "%s", buf.buf); printbuf_exit(&buf); } - ret = bch2_check_extents_to_backpointers_pass(trans, start, end); - if (ret || bpos_eq(end, SPOS_MAX)) + ret = bch2_check_extents_to_backpointers_pass(trans, &s); + if (ret || bpos_eq(s.bucket_end, SPOS_MAX)) break; - start = bpos_successor(end); + s.bucket_start = bpos_successor(s.bucket_end); } bch2_trans_put(trans); + bch2_bkey_buf_exit(&s.last_flushed, c); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 737e2396ade7..327365a9feac 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H #define _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#include "btree_cache.h" #include "btree_iter.h" #include "btree_update.h" #include "buckets.h" diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index dac383e37181..69d0d60d50e3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1204,11 +1204,6 @@ static inline unsigned block_sectors(const struct bch_fs *c) return c->opts.block_size >> 9; } -static inline size_t btree_sectors(const struct bch_fs *c) -{ - return c->opts.btree_node_size >> 9; -} - static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) { return c->btree_key_cache_btrees & (1U << btree); @@ -1254,6 +1249,18 @@ static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) return stdio; } +static inline unsigned metadata_replicas_required(struct bch_fs *c) +{ + return min(c->opts.metadata_replicas, + c->opts.metadata_replicas_required); +} + +static inline unsigned data_replicas_required(struct bch_fs *c) +{ + return min(c->opts.data_replicas, + c->opts.data_replicas_required); +} + 
#define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0d5ac4184fbc..0668b682a21c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -417,600 +417,12 @@ struct bch_set { struct bch_val v; }; -/* Extents */ - -/* - * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally - * preceded by checksum/compression information (bch_extent_crc32 or - * bch_extent_crc64). - * - * One major determining factor in the format of extents is how we handle and - * represent extents that have been partially overwritten and thus trimmed: - * - * If an extent is not checksummed or compressed, when the extent is trimmed we - * don't have to remember the extent we originally allocated and wrote: we can - * merely adjust ptr->offset to point to the start of the data that is currently - * live. The size field in struct bkey records the current (live) size of the - * extent, and is also used to mean "size of region on disk that we point to" in - * this case. - * - * Thus an extent that is not checksummed or compressed will consist only of a - * list of bch_extent_ptrs, with none of the fields in - * bch_extent_crc32/bch_extent_crc64. - * - * When an extent is checksummed or compressed, it's not possible to read only - * the data that is currently live: we have to read the entire extent that was - * originally written, and then return only the part of the extent that is - * currently live. - * - * Thus, in addition to the current size of the extent in struct bkey, we need - * to store the size of the originally allocated space - this is the - * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, - * when the extent is trimmed, instead of modifying the offset field of the - * pointer, we keep a second smaller offset field - "offset into the original - * extent of the currently live region". 
- * - * The other major determining factor is replication and data migration: - * - * Each pointer may have its own bch_extent_crc32/64. When doing a replicated - * write, we will initially write all the replicas in the same format, with the - * same checksum type and compression format - however, when copygc runs later (or - * tiering/cache promotion, anything that moves data), it is not in general - * going to rewrite all the pointers at once - one of the replicas may be in a - * bucket on one device that has very little fragmentation while another lives - * in a bucket that has become heavily fragmented, and thus is being rewritten - * sooner than the rest. - * - * Thus it will only move a subset of the pointers (or in the case of - * tiering/cache promotion perhaps add a single pointer without dropping any - * current pointers), and if the extent has been partially overwritten it must - * write only the currently live portion (or copygc would not be able to reduce - * fragmentation!) - which necessitates a different bch_extent_crc format for - * the new pointer. - * - * But in the interests of space efficiency, we don't want to store one - * bch_extent_crc for each pointer if we don't have to. - * - * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and - * bch_extent_ptrs appended arbitrarily one after the other. We determine the - * type of a given entry with a scheme similar to utf8 (except we're encoding a - * type, not a size), encoding the type in the position of the first set bit: - * - * bch_extent_crc32 - 0b1 - * bch_extent_ptr - 0b10 - * bch_extent_crc64 - 0b100 - * - * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and - * bch_extent_crc64 is the least constrained). - * - * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, - * until the next bch_extent_crc32/64. 
- * - * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer - * is neither checksummed nor compressed. - */ - /* 128 bits, sufficient for cryptographic MACs: */ struct bch_csum { __le64 lo; __le64 hi; } __packed __aligned(8); -#define BCH_EXTENT_ENTRY_TYPES() \ - x(ptr, 0) \ - x(crc32, 1) \ - x(crc64, 2) \ - x(crc128, 3) \ - x(stripe_ptr, 4) \ - x(rebalance, 5) -#define BCH_EXTENT_ENTRY_MAX 6 - -enum bch_extent_entry_type { -#define x(f, n) BCH_EXTENT_ENTRY_##f = n, - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -/* Compressed/uncompressed size are stored biased by 1: */ -struct bch_extent_crc32 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u32 type:2, - _compressed_size:7, - _uncompressed_size:7, - offset:7, - _unused:1, - csum_type:4, - compression_type:4; - __u32 csum; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u32 csum; - __u32 compression_type:4, - csum_type:4, - _unused:1, - offset:7, - _uncompressed_size:7, - _compressed_size:7, - type:2; -#endif -} __packed __aligned(8); - -#define CRC32_SIZE_MAX (1U << 7) -#define CRC32_NONCE_MAX 0 - -struct bch_extent_crc64 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:3, - _compressed_size:9, - _uncompressed_size:9, - offset:9, - nonce:10, - csum_type:4, - compression_type:4, - csum_hi:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 csum_hi:16, - compression_type:4, - csum_type:4, - nonce:10, - offset:9, - _uncompressed_size:9, - _compressed_size:9, - type:3; -#endif - __u64 csum_lo; -} __packed __aligned(8); - -#define CRC64_SIZE_MAX (1U << 9) -#define CRC64_NONCE_MAX ((1U << 10) - 1) - -struct bch_extent_crc128 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:4, - _compressed_size:13, - _uncompressed_size:13, - offset:13, - nonce:13, - csum_type:4, - compression_type:4; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 compression_type:4, - csum_type:4, - nonce:13, - offset:13, - _uncompressed_size:13, - _compressed_size:13, - type:4; -#endif - struct bch_csum csum; -} __packed 
__aligned(8); - -#define CRC128_SIZE_MAX (1U << 13) -#define CRC128_NONCE_MAX ((1U << 13) - 1) - -/* - * @reservation - pointer hasn't been written to, just reserved - */ -struct bch_extent_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:1, - cached:1, - unused:1, - unwritten:1, - offset:44, /* 8 petabytes */ - dev:8, - gen:8; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 gen:8, - dev:8, - offset:44, - unwritten:1, - unused:1, - cached:1, - type:1; -#endif -} __packed __aligned(8); - -struct bch_extent_stripe_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:5, - block:8, - redundancy:4, - idx:47; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 idx:47, - redundancy:4, - block:8, - type:5; -#endif -}; - -struct bch_extent_rebalance { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:34, - compression:8, /* enum bch_compression_opt */ - target:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 target:16, - compression:8, - unused:34, - type:6; -#endif -}; - -union bch_extent_entry { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 - unsigned long type; -#elif __BITS_PER_LONG == 32 - struct { - unsigned long pad; - unsigned long type; - }; -#else -#error edit for your odd byteorder. 
-#endif - -#define x(f, n) struct bch_extent_##f f; - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -struct bch_btree_ptr { - struct bch_val v; - - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -struct bch_btree_ptr_v2 { - struct bch_val v; - - __u64 mem_ptr; - __le64 seq; - __le16 sectors_written; - __le16 flags; - struct bpos min_key; - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); - -struct bch_extent { - struct bch_val v; - - __u64 _data[0]; - union bch_extent_entry start[]; -} __packed __aligned(8); - -struct bch_reservation { - struct bch_val v; - - __le32 generation; - __u8 nr_replicas; - __u8 pad[3]; -} __packed __aligned(8); - -/* Maximum size (in u64s) a single pointer could be: */ -#define BKEY_EXTENT_PTR_U64s_MAX\ - ((sizeof(struct bch_extent_crc128) + \ - sizeof(struct bch_extent_ptr)) / sizeof(__u64)) - -/* Maximum possible size of an entire extent value: */ -#define BKEY_EXTENT_VAL_U64s_MAX \ - (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) - -/* * Maximum possible size of an entire extent, key + value: */ -#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) - -/* Btree pointers don't carry around checksums: */ -#define BKEY_BTREE_PTR_VAL_U64s_MAX \ - ((sizeof(struct bch_btree_ptr_v2) + \ - sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) -#define BKEY_BTREE_PTR_U64s_MAX \ - (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) - -/* Inodes */ - -#define BLOCKDEV_INODE_MAX 4096 - -#define BCACHEFS_ROOT_INO 4096 - -struct bch_inode { - struct bch_val v; - - __le64 bi_hash_seed; - __le32 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct bch_inode_v2 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct bch_inode_v3 { - struct bch_val v; - - __le64 
bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le64 bi_sectors; - __le64 bi_size; - __le64 bi_version; - __u8 fields[]; -} __packed __aligned(8); - -#define INODEv3_FIELDS_START_INITIAL 6 -#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) - -struct bch_inode_generation { - struct bch_val v; - - __le32 bi_generation; - __le32 pad; -} __packed __aligned(8); - -/* - * bi_subvol and bi_parent_subvol are only set for subvolume roots: - */ - -#define BCH_INODE_FIELDS_v2() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_size, 64) \ - x(bi_sectors, 64) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) - -#define BCH_INODE_FIELDS_v3() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) \ - x(bi_nocow, 8) - -/* subset of BCH_INODE_FIELDS */ -#define BCH_INODE_OPTS() \ - x(data_checksum, 8) \ - x(compression, 8) \ - x(project, 32) \ - x(background_compression, 8) \ - x(data_replicas, 8) \ - x(promote_target, 16) \ - x(foreground_target, 16) \ - x(background_target, 16) \ - 
x(erasure_code, 16) \ - x(nocow, 8) - -enum inode_opt_id { -#define x(name, ...) \ - Inode_opt_##name, - BCH_INODE_OPTS() -#undef x - Inode_opt_nr, -}; - -#define BCH_INODE_FLAGS() \ - x(sync, 0) \ - x(immutable, 1) \ - x(append, 2) \ - x(nodump, 3) \ - x(noatime, 4) \ - x(i_size_dirty, 5) \ - x(i_sectors_dirty, 6) \ - x(unlinked, 7) \ - x(backptr_untrusted, 8) - -/* bits 20+ reserved for packed fields below: */ - -enum bch_inode_flags { -#define x(t, n) BCH_INODE_##t = 1U << n, - BCH_INODE_FLAGS() -#undef x -}; - -enum __bch_inode_flags { -#define x(t, n) __BCH_INODE_##t = n, - BCH_INODE_FLAGS() -#undef x -}; - -LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); -LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); - -LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); -LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); -LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_FIELDS_START, - struct bch_inode_v3, bi_flags, 31, 36); -LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); - -/* Dirents */ - -/* - * Dirents (and xattrs) have to implement string lookups; since our b-tree - * doesn't support arbitrary length strings for the key, we instead index by a - * 64 bit hash (currently truncated sha1) of the string, stored in the offset - * field of the key - using linear probing to resolve hash collisions. This also - * provides us with the readdir cookie posix requires. 
- * - * Linear probing requires us to use whiteouts for deletions, in the event of a - * collision: - */ - -struct bch_dirent { - struct bch_val v; - - /* Target inode number: */ - union { - __le64 d_inum; - struct { /* DT_SUBVOL */ - __le32 d_child_subvol; - __le32 d_parent_subvol; - }; - }; - - /* - * Copy of mode bits 12-15 from the target inode - so userspace can get - * the filetype without having to do a stat() - */ - __u8 d_type; - - __u8 d_name[]; -} __packed __aligned(8); - -#define DT_SUBVOL 16 -#define BCH_DT_MAX 17 - -#define BCH_NAME_MAX 512 - -/* Xattrs */ - -#define KEY_TYPE_XATTR_INDEX_USER 0 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 -#define KEY_TYPE_XATTR_INDEX_SECURITY 4 - -struct bch_xattr { - struct bch_val v; - __u8 x_type; - __u8 x_name_len; - __le16 x_val_len; - __u8 x_name[]; -} __packed __aligned(8); - -/* Bucket/allocation information: */ - -struct bch_alloc { - struct bch_val v; - __u8 fields; - __u8 gen; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V1() \ - x(read_time, 16) \ - x(write_time, 16) \ - x(data_type, 8) \ - x(dirty_sectors, 16) \ - x(cached_sectors, 16) \ - x(oldest_gen, 8) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -enum { -#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, - BCH_ALLOC_FIELDS_V1() -#undef x -}; - -struct bch_alloc_v2 { - struct bch_val v; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V2() \ - x(read_time, 64) \ - x(write_time, 64) \ - x(dirty_sectors, 32) \ - x(cached_sectors, 32) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -struct bch_alloc_v3 { - struct bch_val v; - __le64 journal_seq; - __le32 flags; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) 
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) - -struct bch_alloc_v4 { - struct bch_val v; - __u64 journal_seq; - __u32 flags; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 stripe_redundancy; - __u32 dirty_sectors; - __u32 cached_sectors; - __u64 io_time[2]; - __u32 stripe; - __u32 nr_external_backpointers; - __u64 fragmentation_lru; -} __packed __aligned(8); - -#define BCH_ALLOC_V4_U64s_V0 6 -#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) - -BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) -BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) -BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) -BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) - -#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40 - struct bch_backpointer { struct bch_val v; __u8 btree_id; @@ -1021,154 +433,6 @@ struct bch_backpointer { struct bpos pos; } __packed __aligned(8); -#define KEY_TYPE_BUCKET_GENS_BITS 8 -#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) -#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) - -struct bch_bucket_gens { - struct bch_val v; - u8 gens[KEY_TYPE_BUCKET_GENS_NR]; -} __packed __aligned(8); - -/* Quotas: */ - -enum quota_types { - QTYP_USR = 0, - QTYP_GRP = 1, - QTYP_PRJ = 2, - QTYP_NR = 3, -}; - -enum quota_counters { - Q_SPC = 0, - Q_INO = 1, - Q_COUNTERS = 2, -}; - -struct bch_quota_counter { - __le64 hardlimit; - __le64 softlimit; -}; - -struct bch_quota { - struct bch_val v; - struct bch_quota_counter c[Q_COUNTERS]; -} __packed __aligned(8); - -/* Erasure coding */ - -struct bch_stripe { - struct bch_val v; - __le16 sectors; - __u8 algorithm; - __u8 nr_blocks; - __u8 nr_redundant; - - __u8 csum_granularity_bits; - __u8 csum_type; - __u8 pad; - - struct bch_extent_ptr ptrs[]; -} __packed __aligned(8); - -/* Reflink: */ - -struct bch_reflink_p { - struct bch_val v; - __le64 idx; - /* - * A reflink 
pointer might point to an indirect extent which is then - * later split (by copygc or rebalance). If we only pointed to part of - * the original indirect extent, and then one of the fragments is - * outside the range we point to, we'd leak a refcount: so when creating - * reflink pointers, we need to store pad values to remember the full - * range we were taking a reference on. - */ - __le32 front_pad; - __le32 back_pad; -} __packed __aligned(8); - -struct bch_reflink_v { - struct bch_val v; - __le64 refcount; - union bch_extent_entry start[0]; - __u64 _data[]; -} __packed __aligned(8); - -struct bch_indirect_inline_data { - struct bch_val v; - __le64 refcount; - u8 data[]; -}; - -/* Inline data */ - -struct bch_inline_data { - struct bch_val v; - u8 data[]; -}; - -/* Subvolumes: */ - -#define SUBVOL_POS_MIN POS(0, 1) -#define SUBVOL_POS_MAX POS(0, S32_MAX) -#define BCACHEFS_ROOT_SUBVOL 1 - -struct bch_subvolume { - struct bch_val v; - __le32 flags; - __le32 snapshot; - __le64 inode; - /* - * Snapshot subvolumes form a tree, separate from the snapshot nodes - * tree - if this subvolume is a snapshot, this is the ID of the - * subvolume it was created from: - */ - __le32 parent; - __le32 pad; - bch_le128 otime; -}; - -LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) -/* - * We need to know whether a subvolume is a snapshot so we can know whether we - * can delete it (or whether it should just be rm -rf'd) - */ -LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) -LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) - -/* Snapshots */ - -struct bch_snapshot { - struct bch_val v; - __le32 flags; - __le32 parent; - __le32 children[2]; - __le32 subvol; - /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ - __le32 tree; - __le32 depth; - __le32 skip[3]; -}; - -LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) - -/* True if a subvolume points to this snapshot node: */ 
-LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) - -/* - * Snapshot trees: - * - * The snapshot_trees btree gives us persistent indentifier for each tree of - * bch_snapshot nodes, and allow us to record and easily find the root/master - * subvolume that other snapshots were created from: - */ -struct bch_snapshot_tree { - struct bch_val v; - __le32 master_subvol; - __le32 root_snapshot; -}; - /* LRU btree: */ struct bch_lru { @@ -1178,33 +442,6 @@ struct bch_lru { #define LRU_ID_STRIPES (1U << 16) -/* Logged operations btree: */ - -struct bch_logged_op_truncate { - struct bch_val v; - __le32 subvol; - __le32 pad; - __le64 inum; - __le64 new_i_size; -}; - -enum logged_op_finsert_state { - LOGGED_OP_FINSERT_start, - LOGGED_OP_FINSERT_shift_extents, - LOGGED_OP_FINSERT_finish, -}; - -struct bch_logged_op_finsert { - struct bch_val v; - __u8 state; - __u8 pad[3]; - __le32 subvol; - __le64 inum; - __le64 dst_offset; - __le64 src_offset; - __le64 pos; -}; - /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1230,6 +467,19 @@ struct bch_sb_field { x(ext, 13) \ x(downgrade, 14) +#include "alloc_background_format.h" +#include "extents_format.h" +#include "reflink_format.h" +#include "ec_format.h" +#include "inode_format.h" +#include "dirent_format.h" +#include "xattr_format.h" +#include "quota_format.h" +#include "logged_ops_format.h" +#include "snapshot_format.h" +#include "subvolume_format.h" +#include "sb-counters_format.h" + enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, BCH_SB_FIELDS() @@ -1465,23 +715,6 @@ struct bch_sb_field_replicas { struct bch_replicas_entry_v1 entries[]; } __packed __aligned(8); -/* BCH_SB_FIELD_quota: */ - -struct bch_sb_quota_counter { - __le32 timelimit; - __le32 warnlimit; -}; - -struct bch_sb_quota_type { - __le64 flags; - struct bch_sb_quota_counter c[Q_COUNTERS]; -}; - -struct bch_sb_field_quota { - struct bch_sb_field field; - struct bch_sb_quota_type q[QTYP_NR]; -} __packed 
__aligned(8); - /* BCH_SB_FIELD_disk_groups: */ #define BCH_SB_LABEL_SIZE 32 @@ -1500,101 +733,6 @@ struct bch_sb_field_disk_groups { struct bch_disk_group entries[]; } __packed __aligned(8); -/* BCH_SB_FIELD_counters */ - -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0) \ - x(io_write, 1) \ - x(io_move, 2) \ - x(bucket_invalidate, 3) \ - x(bucket_discard, 4) \ - x(bucket_alloc, 5) \ - x(bucket_alloc_fail, 6) \ - x(btree_cache_scan, 7) \ - x(btree_cache_reap, 8) \ - x(btree_cache_cannibalize, 9) \ - x(btree_cache_cannibalize_lock, 10) \ - x(btree_cache_cannibalize_lock_fail, 11) \ - x(btree_cache_cannibalize_unlock, 12) \ - x(btree_node_write, 13) \ - x(btree_node_read, 14) \ - x(btree_node_compact, 15) \ - x(btree_node_merge, 16) \ - x(btree_node_split, 17) \ - x(btree_node_rewrite, 18) \ - x(btree_node_alloc, 19) \ - x(btree_node_free, 20) \ - x(btree_node_set_root, 21) \ - x(btree_path_relock_fail, 22) \ - x(btree_path_upgrade_fail, 23) \ - x(btree_reserve_get_fail, 24) \ - x(journal_entry_full, 25) \ - x(journal_full, 26) \ - x(journal_reclaim_finish, 27) \ - x(journal_reclaim_start, 28) \ - x(journal_write, 29) \ - x(read_promote, 30) \ - x(read_bounce, 31) \ - x(read_split, 33) \ - x(read_retry, 32) \ - x(read_reuse_race, 34) \ - x(move_extent_read, 35) \ - x(move_extent_write, 36) \ - x(move_extent_finish, 37) \ - x(move_extent_fail, 38) \ - x(move_extent_start_fail, 39) \ - x(copygc, 40) \ - x(copygc_wait, 41) \ - x(gc_gens_end, 42) \ - x(gc_gens_start, 43) \ - x(trans_blocked_journal_reclaim, 44) \ - x(trans_restart_btree_node_reused, 45) \ - x(trans_restart_btree_node_split, 46) \ - x(trans_restart_fault_inject, 47) \ - x(trans_restart_iter_upgrade, 48) \ - x(trans_restart_journal_preres_get, 49) \ - x(trans_restart_journal_reclaim, 50) \ - x(trans_restart_journal_res_get, 51) \ - x(trans_restart_key_cache_key_realloced, 52) \ - x(trans_restart_key_cache_raced, 53) \ - x(trans_restart_mark_replicas, 54) \ - x(trans_restart_mem_realloced, 55) \ - 
x(trans_restart_memory_allocation_failure, 56) \ - x(trans_restart_relock, 57) \ - x(trans_restart_relock_after_fill, 58) \ - x(trans_restart_relock_key_cache_fill, 59) \ - x(trans_restart_relock_next_node, 60) \ - x(trans_restart_relock_parent_for_fill, 61) \ - x(trans_restart_relock_path, 62) \ - x(trans_restart_relock_path_intent, 63) \ - x(trans_restart_too_many_iters, 64) \ - x(trans_restart_traverse, 65) \ - x(trans_restart_upgrade, 66) \ - x(trans_restart_would_deadlock, 67) \ - x(trans_restart_would_deadlock_write, 68) \ - x(trans_restart_injected, 69) \ - x(trans_restart_key_cache_upgrade, 70) \ - x(trans_traverse_all, 71) \ - x(transaction_commit, 72) \ - x(write_super, 73) \ - x(trans_restart_would_deadlock_recursion_limit, 74) \ - x(trans_restart_write_buffer_flush, 75) \ - x(trans_restart_split_race, 76) \ - x(write_buffer_flush_slowpath, 77) \ - x(write_buffer_flush_sync, 78) - -enum bch_persistent_counters { -#define x(t, n, ...) BCH_COUNTER_##t, - BCH_PERSISTENT_COUNTERS() -#undef x - BCH_COUNTER_NR -}; - -struct bch_sb_field_counters { - struct bch_sb_field field; - __le64 d[]; -}; - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index abdb05507d16..76e79a15ba08 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -33,7 +33,7 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out, next_key_bits -= 64; } - bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits)); + bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits)); if (!next_key_bits) break; diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 761f5e33b1e6..5e52684764eb 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -63,8 +63,17 @@ static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, return 0; } +static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct 
bkey_s_c_cookie ck = bkey_s_c_to_cookie(k); + + prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie)); +} + #define bch2_bkey_ops_cookie ((struct bkey_ops) { \ .key_invalid = key_type_cookie_invalid, \ + .val_to_text = key_type_cookie_to_text, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index ee82283722b7..03efe8ee565a 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -83,9 +83,10 @@ enum btree_update_flags { __BTREE_TRIGGER_NORUN, __BTREE_TRIGGER_TRANSACTIONAL, + __BTREE_TRIGGER_ATOMIC, + __BTREE_TRIGGER_GC, __BTREE_TRIGGER_INSERT, __BTREE_TRIGGER_OVERWRITE, - __BTREE_TRIGGER_GC, __BTREE_TRIGGER_BUCKET_INVALIDATE, }; @@ -107,6 +108,10 @@ enum btree_update_flags { * causing us to go emergency read-only) */ #define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL) +#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC) + +/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */ +#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) /* @new is entering the btree */ #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) @@ -114,9 +119,6 @@ enum btree_update_flags { /* @old is leaving the btree */ #define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) -/* We're in gc/fsck: running triggers to recalculate e.g. 
disk usage */ -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) - /* signal from bucket invalidate path to alloc trigger */ #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 74bf8eb90a4c..3fd1085b6c61 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -720,7 +720,7 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); struct bkey_i min_key, max_key; - unsigned j, cacheline = 1; + unsigned cacheline = 1; t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), bset_ro_tree_capacity(b, t)); @@ -823,13 +823,12 @@ void bch2_bset_init_first(struct btree *b, struct bset *i) set_btree_bset(b, t, i); } -void bch2_bset_init_next(struct bch_fs *c, struct btree *b, - struct btree_node_entry *bne) +void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne) { struct bset *i = &bne->keys; struct bset_tree *t; - BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); + BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b)); BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); BUG_ON(b->nsets >= MAX_BSETS); diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 632c2b8c5460..79c77baaa383 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -264,8 +264,7 @@ static inline struct bset *bset_next_set(struct btree *b, void bch2_btree_keys_init(struct btree *); void bch2_bset_init_first(struct btree *, struct bset *); -void bch2_bset_init_next(struct bch_fs *, struct btree *, - struct btree_node_entry *); +void bch2_bset_init_next(struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); void bch2_bset_insert(struct btree *, struct btree_node_iter *, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 8e2488a4b58d..d7c81beac14a 100644 --- a/fs/bcachefs/btree_cache.c +++ 
b/fs/bcachefs/btree_cache.c @@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) clear_btree_node_just_written(b); - kvpfree(b->data, btree_bytes(c)); + kvpfree(b->data, btree_buf_bytes(b)); b->data = NULL; #ifdef __KERNEL__ kvfree(b->aux_data); @@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); - b->data = kvpmalloc(btree_bytes(c), gfp); + b->data = kvpmalloc(btree_buf_bytes(b), gfp); if (!b->data) return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ @@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = NULL; #endif if (!b->aux_data) { - kvpfree(b->data, btree_bytes(c)); + kvpfree(b->data, btree_buf_bytes(b)); b->data = NULL; return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } @@ -126,7 +126,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) bkey_btree_ptr_init(&b->key); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); - b->byte_order = ilog2(btree_bytes(c)); + b->byte_order = ilog2(c->opts.btree_node_size); return b; } @@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (c->verify_data) list_move(&c->verify_data->list, &bc->live); - kvpfree(c->verify_ondisk, btree_bytes(c)); + kvpfree(c->verify_ondisk, c->opts.btree_node_size); for (i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); @@ -1192,7 +1192,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc " failed unpacked %zu\n", b->unpack_fn_len, b->nr.live_u64s * sizeof(u64), - btree_bytes(c) - sizeof(struct btree_node), + btree_buf_bytes(b) - sizeof(struct btree_node), b->nr.live_u64s * 100 / btree_max_u64s(c), b->sib_u64s[0], b->sib_u64s[1], diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 4e1af5882052..6d33885fdbde 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ 
-74,22 +74,27 @@ static inline bool btree_node_hashed(struct btree *b) _iter = 0; _iter < (_tbl)->size; _iter++) \ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) -static inline size_t btree_bytes(struct bch_fs *c) +static inline size_t btree_buf_bytes(const struct btree *b) { - return c->opts.btree_node_size; + return 1UL << b->byte_order; } -static inline size_t btree_max_u64s(struct bch_fs *c) +static inline size_t btree_buf_max_u64s(const struct btree *b) { - return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); + return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64); } -static inline size_t btree_pages(struct bch_fs *c) +static inline size_t btree_max_u64s(const struct bch_fs *c) { - return btree_bytes(c) / PAGE_SIZE; + return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64); } -static inline unsigned btree_blocks(struct bch_fs *c) +static inline size_t btree_sectors(const struct bch_fs *c) +{ + return c->opts.btree_node_size >> SECTOR_SHIFT; +} + +static inline unsigned btree_blocks(const struct bch_fs *c) { return btree_sectors(c) >> c->block_bits; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 49b4ade758c3..1102995643b1 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -597,7 +597,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { @@ -615,7 +615,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + 
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { @@ -637,7 +637,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) @@ -649,7 +649,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) @@ -664,8 +664,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[g->data_type], - bch2_data_types[data_type], + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (data_type == BCH_DATA_btree) { @@ -1238,11 +1238,11 @@ static int bch2_gc_done(struct bch_fs *c, for (i = 0; i < BCH_DATA_NR; i++) { copy_dev_field(dev_usage_buckets_wrong, - d[i].buckets, "%s buckets", bch2_data_types[i]); + d[i].buckets, "%s buckets", bch2_data_type_str(i)); copy_dev_field(dev_usage_sectors_wrong, - d[i].sectors, "%s sectors", bch2_data_types[i]); + d[i].sectors, "%s sectors", bch2_data_type_str(i)); copy_dev_field(dev_usage_fragmented_wrong, - d[i].fragmented, "%s fragmented", bch2_data_types[i]); + d[i].fragmented, "%s fragmented", bch2_data_type_str(i)); } } @@ -1253,19 +1253,19 @@ 
static int bch2_gc_done(struct bch_fs *c, bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); copy_fs_field(fs_usage_hidden_wrong, - hidden, "hidden"); + b.hidden, "hidden"); copy_fs_field(fs_usage_btree_wrong, - btree, "btree"); + b.btree, "btree"); if (!metadata_only) { copy_fs_field(fs_usage_data_wrong, - data, "data"); + b.data, "data"); copy_fs_field(fs_usage_cached_wrong, - cached, "cached"); + b.cached, "cached"); copy_fs_field(fs_usage_reserved_wrong, - reserved, "reserved"); + b.reserved, "reserved"); copy_fs_field(fs_usage_nr_inodes_wrong, - nr_inodes,"nr_inodes"); + b.nr_inodes,"nr_inodes"); for (i = 0; i < BCH_REPLICAS_MAX; i++) copy_fs_field(fs_usage_persistent_reserved_wrong, @@ -1417,8 +1417,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, ": got %s, should be %s", iter->pos.inode, iter->pos.offset, gc.gen, - bch2_data_types[new.data_type], - bch2_data_types[gc.data_type])) + bch2_data_type_str(new.data_type), + bch2_data_type_str(gc.data_type))) new.data_type = gc.data_type; #define copy_bucket_field(_errtype, _f) \ @@ -1428,7 +1428,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ gc.gen, \ - bch2_data_types[gc.data_type], \ + bch2_data_type_str(gc.data_type), \ new._f, gc._f)) \ new._f = gc._f; \ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 33db48e2153f..aa9b6cbe3226 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -112,7 +112,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, unsigned flags = memalloc_nofs_save(); void *p; - BUG_ON(size > btree_bytes(c)); + BUG_ON(size > c->opts.btree_node_size); *used_mempool = false; p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); @@ -174,8 +174,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) ptrs = ptrs_end = ((void *) new_whiteouts + bytes); - for (k = unwritten_whiteouts_start(c, b); - k != unwritten_whiteouts_end(c, b); + for (k = 
unwritten_whiteouts_start(b); + k != unwritten_whiteouts_end(b); k = bkey_p_next(k)) *--ptrs = k; @@ -192,7 +192,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) verify_no_dups(b, new_whiteouts, (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); - memcpy_u64s(unwritten_whiteouts_start(c, b), + memcpy_u64s(unwritten_whiteouts_start(b), new_whiteouts, b->whiteout_u64s); btree_bounce_free(c, bytes, used_mempool, new_whiteouts); @@ -313,7 +313,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, } bytes = sorting_entire_node - ? btree_bytes(c) + ? btree_buf_bytes(b) : __vstruct_bytes(struct btree_node, u64s); out = btree_bounce_alloc(c, bytes, &used_mempool); @@ -338,7 +338,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, if (sorting_entire_node) { u64s = le16_to_cpu(out->keys.u64s); - BUG_ON(bytes != btree_bytes(c)); + BUG_ON(bytes != btree_buf_bytes(b)); /* * Our temporary buffer is the same size as the btree node's @@ -502,7 +502,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(c, b, bne); + bch2_bset_init_next(b, bne); bch2_btree_build_aux_trees(b); @@ -1160,7 +1160,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ptr_written, b->written); } else { for (bne = write_block(b); - bset_byte_offset(b, bne) < btree_bytes(c); + bset_byte_offset(b, bne) < btree_buf_bytes(b); bne = (void *) bne + block_bytes(c)) btree_err_on(bne->keys.seq == b->data->keys.seq && !bch2_journal_seq_is_blacklisted(c, @@ -1172,7 +1172,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "found bset signature after last bset"); } - sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); + sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool); sorted->keys.u64s = 0; set_btree_bset(b, b->set, &b->data->keys); @@ -1188,7 +1188,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev 
*ca, BUG_ON(b->nr.live_u64s != u64s); - btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); + btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); if (updated_range) bch2_btree_node_drop_keys_outside_node(b); @@ -1284,7 +1284,7 @@ static void btree_node_read_work(struct work_struct *work) rb->have_ioref = bch2_dev_get_ioref(ca, READ); bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; - bio->bi_iter.bi_size = btree_bytes(c); + bio->bi_iter.bi_size = btree_buf_bytes(b); if (rb->have_ioref) { bio_set_dev(bio, ca->disk_sb.bdev); @@ -1512,7 +1512,7 @@ fsck_err: } if (best >= 0) { - memcpy(b->data, ra->buf[best], btree_bytes(c)); + memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); } else { ret = -1; @@ -1578,7 +1578,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool for (i = 0; i < ra->nr; i++) { ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); ra->bio[i] = bio_alloc_bioset(NULL, - buf_pages(ra->buf[i], btree_bytes(c)), + buf_pages(ra->buf[i], btree_buf_bytes(b)), REQ_OP_READ|REQ_SYNC|REQ_META, GFP_NOFS, &c->btree_bio); @@ -1598,7 +1598,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool rb->pick = pick; rb->bio.bi_iter.bi_sector = pick.ptr.offset; rb->bio.bi_end_io = btree_node_read_all_replicas_endio; - bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); + bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b)); if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], @@ -1665,7 +1665,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, ca = bch_dev_bkey_exists(c, pick.ptr.dev); bio = bio_alloc_bioset(NULL, - buf_pages(b->data, btree_bytes(c)), + buf_pages(b->data, btree_buf_bytes(b)), REQ_OP_READ|REQ_SYNC|REQ_META, GFP_NOFS, &c->btree_bio); @@ -1679,7 +1679,7 @@ void bch2_btree_node_read(struct btree_trans *trans, 
struct btree *b, INIT_WORK(&rb->work, btree_node_read_work); bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_end_io = btree_node_read_endio; - bch2_bio_map(bio, b->data, btree_bytes(c)); + bch2_bio_map(bio, b->data, btree_buf_bytes(b)); if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], @@ -2074,8 +2074,8 @@ do_write: i->u64s = 0; sort_iter_add(&sort_iter.iter, - unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b)); + unwritten_whiteouts_start(b), + unwritten_whiteouts_end(b)); SET_BSET_SEPARATE_WHITEOUTS(i, false); b->whiteout_u64s = 0; @@ -2251,7 +2251,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(c, b, bne); + bch2_bset_init_next(b, bne); bch2_btree_build_aux_trees(b); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index fa298289e016..3ef338df82f5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1337,7 +1337,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in if (path->should_be_locked && !trans->restarted && - (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_))) + (!dup || !bch2_btree_path_relock_norestart(trans, dup))) return; if (dup) { @@ -2156,7 +2156,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * isn't monotonically increasing before FILTER_SNAPSHOTS, and * that's what we check against in extents mode: */ - if (k.k->p.inode > end.inode) + if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) + ? 
bkey_gt(k.k->p, end) + : k.k->p.inode > end.inode)) goto end; if (iter->update_path && diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index da2b74fa63fc..24772538e4cc 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -819,6 +819,11 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) +/* + * This should not be used in a fastpath, without first trying _do in + * nonblocking mode - it will cause excessive transaction restarts and + * potentially livelocking: + */ #define drop_locks_do(_trans, _do) \ ({ \ bch2_trans_unlock(_trans); \ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 2d1c95c42f24..684397442338 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -92,7 +92,7 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) continue; bch2_btree_trans_to_text(out, i->trans); - bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1); + bch2_prt_task_backtrace(out, task, i == g->g ? 
5 : 1, GFP_NOWAIT); } } @@ -227,7 +227,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) prt_printf(&buf, "backtrace:"); prt_newline(&buf); printbuf_indent_add(&buf, 2); - bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2); + bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); printbuf_indent_sub(&buf, 2); prt_newline(&buf); } @@ -631,8 +631,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, } __flatten -bool bch2_btree_path_relock_norestart(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) +bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path) { struct get_locks_fail f; @@ -642,7 +641,7 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, int __bch2_btree_path_relock(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { - if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { + if (!bch2_btree_path_relock_norestart(trans, path)) { trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); } @@ -759,12 +758,39 @@ int bch2_trans_relock(struct btree_trans *trans) if (unlikely(trans->restarted)) return -((int) trans->restarted); - trans_for_each_path(trans, path, i) + trans_for_each_path(trans, path, i) { + struct get_locks_fail f; + if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { - trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); + !btree_path_get_locks(trans, path, false, &f)) { + if (trace_trans_restart_relock_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, path->pos); + prt_printf(&buf, " l=%u seq=%u node seq=", + f.l, path->l[f.l].lock_seq); + if (IS_ERR_OR_NULL(f.b)) { + prt_str(&buf, bch2_err_str(PTR_ERR(f.b))); + } else { + prt_printf(&buf, "%u", f.b->c.lock.seq); + + struct six_lock_count c 
= + bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l); + prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); + + c = six_lock_counts(&f.b->c.lock); + prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); + } + + trace_trans_restart_relock(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); + } + + count_event(trans->c, trans_restart_relock); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } + } + return 0; } @@ -778,7 +804,7 @@ int bch2_trans_relock_notrace(struct btree_trans *trans) trans_for_each_path(trans, path, i) if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + !bch2_btree_path_relock_norestart(trans, path)) { return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } return 0; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index cc5500a957a1..4bd72c855da1 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -312,8 +312,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *, /* relock: */ -bool bch2_btree_path_relock_norestart(struct btree_trans *, - struct btree_path *, unsigned long); +bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *); int __bch2_btree_path_relock(struct btree_trans *, struct btree_path *, unsigned long); @@ -353,12 +352,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, /* upgrade */ - -struct get_locks_fail { - unsigned l; - struct btree *b; -}; - bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, struct btree_path *, unsigned, struct get_locks_fail *); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 90eb8065ff2d..30d69a6d133e 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -139,8 +139,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); 
EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); - EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(trans->c, b)); + EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); k = bch2_btree_node_iter_peek_all(node_iter, b); @@ -160,7 +159,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, k->type = KEY_TYPE_deleted; if (k->needs_whiteout) - push_whiteout(trans->c, b, insert->k.p); + push_whiteout(b, insert->k.p); k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { @@ -348,9 +347,7 @@ static noinline void journal_transaction_name(struct btree_trans *trans) static inline int btree_key_can_insert(struct btree_trans *trans, struct btree *b, unsigned u64s) { - struct bch_fs *c = trans->c; - - if (!bch2_btree_node_insert_fits(c, b, u64s)) + if (!bch2_btree_node_insert_fits(b, u64s)) return -BCH_ERR_btree_insert_btree_node_full; return 0; @@ -418,7 +415,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags return 0; new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_k)) return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); @@ -448,9 +445,6 @@ static int run_one_mem_trigger(struct btree_trans *trans, if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; - if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id))) - return 0; - if (old_ops->trigger == new_ops->trigger) { ret = bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(new), @@ -586,9 +580,6 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { - struct bch_fs *c = trans->c; - int ret = 0; - trans_for_each_update(trans, i) { /* * XXX: synchronization of 
cached update triggers with gc @@ -596,14 +587,15 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) */ BUG_ON(i->cached || i->level); - if (gc_visited(c, gc_pos_btree_node(insert_l(trans, i)->b))) { - ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) && + gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { + int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); if (ret) - break; + return ret; } } - return ret; + return 0; } static inline int @@ -680,6 +672,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) return -BCH_ERR_btree_insert_need_mark_replicas; + /* XXX: we only want to run this if deltas are nonzero */ + bch2_trans_account_disk_usage_change(trans); + h = trans->hooks; while (h) { ret = h->fn(trans, h); @@ -689,8 +684,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, i->flags); + if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) { + ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags); if (ret) goto fatal_err; } @@ -994,6 +989,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) !trans->journal_entries_u64s) goto out_reset; + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); + ret = bch2_trans_commit_run_triggers(trans); if (ret) goto out_reset; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d530307046f4..4a5a64499eb7 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -430,6 +430,9 @@ struct btree_trans { struct journal_res journal_res; u64 *journal_seq; struct disk_reservation *disk_res; + + struct bch_fs_usage_base fs_usage_delta; + unsigned 
journal_u64s; unsigned extra_disk_res; /* XXX kill */ struct replicas_delta_list *fs_usage_deltas; @@ -653,7 +656,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); BIT_ULL(BKEY_TYPE_reflink)| \ BIT_ULL(BKEY_TYPE_btree)) -#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ +#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ (BIT_ULL(BKEY_TYPE_alloc)| \ BIT_ULL(BKEY_TYPE_inodes)| \ BIT_ULL(BKEY_TYPE_stripes)| \ @@ -661,7 +664,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); #define BTREE_NODE_TYPE_HAS_TRIGGERS \ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ - BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS) static inline bool btree_node_type_needs_gc(enum btree_node_type type) { @@ -738,4 +741,9 @@ enum btree_node_sibling { btree_next_sib, }; +struct get_locks_fail { + unsigned l; + struct btree *b; +}; + #endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 44f9dfa28a09..4530b14ff2c3 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -159,7 +159,7 @@ static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, { size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f); - return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); + return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b); } /* Btree node freeing/allocation: */ @@ -280,7 +280,8 @@ retry: writepoint_ptr(&c->btree_write_point), &devs_have, res->nr_replicas, - c->opts.metadata_replicas_required, + min(res->nr_replicas, + c->opts.metadata_replicas_required), watermark, 0, cl, &wp); if (unlikely(ret)) return ERR_PTR(ret); @@ -1097,7 +1098,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * Always check for space for two keys, even if we won't have to * split at prior level - it might have been a merge instead: */ - if (bch2_btree_node_insert_fits(c, path->l[update_level].b, + if 
(bch2_btree_node_insert_fits(path->l[update_level].b, BKEY_BTREE_PTR_U64s_MAX * 2)) break; @@ -1401,7 +1402,7 @@ static void __btree_split_node(struct btree_update *as, unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s + nr_keys[i].val_u64s; - if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c)) + if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b)) n[i]->data->format = b->format; btree_node_set_format(n[i], n[i]->data->format); @@ -1703,7 +1704,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_btree_node_prep_for_write(trans, path, b); - if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { + if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) { bch2_btree_node_unlock_write(trans, path, b); goto split; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index adfc62083844..c593c925d1e3 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -184,21 +184,19 @@ static inline void btree_node_reset_sib_u64s(struct btree *b) b->sib_u64s[1] = b->nr.live_u64s; } -static inline void *btree_data_end(struct bch_fs *c, struct btree *b) +static inline void *btree_data_end(struct btree *b) { - return (void *) b->data + btree_bytes(c); + return (void *) b->data + btree_buf_bytes(b); } -static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, - struct btree *b) +static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b) { - return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); + return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s); } -static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, - struct btree *b) +static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b) { - return btree_data_end(c, b); + return btree_data_end(b); } static inline void *write_block(struct btree *b) @@ -221,13 +219,11 @@ 
static inline bool bkey_written(struct btree *b, struct bkey_packed *k) return __btree_addr_written(b, k); } -static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, - struct btree *b, - void *end) +static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end) { ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + b->whiteout_u64s; - ssize_t total = c->opts.btree_node_size >> 3; + ssize_t total = btree_buf_bytes(b) >> 3; /* Always leave one extra u64 for bch2_varint_decode: */ used++; @@ -235,10 +231,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, return total - used; } -static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, - struct btree *b) +static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b) { - ssize_t remaining = __bch_btree_u64s_remaining(c, b, + ssize_t remaining = __bch2_btree_u64s_remaining(b, btree_bkey_last(b, bset_tree_last(b))); BUG_ON(remaining < 0); @@ -260,14 +255,13 @@ static inline unsigned btree_write_set_buffer(struct btree *b) return 8 << BTREE_WRITE_SET_U64s_BITS; } -static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, - struct btree *b) +static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b) { struct bset_tree *t = bset_tree_last(b); struct btree_node_entry *bne = max(write_block(b), (void *) btree_bkey_last(b, bset_tree_last(b))); ssize_t remaining_space = - __bch_btree_u64s_remaining(c, b, bne->keys.start); + __bch2_btree_u64s_remaining(b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) @@ -281,12 +275,11 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, return NULL; } -static inline void push_whiteout(struct bch_fs *c, struct btree *b, - struct bpos pos) +static inline void push_whiteout(struct btree *b, struct bpos pos) { struct bkey_packed k; - BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); + 
BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s); EBUG_ON(btree_node_just_written(b)); if (!bkey_pack_pos(&k, pos, b)) { @@ -299,20 +292,19 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b, k.needs_whiteout = true; b->whiteout_u64s += k.u64s; - bkey_p_copy(unwritten_whiteouts_start(c, b), &k); + bkey_p_copy(unwritten_whiteouts_start(b), &k); } /* * write lock must be held on @b (else the dirty bset that we were going to * insert into could be written out from under us) */ -static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, - struct btree *b, unsigned u64s) +static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s) { if (unlikely(btree_node_need_rewrite(b))) return false; - return u64s <= bch_btree_keys_u64s_remaining(c, b); + return u64s <= bch2_btree_keys_u64s_remaining(b); } void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 5c1169c78daf..ac7844861966 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -125,13 +125,12 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite struct btree_write_buffered_key *wb, bool *write_locked, size_t *fast) { - struct bch_fs *c = trans->c; struct btree_path *path; int ret; EBUG_ON(!wb->journal_seq); - EBUG_ON(!c->btree_write_buffer.flushing.pin.seq); - EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); + EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); + EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); ret = bch2_btree_iter_traverse(iter); if (ret) @@ -155,7 +154,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite *write_locked = true; } - if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) { + if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) { *write_locked = false; return 
wb_flush_one_slowpath(trans, iter, wb); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d83ea0e53df3..54f7826ac498 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -25,7 +25,7 @@ #include <linux/preempt.h> -static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, +static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, enum bch_data_type data_type, s64 sectors) { @@ -54,20 +54,20 @@ void bch2_fs_usage_initialize(struct bch_fs *c) bch2_fs_usage_acc_to_base(c, i); for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) - usage->reserved += usage->persistent_reserved[i]; + usage->b.reserved += usage->persistent_reserved[i]; for (unsigned i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); - fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); + fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]); } for_each_member_device(c, ca) { struct bch_dev_usage dev = bch2_dev_usage_read(ca); - usage->hidden += (dev.d[BCH_DATA_sb].buckets + - dev.d[BCH_DATA_journal].buckets) * + usage->b.hidden += (dev.d[BCH_DATA_sb].buckets + + dev.d[BCH_DATA_journal].buckets) * ca->mi.bucket_size; } @@ -188,15 +188,15 @@ void bch2_fs_usage_to_text(struct printbuf *out, prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); prt_printf(out, "hidden:\t\t\t\t%llu\n", - fs_usage->u.hidden); + fs_usage->u.b.hidden); prt_printf(out, "data:\t\t\t\t%llu\n", - fs_usage->u.data); + fs_usage->u.b.data); prt_printf(out, "cached:\t\t\t\t%llu\n", - fs_usage->u.cached); + fs_usage->u.b.cached); prt_printf(out, "reserved:\t\t\t%llu\n", - fs_usage->u.reserved); + fs_usage->u.b.reserved); prt_printf(out, "nr_inodes:\t\t\t%llu\n", - fs_usage->u.nr_inodes); + fs_usage->u.b.nr_inodes); prt_printf(out, "online reserved:\t\t%llu\n", fs_usage->online_reserved); @@ -225,10 +225,10 @@ static u64 reserve_factor(u64 r) u64 bch2_fs_sectors_used(struct bch_fs *c, 
struct bch_fs_usage_online *fs_usage) { - return min(fs_usage->u.hidden + - fs_usage->u.btree + - fs_usage->u.data + - reserve_factor(fs_usage->u.reserved + + return min(fs_usage->u.b.hidden + + fs_usage->u.b.btree + + fs_usage->u.b.data + + reserve_factor(fs_usage->u.b.reserved + fs_usage->online_reserved), c->capacity); } @@ -240,17 +240,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c) u64 data, reserved; ret.capacity = c->capacity - - bch2_fs_usage_read_one(c, &c->usage_base->hidden); + bch2_fs_usage_read_one(c, &c->usage_base->b.hidden); - data = bch2_fs_usage_read_one(c, &c->usage_base->data) + - bch2_fs_usage_read_one(c, &c->usage_base->btree); - reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + + data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) + + bch2_fs_usage_read_one(c, &c->usage_base->b.btree); + reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) + percpu_u64_get(c->online_reserved); ret.used = min(ret.capacity, data + reserve_factor(reserved)); ret.free = ret.capacity - ret.used; - ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); + ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes); return ret; } @@ -284,7 +284,7 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) prt_newline(out); for (unsigned i = 0; i < BCH_DATA_NR; i++) { - prt_str(out, bch2_data_types[i]); + bch2_prt_data_type(out, i); prt_tab(out); prt_u64(out, usage->d[i].buckets); prt_tab_rjust(out); @@ -308,9 +308,9 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, fs_usage = fs_usage_ptr(c, journal_seq, gc); if (data_type_is_hidden(old->data_type)) - fs_usage->hidden -= ca->mi.bucket_size; + fs_usage->b.hidden -= ca->mi.bucket_size; if (data_type_is_hidden(new->data_type)) - fs_usage->hidden += ca->mi.bucket_size; + fs_usage->b.hidden += ca->mi.bucket_size; u = dev_usage_ptr(ca, journal_seq, gc); @@ -359,7 +359,7 @@ static inline int __update_replicas(struct bch_fs 
*c, if (idx < 0) return -1; - fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); fs_usage->replicas[idx] += sectors; return 0; } @@ -394,7 +394,7 @@ int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k, preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); - fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); fs_usage->replicas[idx] += sectors; preempt_enable(); err: @@ -523,8 +523,8 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (bch2_fs_inconsistent_on(g->data_type && g->data_type != data_type, c, "different types of data in same bucket: %s, %s", - bch2_data_types[g->data_type], - bch2_data_types[data_type])) { + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type))) { ret = -EIO; goto err; } @@ -532,7 +532,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", ca->dev_idx, b, g->gen, - bch2_data_types[g->data_type ?: data_type], + bch2_data_type_str(g->data_type ?: data_type), g->dirty_sectors, sectors)) { ret = -EIO; goto err; @@ -575,7 +575,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EIO; @@ -588,7 +588,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: 
ptr_data_type), ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -603,7 +603,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "while marking %s", ptr->dev, bucket_nr, b_gen, *bucket_gen(ca, bucket_nr), - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -624,8 +624,8 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type], - bch2_data_types[ptr_data_type], + bch2_data_type_str(bucket_data_type), + bch2_data_type_str(ptr_data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EIO; @@ -638,7 +638,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), bucket_sectors, sectors, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -677,11 +677,11 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans, BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); } - dst->nr_inodes -= deltas->nr_inodes; + dst->b.nr_inodes -= deltas->nr_inodes; for (i = 0; i < BCH_REPLICAS_MAX; i++) { added -= deltas->persistent_reserved[i]; - dst->reserved -= deltas->persistent_reserved[i]; + dst->b.reserved -= deltas->persistent_reserved[i]; dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; } @@ -694,48 +694,25 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans, percpu_up_read(&c->mark_lock); } -int bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct replicas_delta_list *deltas) +void bch2_trans_account_disk_usage_change(struct 
btree_trans *trans) { struct bch_fs *c = trans->c; + u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; static int warned_disk_usage = 0; bool warn = false; - u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; - struct replicas_delta *d, *d2; - struct replicas_delta *top = (void *) deltas->d + deltas->used; - struct bch_fs_usage *dst; - s64 added = 0, should_not_have_added; - unsigned i; percpu_down_read(&c->mark_lock); preempt_disable(); - dst = fs_usage_ptr(c, trans->journal_res.seq, false); - - for (d = deltas->d; d != top; d = replicas_delta_next(d)) { - switch (d->r.data_type) { - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - added += d->delta; - } + struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b; + struct bch_fs_usage_base *src = &trans->fs_usage_delta; - if (__update_replicas(c, dst, &d->r, d->delta)) - goto need_mark; - } - - dst->nr_inodes += deltas->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - added += deltas->persistent_reserved[i]; - dst->reserved += deltas->persistent_reserved[i]; - dst->persistent_reserved[i] += deltas->persistent_reserved[i]; - } + s64 added = src->btree + src->data + src->reserved; /* * Not allowed to reduce sectors_available except by getting a * reservation: */ - should_not_have_added = added - (s64) disk_res_sectors; + s64 should_not_have_added = added - (s64) disk_res_sectors; if (unlikely(should_not_have_added > 0)) { u64 old, new, v = atomic64_read(&c->sectors_available); @@ -754,6 +731,13 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, this_cpu_sub(*c->online_reserved, added); } + dst->hidden += src->hidden; + dst->btree += src->btree; + dst->data += src->data; + dst->cached += src->cached; + dst->reserved += src->reserved; + dst->nr_inodes += src->nr_inodes; + preempt_enable(); percpu_up_read(&c->mark_lock); @@ -761,6 +745,34 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, 
bch2_trans_inconsistent(trans, "disk usage increased %lli more than %llu sectors reserved)", should_not_have_added, disk_res_sectors); +} + +int bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + struct replicas_delta *d, *d2; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; + unsigned i; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + + for (d = deltas->d; d != top; d = replicas_delta_next(d)) + if (__update_replicas(c, dst, &d->r, d->delta)) + goto need_mark; + + dst->b.nr_inodes += deltas->nr_inodes; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + dst->b.reserved += deltas->persistent_reserved[i]; + dst->persistent_reserved[i] += deltas->persistent_reserved[i]; + } + + preempt_enable(); + percpu_up_read(&c->mark_lock); return 0; need_mark: /* revert changes: */ @@ -1084,7 +1096,7 @@ static int __trigger_reservation(struct btree_trans *trans, struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc); replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved)); - fs_usage->reserved += sectors; + fs_usage->b.reserved += sectors; fs_usage->persistent_reserved[replicas - 1] += sectors; preempt_enable(); @@ -1130,9 +1142,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], - bch2_data_types[type], - bch2_data_types[type]); + bch2_data_type_str(a->v.data_type), + bch2_data_type_str(type), + bch2_data_type_str(type)); ret = -EIO; goto err; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 2c95cc5d86be..6387e039f789 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -356,6 +356,8 @@ int bch2_trigger_reservation(struct btree_trans *, enum btree_id, 
unsigned, ret; \ }) +void bch2_trans_account_disk_usage_change(struct btree_trans *); + void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); @@ -385,6 +387,21 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) return false; } +static inline const char *bch2_data_type_str(enum bch_data_type type) +{ + return type < BCH_DATA_NR + ? __bch2_data_types[type] + : "(invalid data type)"; +} + +static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type) +{ + if (type < BCH_DATA_NR) + prt_str(out, __bch2_data_types[type]); + else + prt_printf(out, "(invalid data type %u)", type); +} + /* disk reservations: */ static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 783f71017204..6a31740222a7 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -45,23 +45,18 @@ struct bch_dev_usage { } d[BCH_DATA_NR]; }; -struct bch_fs_usage { - /* all fields are in units of 512 byte sectors: */ +struct bch_fs_usage_base { u64 hidden; u64 btree; u64 data; u64 cached; u64 reserved; u64 nr_inodes; +}; - /* XXX: add stats for compression ratio */ -#if 0 - u64 uncompressed; - u64 compressed; -#endif - - /* broken out: */ - +struct bch_fs_usage { + /* all fields are in units of 512 byte sectors: */ + struct bch_fs_usage_base b; u64 persistent_reserved[BCH_REPLICAS_MAX]; u64 replicas[]; }; diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index f41889093a2c..363644451106 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -109,7 +109,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); - while (1) { + do { set_current_state(TASK_INTERRUPTIBLE); if (kthread && kthread_should_stop()) break; @@ -119,7 +119,7 @@ void 
bch2_kthread_io_clock_wait(struct io_clock *clock, schedule(); try_to_freeze(); - } + } while (0); __set_current_state(TASK_RUNNING); del_timer_sync(&wait.cpu_timer); diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 607fd5e232c9..58c2eb45570f 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -47,6 +47,14 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } +static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type) +{ + if (type < BCH_COMPRESSION_TYPE_NR) + prt_str(out, __bch2_compression_types[type]); + else + prt_printf(out, "(invalid compression type %u)", type); +} + int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, struct bch_extent_crc_unpacked *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 6f13477ff652..4150feca42a2 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -285,9 +285,7 @@ restart_drop_extra_replicas: k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, insert, - op->opts.background_target, - op->opts.background_compression) ?: + bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, &op->res, @@ -529,7 +527,7 @@ int bch2_data_update_init(struct btree_trans *trans, BCH_WRITE_DATA_ENCODED| BCH_WRITE_MOVE| m->data_opts.write_flags; - m->op.compression_opt = io_opts.background_compression ?: io_opts.compression; + m->op.compression_opt = background_compression(io_opts); m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; bkey_for_each_ptr(ptrs, ptr) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 
d6418948495f..7bdba8507fc9 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -44,19 +44,19 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, return false; bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_sorted, btree_bytes(c)), + buf_pages(n_sorted, btree_buf_bytes(b)), REQ_OP_READ|REQ_META, GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_sorted, btree_bytes(c)); + bch2_bio_map(bio, n_sorted, btree_buf_bytes(b)); submit_bio_wait(bio); bio_put(bio); percpu_ref_put(&ca->io_ref); - memcpy(n_ondisk, n_sorted, btree_bytes(c)); + memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); v->written = 0; if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) @@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { - c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!c->verify_ondisk) goto out; } @@ -199,19 +199,19 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!n_ondisk) { prt_printf(out, "memory allocation failure\n"); goto out; } bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_ondisk, btree_bytes(c)), + buf_pages(n_ondisk, btree_buf_bytes(b)), REQ_OP_READ|REQ_META, GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_ondisk, btree_bytes(c)); + bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b)); ret = submit_bio_wait(bio); if (ret) { @@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, out: if (bio) bio_put(bio); - kvpfree(n_ondisk, btree_bytes(c)); + kvpfree(n_ondisk, btree_buf_bytes(b)); percpu_ref_put(&ca->io_ref); } @@ -627,7 +627,7 @@ restart: prt_printf(&i->buf, "backtrace:"); 
prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, task, 0); + bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h new file mode 100644 index 000000000000..5e116b88e814 --- /dev/null +++ b/fs/bcachefs/dirent_format.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DIRENT_FORMAT_H +#define _BCACHEFS_DIRENT_FORMAT_H + +/* + * Dirents (and xattrs) have to implement string lookups; since our b-tree + * doesn't support arbitrary length strings for the key, we instead index by a + * 64 bit hash (currently truncated sha1) of the string, stored in the offset + * field of the key - using linear probing to resolve hash collisions. This also + * provides us with the readdir cookie posix requires. + * + * Linear probing requires us to use whiteouts for deletions, in the event of a + * collision: + */ + +struct bch_dirent { + struct bch_val v; + + /* Target inode number: */ + union { + __le64 d_inum; + struct { /* DT_SUBVOL */ + __le32 d_child_subvol; + __le32 d_parent_subvol; + }; + }; + + /* + * Copy of mode bits 12-15 from the target inode - so userspace can get + * the filetype without having to do a stat() + */ + __u8 d_type; + + __u8 d_name[]; +} __packed __aligned(8); + +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + +#define BCH_NAME_MAX 512 + +#endif /* _BCACHEFS_DIRENT_FORMAT_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d802bc63c8d0..d503af270024 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -190,7 +190,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, a->v.stripe_redundancy, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], + bch2_data_type_str(a->v.data_type), a->v.dirty_sectors, 
a->v.stripe, s.k->p.offset)) { ret = -EIO; @@ -200,7 +200,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], + bch2_data_type_str(a->v.data_type), a->v.dirty_sectors, s.k->p.offset)) { ret = -EIO; @@ -367,7 +367,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, } } - if (!(flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))) { + if (flags & BTREE_TRIGGER_ATOMIC) { struct stripe *m = genradix_ptr(&c->stripes, idx); if (!m) { diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h new file mode 100644 index 000000000000..44ce88ba08d7 --- /dev/null +++ b/fs/bcachefs/ec_format.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_FORMAT_H +#define _BCACHEFS_EC_FORMAT_H + +struct bch_stripe { + struct bch_val v; + __le16 sectors; + __u8 algorithm; + __u8 nr_blocks; + __u8 nr_redundant; + + __u8 csum_granularity_bits; + __u8 csum_type; + __u8 pad; + + struct bch_extent_ptr ptrs[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_EC_FORMAT_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 82ec056f4cdb..61395b113df9 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "bkey_methods.h" +#include "btree_cache.h" #include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" @@ -1018,12 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ", crc.compressed_size, crc.uncompressed_size, crc.offset, crc.nonce, - 
bch2_csum_types[crc.csum_type], - bch2_compression_types[crc.compression_type]); + bch2_csum_types[crc.csum_type]); + bch2_prt_compression_type(out, crc.compression_type); break; } case BCH_EXTENT_ENTRY_stripe_ptr: { @@ -1334,10 +1335,12 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) } int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, - unsigned target, unsigned compression) + struct bch_io_opts *opts) { struct bkey_s k = bkey_i_to_s(_k); struct bch_extent_rebalance *r; + unsigned target = opts->background_target; + unsigned compression = background_compression(*opts); bool needs_rebalance; if (!bkey_extent_is_direct_data(k.k)) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index a855c94d43dd..6bf839d69e84 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -708,7 +708,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, - unsigned, unsigned); + struct bch_io_opts *); /* Generic extent code: */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h new file mode 100644 index 000000000000..3bd2fdbb0817 --- /dev/null +++ b/fs/bcachefs/extents_format.h @@ -0,0 +1,295 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_FORMAT_H +#define _BCACHEFS_EXTENTS_FORMAT_H + +/* + * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally + * preceded by checksum/compression information (bch_extent_crc32 or + * bch_extent_crc64). 
+ * + * One major determining factor in the format of extents is how we handle and + * represent extents that have been partially overwritten and thus trimmed: + * + * If an extent is not checksummed or compressed, when the extent is trimmed we + * don't have to remember the extent we originally allocated and wrote: we can + * merely adjust ptr->offset to point to the start of the data that is currently + * live. The size field in struct bkey records the current (live) size of the + * extent, and is also used to mean "size of region on disk that we point to" in + * this case. + * + * Thus an extent that is not checksummed or compressed will consist only of a + * list of bch_extent_ptrs, with none of the fields in + * bch_extent_crc32/bch_extent_crc64. + * + * When an extent is checksummed or compressed, it's not possible to read only + * the data that is currently live: we have to read the entire extent that was + * originally written, and then return only the part of the extent that is + * currently live. + * + * Thus, in addition to the current size of the extent in struct bkey, we need + * to store the size of the originally allocated space - this is the + * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, + * when the extent is trimmed, instead of modifying the offset field of the + * pointer, we keep a second smaller offset field - "offset into the original + * extent of the currently live region". + * + * The other major determining factor is replication and data migration: + * + * Each pointer may have its own bch_extent_crc32/64. 
When doing a replicated + * write, we will initially write all the replicas in the same format, with the + * same checksum type and compression format - however, when copygc runs later (or + * tiering/cache promotion, anything that moves data), it is not in general + * going to rewrite all the pointers at once - one of the replicas may be in a + * bucket on one device that has very little fragmentation while another lives + * in a bucket that has become heavily fragmented, and thus is being rewritten + * sooner than the rest. + * + * Thus it will only move a subset of the pointers (or in the case of + * tiering/cache promotion perhaps add a single pointer without dropping any + * current pointers), and if the extent has been partially overwritten it must + * write only the currently live portion (or copygc would not be able to reduce + * fragmentation!) - which necessitates a different bch_extent_crc format for + * the new pointer. + * + * But in the interests of space efficiency, we don't want to store one + * bch_extent_crc for each pointer if we don't have to. + * + * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and + * bch_extent_ptrs appended arbitrarily one after the other. We determine the + * type of a given entry with a scheme similar to utf8 (except we're encoding a + * type, not a size), encoding the type in the position of the first set bit: + * + * bch_extent_ptr - 0b1 + * bch_extent_crc32 - 0b10 + * bch_extent_crc64 - 0b100 + * + * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and + * bch_extent_crc64 is the least constrained). + * + * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, + * until the next bch_extent_crc32/64. + * + * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer + * is neither checksummed nor compressed.
+ */ + +#define BCH_EXTENT_ENTRY_TYPES() \ + x(ptr, 0) \ + x(crc32, 1) \ + x(crc64, 2) \ + x(crc128, 3) \ + x(stripe_ptr, 4) \ + x(rebalance, 5) +#define BCH_EXTENT_ENTRY_MAX 6 + +enum bch_extent_entry_type { +#define x(f, n) BCH_EXTENT_ENTRY_##f = n, + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +/* Compressed/uncompressed size are stored biased by 1: */ +struct bch_extent_crc32 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 type:2, + _compressed_size:7, + _uncompressed_size:7, + offset:7, + _unused:1, + csum_type:4, + compression_type:4; + __u32 csum; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u32 csum; + __u32 compression_type:4, + csum_type:4, + _unused:1, + offset:7, + _uncompressed_size:7, + _compressed_size:7, + type:2; +#endif +} __packed __aligned(8); + +#define CRC32_SIZE_MAX (1U << 7) +#define CRC32_NONCE_MAX 0 + +struct bch_extent_crc64 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:3, + _compressed_size:9, + _uncompressed_size:9, + offset:9, + nonce:10, + csum_type:4, + compression_type:4, + csum_hi:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 csum_hi:16, + compression_type:4, + csum_type:4, + nonce:10, + offset:9, + _uncompressed_size:9, + _compressed_size:9, + type:3; +#endif + __u64 csum_lo; +} __packed __aligned(8); + +#define CRC64_SIZE_MAX (1U << 9) +#define CRC64_NONCE_MAX ((1U << 10) - 1) + +struct bch_extent_crc128 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:4, + _compressed_size:13, + _uncompressed_size:13, + offset:13, + nonce:13, + csum_type:4, + compression_type:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 compression_type:4, + csum_type:4, + nonce:13, + offset:13, + _uncompressed_size:13, + _compressed_size:13, + type:4; +#endif + struct bch_csum csum; +} __packed __aligned(8); + +#define CRC128_SIZE_MAX (1U << 13) +#define CRC128_NONCE_MAX ((1U << 13) - 1) + +/* + * @reservation - pointer hasn't been written to, just reserved + */ +struct bch_extent_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:1, + 
cached:1, + unused:1, + unwritten:1, + offset:44, /* 8 petabytes */ + dev:8, + gen:8; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 gen:8, + dev:8, + offset:44, + unwritten:1, + unused:1, + cached:1, + type:1; +#endif +} __packed __aligned(8); + +struct bch_extent_stripe_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:5, + block:8, + redundancy:4, + idx:47; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 idx:47, + redundancy:4, + block:8, + type:5; +#endif +}; + +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:34, + compression:8, /* enum bch_compression_opt */ + target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 target:16, + compression:8, + unused:34, + type:6; +#endif +}; + +union bch_extent_entry { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 + unsigned long type; +#elif __BITS_PER_LONG == 32 + struct { + unsigned long pad; + unsigned long type; + }; +#else +#error edit for your odd byteorder. +#endif + +#define x(f, n) struct bch_extent_##f f; + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +struct bch_btree_ptr { + struct bch_val v; + + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +struct bch_btree_ptr_v2 { + struct bch_val v; + + __u64 mem_ptr; + __le64 seq; + __le16 sectors_written; + __le16 flags; + struct bpos min_key; + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); + +struct bch_extent { + struct bch_val v; + + __u64 _data[0]; + union bch_extent_entry start[]; +} __packed __aligned(8); + +/* Maximum size (in u64s) a single pointer could be: */ +#define BKEY_EXTENT_PTR_U64s_MAX\ + ((sizeof(struct bch_extent_crc128) + \ + sizeof(struct bch_extent_ptr)) / sizeof(__u64)) + +/* Maximum possible size of an entire extent value: */ +#define BKEY_EXTENT_VAL_U64s_MAX \ + (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) + +/* * Maximum 
possible size of an entire extent, key + value: */ +#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) + +/* Btree pointers don't carry around checksums: */ +#define BKEY_BTREE_PTR_VAL_U64s_MAX \ + ((sizeof(struct bch_btree_ptr_v2) + \ + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) +#define BKEY_BTREE_PTR_U64s_MAX \ + (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) + +struct bch_reservation { + struct bch_val v; + + __le32 generation; + __u8 nr_replicas; + __u8 pad[3]; +} __packed __aligned(8); + +struct bch_inline_data { + struct bch_val v; + u8 data[]; +}; + +#endif /* _BCACHEFS_EXTENTS_FORMAT_H */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 9637f636e32d..b04750dbf870 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -156,7 +156,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) } #define eytzinger1_for_each(_i, _size) \ - for ((_i) = eytzinger1_first((_size)); \ + for (unsigned (_i) = eytzinger1_first((_size)); \ (_i) != 0; \ (_i) = eytzinger1_next((_i), (_size))) @@ -227,7 +227,7 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) } #define eytzinger0_for_each(_i, _size) \ - for ((_i) = eytzinger0_first((_size)); \ + for (unsigned (_i) = eytzinger0_first((_size)); \ (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 73c12e565af5..27710cdd5710 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -303,18 +303,6 @@ void bch2_readahead(struct readahead_control *ractl) darray_exit(&readpages_iter.folios); } -static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum, struct folio *folio) -{ - bch2_folio_create(folio, __GFP_NOFAIL); - - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - 
bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0)); -} - static void bch2_read_single_folio_end_io(struct bio *bio) { complete(bio->bi_private); @@ -329,6 +317,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) int ret; DECLARE_COMPLETION_ONSTACK(done); + if (!bch2_folio_create(folio, GFP_KERNEL)) + return -ENOMEM; + bch2_inode_opts_get(&opts, c, &inode->ei_inode); rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), @@ -336,7 +327,11 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - __bchfs_readfolio(c, rbio, inode_inum(inode), folio); + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index fdd57c5785c9..33cb6da3a5ad 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -77,6 +77,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) bch2_inode_opts_get(&opts, c, &inode->ei_inode); + /* bios must be 512 byte aligned: */ + if ((offset|iter->count) & (SECTOR_SIZE - 1)) + return -EINVAL; + ret = min_t(loff_t, iter->count, max_t(loff_t, 0, i_size_read(&inode->v) - offset)); @@ -84,6 +88,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) return ret; shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); + if (shorten >= iter->count) + shorten = 0; iter->count -= shorten; bio = bio_alloc_bioset(NULL, diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index ff664fd0d8ef..d359aa9b33b8 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ 
b/fs/bcachefs/fs-io-pagecache.c @@ -309,39 +309,49 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, } } -void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, - u64 start, u64 end) +int bch2_mark_pagecache_reserved(struct bch_inode_info *inode, + u64 *start, u64 end, + bool nonblocking) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t index = *start >> PAGE_SECTORS_SHIFT; pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; struct folio_batch fbatch; s64 i_sectors_delta = 0; - unsigned i, j; + int ret = 0; - if (end <= start) - return; + if (end <= *start) + return 0; folio_batch_init(&fbatch); while (filemap_get_folios(inode->v.i_mapping, &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { + for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; + + if (!nonblocking) + folio_lock(folio); + else if (!folio_trylock(folio)) { + folio_batch_release(&fbatch); + ret = -EAGAIN; + break; + } + u64 folio_start = folio_sector(folio); u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; BUG_ON(end <= folio_start); - folio_lock(folio); - s = bch2_folio(folio); + *start = min(end, folio_end); + struct bch_folio *s = bch2_folio(folio); if (s) { + unsigned folio_offset = max(*start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) { + for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) { i_sectors_delta -= s->s[j].state == SECTOR_dirty; bch2_folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); @@ -356,6 +366,7 @@ void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, } bch2_i_sectors_acct(c, inode, 
NULL, i_sectors_delta); + return ret; } static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index 27f712ae37a6..8cbaba6565b4 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -143,7 +143,7 @@ int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); -void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64); +int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool); int bch2_get_folio_disk_reservation(struct bch_fs *, struct bch_inode_info *, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 98bd5babab19..8c70123b6a0c 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -79,7 +79,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, continue; bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, - REQ_OP_FLUSH, + REQ_OP_WRITE|REQ_PREFLUSH, GFP_KERNEL, &c->nocow_flush_bioset), struct nocow_flush, bio); @@ -675,8 +675,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); - drop_locks_do(trans, - (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); + if (bch2_mark_pagecache_reserved(inode, &hole_start, + iter.pos.offset, true)) + drop_locks_do(trans, + bch2_mark_pagecache_reserved(inode, &hole_start, + iter.pos.offset, false)); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 1cbc5807bc80..3dc8630ff9fe 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -337,11 +337,12 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) 
create_flags |= BCH_CREATE_SNAPSHOT_RO; - /* why do we need this lock? */ - down_read(&c->vfs_sb->s_umount); - - if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) { + /* sync_inodes_sb enforce s_umount is locked */ + down_read(&c->vfs_sb->s_umount); sync_inodes_sb(c->vfs_sb); + up_read(&c->vfs_sb->s_umount); + } retry: if (arg.src_ptr) { error = user_path_at(arg.dirfd, @@ -425,8 +426,6 @@ err2: goto retry; } err1: - up_read(&c->vfs_sb->s_umount); - return error; } @@ -456,6 +455,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, if (IS_ERR(victim)) return PTR_ERR(victim); + dir = d_inode(path.dentry); if (victim->d_sb->s_fs_info != c) { ret = -EXDEV; goto err; @@ -464,14 +464,13 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, ret = -ENOENT; goto err; } - dir = d_inode(path.dentry); ret = __bch2_unlink(dir, victim, true); if (!ret) { fsnotify_rmdir(dir, victim); d_delete(victim); } - inode_unlock(dir); err: + inode_unlock(dir); dput(victim); path_put(&path); return ret; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index ec419b8e2c43..77ae65542db9 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -435,7 +435,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, bch2_subvol_is_ro(c, inode->ei_subvol) ?: __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) - return ret; + return bch2_err_class(ret); ihold(&inode->v); d_instantiate(dentry, &inode->v); @@ -487,8 +487,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct bch_inode_info *dir= to_bch_ei(vdir); struct bch_fs *c = dir->v.i_sb->s_fs_info; - return bch2_subvol_is_ro(c, dir->ei_subvol) ?: + int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: __bch2_unlink(vdir, dentry, false); + return bch2_err_class(ret); } static int bch2_symlink(struct mnt_idmap *idmap, @@ -523,7 +524,7 @@ static int bch2_symlink(struct mnt_idmap *idmap, return 0; err: iput(&inode->v); - 
return ret; + return bch2_err_class(ret); } static int bch2_mkdir(struct mnt_idmap *idmap, @@ -641,7 +642,7 @@ err: src_inode, dst_inode); - return ret; + return bch2_err_class(ret); } static void bch2_setattr_copy(struct mnt_idmap *idmap, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 4f0ecd605675..6a760777bafb 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -119,22 +119,19 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr, if (!ret) *snapshot = iter.pos.snapshot; err: - bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot); bch2_trans_iter_exit(trans, &iter); return ret; } -static int __lookup_dirent(struct btree_trans *trans, +static int lookup_dirent_in_snapshot(struct btree_trans *trans, struct bch_hash_info hash_info, subvol_inum dir, struct qstr *name, - u64 *target, unsigned *type) + u64 *target, unsigned *type, u32 snapshot) { struct btree_iter iter; struct bkey_s_c_dirent d; - int ret; - - ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, - &hash_info, dir, name, 0); + int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0, snapshot); if (ret) return ret; @@ -225,15 +222,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; - ret = lookup_inode(trans, root_inum.inum, &root_inode, &snapshot); + u32 root_inode_snapshot = snapshot; + ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); bch_err_msg(c, ret, "looking up root inode"); if (ret) return ret; root_hash_info = bch2_hash_info_init(c, &root_inode); - ret = __lookup_dirent(trans, root_hash_info, root_inum, - &lostfound_str, &inum, &d_type); + ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, + &lostfound_str, &inum, &d_type, snapshot); if (bch2_err_matches(ret, ENOENT)) goto create_lostfound; @@ -250,7 +248,10 @@ static int lookup_lostfound(struct 
btree_trans *trans, u32 snapshot, * The bch2_check_dirents pass has already run, dangling dirents * shouldn't exist here: */ - return lookup_inode(trans, inum, lostfound, &snapshot); + ret = lookup_inode(trans, inum, lostfound, &snapshot); + bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", + inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); + return ret; create_lostfound: /* diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 37dce96f48ac..086f0090b03a 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -506,22 +506,33 @@ fsck_err: static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - prt_printf(out, "mode=%o ", inode->bi_mode); + printbuf_indent_add(out, 2); + prt_printf(out, "mode=%o", inode->bi_mode); + prt_newline(out); prt_str(out, "flags="); prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); prt_printf(out, " (%x)", inode->bi_flags); + prt_newline(out); - prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu", - inode->bi_journal_seq, - inode->bi_size, - inode->bi_sectors, - inode->bi_version); + prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq); + prt_newline(out); + + prt_printf(out, "bi_size=%llu", inode->bi_size); + prt_newline(out); + + prt_printf(out, "bi_sectors=%llu", inode->bi_sectors); + prt_newline(out); + + prt_newline(out); + prt_printf(out, "bi_version=%llu", inode->bi_version); #define x(_name, _bits) \ - prt_printf(out, " "#_name "=%llu", (u64) inode->_name); + prt_printf(out, #_name "=%llu", (u64) inode->_name); \ + prt_newline(out); BCH_INODE_FIELDS_v3() #undef x + printbuf_indent_sub(out, 2); } void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) @@ -587,7 +598,7 @@ int bch2_trigger_inode(struct btree_trans *trans, } } - if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & 
BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { BUG_ON(!trans->journal_res.seq); bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); @@ -597,7 +608,7 @@ int bch2_trigger_inode(struct btree_trans *trans, struct bch_fs *c = trans->c; percpu_down_read(&c->mark_lock); - this_cpu_add(c->usage_gc->nr_inodes, nr); + this_cpu_add(c->usage_gc->b.nr_inodes, nr); percpu_up_read(&c->mark_lock); } diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h new file mode 100644 index 000000000000..83d107331edf --- /dev/null +++ b/fs/bcachefs/inode_format.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_FORMAT_H +#define _BCACHEFS_INODE_FORMAT_H + +#define BLOCKDEV_INODE_MAX 4096 +#define BCACHEFS_ROOT_INO 4096 + +struct bch_inode { + struct bch_val v; + + __le64 bi_hash_seed; + __le32 bi_flags; + __le16 bi_mode; + __u8 fields[]; +} __packed __aligned(8); + +struct bch_inode_v2 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le16 bi_mode; + __u8 fields[]; +} __packed __aligned(8); + +struct bch_inode_v3 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le64 bi_sectors; + __le64 bi_size; + __le64 bi_version; + __u8 fields[]; +} __packed __aligned(8); + +#define INODEv3_FIELDS_START_INITIAL 6 +#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) + +struct bch_inode_generation { + struct bch_val v; + + __le32 bi_generation; + __le32 pad; +} __packed __aligned(8); + +/* + * bi_subvol and bi_parent_subvol are only set for subvolume roots: + */ + +#define BCH_INODE_FIELDS_v2() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_size, 64) \ + x(bi_sectors, 64) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + 
x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) + +#define BCH_INODE_FIELDS_v3() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) \ + x(bi_nocow, 8) + +/* subset of BCH_INODE_FIELDS */ +#define BCH_INODE_OPTS() \ + x(data_checksum, 8) \ + x(compression, 8) \ + x(project, 32) \ + x(background_compression, 8) \ + x(data_replicas, 8) \ + x(promote_target, 16) \ + x(foreground_target, 16) \ + x(background_target, 16) \ + x(erasure_code, 16) \ + x(nocow, 8) + +enum inode_opt_id { +#define x(name, ...) 
\ + Inode_opt_##name, + BCH_INODE_OPTS() +#undef x + Inode_opt_nr, +}; + +#define BCH_INODE_FLAGS() \ + x(sync, 0) \ + x(immutable, 1) \ + x(append, 2) \ + x(nodump, 3) \ + x(noatime, 4) \ + x(i_size_dirty, 5) \ + x(i_sectors_dirty, 6) \ + x(unlinked, 7) \ + x(backptr_untrusted, 8) + +/* bits 20+ reserved for packed fields below: */ + +enum bch_inode_flags { +#define x(t, n) BCH_INODE_##t = 1U << n, + BCH_INODE_FLAGS() +#undef x +}; + +enum __bch_inode_flags { +#define x(t, n) __BCH_INODE_##t = n, + BCH_INODE_FLAGS() +#undef x +}; + +LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); + +LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); +LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); +LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_FIELDS_START, + struct bch_inode_v3, bi_flags, 31, 36); +LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); + +#endif /* _BCACHEFS_INODE_FORMAT_H */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index ca6d5f516aa2..1baf78594cca 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -442,9 +442,7 @@ case LOGGED_OP_FINSERT_shift_extents: op->v.pos = cpu_to_le64(insert ? 
bkey_start_offset(&delete.k) : delete.k.p.offset); - ret = bch2_bkey_set_needs_rebalance(c, copy, - opts.background_target, - opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, &op->k_i) ?: diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 33c0e783d546..2c098ac017b3 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -362,9 +362,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bkey_start_pos(&sk.k->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = bch2_bkey_set_needs_rebalance(c, sk.k, - op->opts.background_target, - op->opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, @@ -1447,10 +1445,11 @@ err: op->flags |= BCH_WRITE_DONE; if (ret < 0) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s(): error: %s", __func__, bch2_err_str(ret)); + if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s(): error: %s", __func__, bch2_err_str(ret)); op->error = ret; break; } @@ -1565,6 +1564,7 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); + op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(bio)->put_bio = false; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 8538ef34f62b..bc890776eb57 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -27,6 +27,47 @@ static const char * const bch2_journal_errors[] = { NULL }; +static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) 
+{ + union journal_res_state s = READ_ONCE(j->reservations); + unsigned i = seq & JOURNAL_BUF_MASK; + struct journal_buf *buf = j->buf + i; + + prt_printf(out, "seq:"); + prt_tab(out); + prt_printf(out, "%llu", seq); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "refcount:"); + prt_tab(out); + prt_printf(out, "%u", journal_state_count(s, i)); + prt_newline(out); + + prt_printf(out, "size:"); + prt_tab(out); + prt_human_readable_u64(out, vstruct_bytes(buf->data)); + prt_newline(out); + + prt_printf(out, "expires"); + prt_tab(out); + prt_printf(out, "%li jiffies", buf->expires - jiffies); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) +{ + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 24); + + for (u64 seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) + bch2_journal_buf_to_text(out, j, seq); +} + static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { return seq > j->seq_ondisk; @@ -156,7 +197,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) * We don't close a journal_buf until the next journal_buf is finished writing, * and can be opened again - this also initializes the next journal_buf: */ -static void __journal_entry_close(struct journal *j, unsigned closed_val) +static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); @@ -185,7 +226,17 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - trace_journal_entry_close(c, vstruct_bytes(buf->data)); + if (trace_journal_entry_close_enabled() && trace) { + struct printbuf pbuf = PRINTBUF; + pbuf.atomic++; + + prt_str(&pbuf, "entry size: "); + prt_human_readable_u64(&pbuf, 
vstruct_bytes(buf->data)); + prt_newline(&pbuf); + bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT); + trace_journal_entry_close(c, pbuf.buf); + printbuf_exit(&pbuf); + } sectors = vstruct_blocks_plus(buf->data, c->block_bits, buf->u64s_reserved) << c->block_bits; @@ -225,7 +276,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) void bch2_journal_halt(struct journal *j) { spin_lock(&j->lock); - __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); if (!j->err_seq) j->err_seq = journal_cur_seq(j); journal_wake(j); @@ -239,7 +290,7 @@ static bool journal_entry_want_write(struct journal *j) /* Don't close it yet if we already have a write in flight: */ if (ret) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); else if (nr_unwritten_journal_entries(j)) { struct journal_buf *buf = journal_cur_buf(j); @@ -406,7 +457,7 @@ static void journal_write_work(struct work_struct *work) if (delta > 0) mod_delayed_work(c->io_complete_wq, &j->write_work, delta); else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); unlock: spin_unlock(&j->lock); } @@ -463,13 +514,21 @@ retry: buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); ret = journal_entry_open(j); if (ret == JOURNAL_ERR_max_in_flight) { track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], &j->max_in_flight_start, true); - trace_and_count(c, journal_entry_full, c); + if (trace_journal_entry_full_enabled()) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + + bch2_journal_bufs_to_text(&buf, j); + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + } + count_event(c, journal_entry_full); } unlock: can_discard = 
j->can_discard; @@ -549,7 +608,7 @@ void bch2_journal_entry_res_resize(struct journal *j, /* * Not enough room in current journal entry, have to flush it: */ - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); } else { journal_cur_buf(j)->u64s_reserved += d; } @@ -606,7 +665,7 @@ recheck_need_open: struct journal_res res = { 0 }; if (journal_entry_is_open(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); spin_unlock(&j->lock); @@ -786,7 +845,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou if (buf->need_flush_to_write_buffer) { if (seq == journal_cur_seq(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); union journal_res_state s; s.v = atomic64_read_acquire(&j->reservations.counter); @@ -1339,35 +1398,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) } prt_newline(out); - - for (u64 seq = journal_cur_seq(j); - seq >= journal_last_unwritten_seq(j); - --seq) { - unsigned i = seq & JOURNAL_BUF_MASK; - - prt_printf(out, "unwritten entry:"); - prt_tab(out); - prt_printf(out, "%llu", seq); - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "refcount:"); - prt_tab(out); - prt_printf(out, "%u", journal_state_count(s, i)); - prt_newline(out); - - prt_printf(out, "sectors:"); - prt_tab(out); - prt_printf(out, "%u", j->buf[i].sectors); - prt_newline(out); - - prt_printf(out, "expires"); - prt_tab(out); - prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies); - prt_newline(out); - - printbuf_indent_sub(out, 2); - } + prt_printf(out, "unwritten entries:"); + prt_newline(out); + bch2_journal_bufs_to_text(out, j); prt_printf(out, "replay done:\t\t%i\n", diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index b0f4dd491e12..47805193f18c 100644 --- a/fs/bcachefs/journal_io.c +++ 
b/fs/bcachefs/journal_io.c @@ -683,10 +683,7 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); for (i = 0; i < nr_types; i++) { - if (i < BCH_DATA_NR) - prt_printf(out, " %s", bch2_data_types[i]); - else - prt_printf(out, " (unknown data type %u)", i); + bch2_prt_data_type(out, i); prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", le64_to_cpu(u->d[i].buckets), le64_to_cpu(u->d[i].sectors), @@ -1481,6 +1478,8 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) c->opts.foreground_target; unsigned i, replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); + unsigned replicas_need = min_t(unsigned, replicas_want, + READ_ONCE(c->opts.metadata_replicas_required)); rcu_read_lock(); retry: @@ -1529,7 +1528,7 @@ done: BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; + return replicas >= replicas_need ? 
0 : -EROFS; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) @@ -1991,7 +1990,8 @@ CLOSURE_CALLBACK(bch2_journal_write) percpu_ref_get(&ca->io_ref); bio = ca->journal.bio; - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); + bio_reset(bio, ca->disk_sb.bdev, + REQ_OP_WRITE|REQ_PREFLUSH); bio->bi_end_io = journal_write_endio; bio->bi_private = ca; closure_bio_submit(bio, cl); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 820d25e19e5f..c33dca641575 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -205,7 +205,7 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; - if (nr_online < c->opts.metadata_replicas_required) { + if (nr_online < metadata_replicas_required(c)) { ret = JOURNAL_ERR_insufficient_devices; goto out; } @@ -892,9 +892,11 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) journal_seq_pin(j, seq)->devs); seq++; - spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, &replicas.e); - spin_lock(&j->lock); + if (replicas.e.nr_devs) { + spin_unlock(&j->lock); + ret = bch2_mark_replicas(c, &replicas.e); + spin_lock(&j->lock); + } } spin_unlock(&j->lock); err: diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h new file mode 100644 index 000000000000..6a4bf7129dba --- /dev/null +++ b/fs/bcachefs/logged_ops_format.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H +#define _BCACHEFS_LOGGED_OPS_FORMAT_H + +struct bch_logged_op_truncate { + struct bch_val v; + __le32 subvol; + __le32 pad; + __le64 inum; + __le64 new_i_size; +}; + +enum logged_op_finsert_state { + LOGGED_OP_FINSERT_start, + LOGGED_OP_FINSERT_shift_extents, + LOGGED_OP_FINSERT_finish, +}; + +struct bch_logged_op_finsert { + struct bch_val v; + __u8 state; + __u8 pad[3]; + __le32 subvol; + __le64 inum; + __le64 dst_offset; + __le64 src_offset; + __le64 pos; +}; + +#endif /* 
_BCACHEFS_LOGGED_OPS_FORMAT_H */ diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h index b2be565bb8f2..64df11ab422b 100644 --- a/fs/bcachefs/mean_and_variance.h +++ b/fs/bcachefs/mean_and_variance.h @@ -17,7 +17,7 @@ * Rust and rustc has issues with u128. */ -#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) +#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC) typedef struct { unsigned __int128 v; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7a33319dcd16..bf68ea49447b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -6,9 +6,11 @@ #include "backpointers.h" #include "bkey_buf.h" #include "btree_gc.h" +#include "btree_io.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" +#include "compress.h" #include "disk_groups.h" #include "ec.h" #include "errcode.h" @@ -34,12 +36,46 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k) +static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + printbuf_tabstop_push(out, 20); + prt_str(out, "rewrite ptrs:"); + prt_tab(out); + bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); + prt_newline(out); + + prt_str(out, "kill ptrs: "); + prt_tab(out); + bch2_prt_u64_base2(out, data_opts->kill_ptrs); + prt_newline(out); + + prt_str(out, "target: "); + prt_tab(out); + bch2_target_to_text(out, c, data_opts->target); + prt_newline(out); + + prt_str(out, "compression: "); + prt_tab(out); + bch2_compression_opt_to_text(out, background_compression(*io_opts)); + prt_newline(out); + + prt_str(out, "extra replicas: "); + prt_tab(out); + prt_u64(out, data_opts->extra_replicas); +} + +static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { if (trace_move_extent_enabled()) { 
struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); trace_move_extent(c, buf.buf); printbuf_exit(&buf); } @@ -111,6 +147,15 @@ static void move_write(struct moving_io *io) return; } + if (trace_move_extent_write_enabled()) { + struct bch_fs *c = io->write.op.c; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); + trace_move_extent_write(c, buf.buf); + printbuf_exit(&buf); + } + closure_get(&io->write.ctxt->cl); atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); atomic_inc(&io->write.ctxt->write_ios); @@ -241,9 +286,10 @@ int bch2_move_extent(struct moving_context *ctxt, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; + trace_move_extent2(c, k, &io_opts, &data_opts); + if (ctxt->stats) ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); - trace_move_extent2(c, k); bch2_data_update_opts_normalize(k, &data_opts); @@ -759,6 +805,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (!b) goto next; + unsigned sectors = btree_ptr_sectors_written(&b->key); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); bch2_trans_iter_exit(trans, &iter); @@ -768,11 +816,10 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, goto err; if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, - c->opts.btree_node_size >> 9); + bch2_ratelimit_increment(ctxt->rate, sectors); if (ctxt->stats) { - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); + atomic64_add(sectors, &ctxt->stats->sectors_seen); + atomic64_add(sectors, &ctxt->stats->sectors_moved); } } next: @@ -1083,9 +1130,9 @@ int bch2_data_job(struct bch_fs *c, void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) { - prt_printf(out, "%s: data type=%s pos=", - stats->name, - bch2_data_types[stats->data_type]); + prt_printf(out, "%s: 
data type==", stats->name); + bch2_prt_data_type(out, stats->data_type); + prt_str(out, " pos="); bch2_bbpos_to_text(out, stats->pos); prt_newline(out); printbuf_indent_add(out, 2); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 8e6f230eac38..b1ed0b9a20d3 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -52,7 +52,7 @@ const char * const bch2_csum_opts[] = { NULL }; -const char * const bch2_compression_types[] = { +const char * const __bch2_compression_types[] = { BCH_COMPRESSION_TYPES() NULL }; @@ -72,7 +72,7 @@ const char * const bch2_str_hash_opts[] = { NULL }; -const char * const bch2_data_types[] = { +const char * const __bch2_data_types[] = { BCH_DATA_TYPES() NULL }; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 93a24fef4214..9a4b7faa3765 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -18,11 +18,11 @@ extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; -extern const char * const bch2_compression_types[]; +extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; -extern const char * const bch2_data_types[]; +extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; extern const char * const bch2_jset_entry_types[]; extern const char * const bch2_fs_usage_types[]; @@ -564,6 +564,11 @@ struct bch_io_opts { #undef x }; +static inline unsigned background_compression(struct bch_io_opts opts) +{ + return opts.background_compression ?: opts.compression; +} + struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index accf246c3233..b27d22925929 100644 --- a/fs/bcachefs/printbuf.c +++ 
b/fs/bcachefs/printbuf.c @@ -56,6 +56,7 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) va_copy(args2, args); len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); + va_end(args2); } while (len + 1 >= printbuf_remaining(out) && !bch2_printbuf_make_room(out, len + 1)); diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h new file mode 100644 index 000000000000..dc34347ef6c7 --- /dev/null +++ b/fs/bcachefs/quota_format.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_QUOTA_FORMAT_H +#define _BCACHEFS_QUOTA_FORMAT_H + +/* KEY_TYPE_quota: */ + +enum quota_types { + QTYP_USR = 0, + QTYP_GRP = 1, + QTYP_PRJ = 2, + QTYP_NR = 3, +}; + +enum quota_counters { + Q_SPC = 0, + Q_INO = 1, + Q_COUNTERS = 2, +}; + +struct bch_quota_counter { + __le64 hardlimit; + __le64 softlimit; +}; + +struct bch_quota { + struct bch_val v; + struct bch_quota_counter c[Q_COUNTERS]; +} __packed __aligned(8); + +/* BCH_SB_FIELD_quota: */ + +struct bch_sb_quota_counter { + __le32 timelimit; + __le32 warnlimit; +}; + +struct bch_sb_quota_type { + __le64 flags; + struct bch_sb_quota_counter c[Q_COUNTERS]; +}; + +struct bch_sb_field_quota { + struct bch_sb_field field; + struct bch_sb_quota_type q[QTYP_NR]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_QUOTA_FORMAT_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 95f46cb3b5bd..22d1017aa49b 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -177,8 +177,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, prt_str(&buf, "target="); bch2_target_to_text(&buf, c, r->target); prt_str(&buf, " compression="); - struct bch_compression_opt opt = __bch2_compression_decode(r->compression); - prt_str(&buf, bch2_compression_opts[opt.type]); + bch2_compression_opt_to_text(&buf, r->compression); prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, k); @@ -254,13 +253,12 @@ static bool rebalance_pred(struct 
bch_fs *c, void *arg, if (k.k->p.inode) { target = io_opts->background_target; - compression = io_opts->background_compression ?: io_opts->compression; + compression = background_compression(*io_opts); } else { const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); target = r ? r->target : io_opts->background_target; - compression = r ? r->compression : - (io_opts->background_compression ?: io_opts->compression); + compression = r ? r->compression : background_compression(*io_opts); } data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); @@ -371,6 +369,7 @@ static int do_rebalance(struct moving_context *ctxt) !kthread_should_stop() && !atomic64_read(&r->work_stats.sectors_seen) && !atomic64_read(&r->scan_stats.sectors_seen)) { + bch2_moving_ctxt_flush_all(ctxt); bch2_trans_unlock_long(trans); rebalance_wait(c); } @@ -385,7 +384,6 @@ static int bch2_rebalance_thread(void *arg) struct bch_fs *c = arg; struct bch_fs_rebalance *r = &c->rebalance; struct moving_context ctxt; - int ret; set_freezable(); @@ -393,8 +391,7 @@ static int bch2_rebalance_thread(void *arg) writepoint_ptr(&c->rebalance_write_point), true); - while (!kthread_should_stop() && - !(ret = do_rebalance(&ctxt))) + while (!kthread_should_stop() && !do_rebalance(&ctxt)) ; bch2_moving_ctxt_exit(&ctxt); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 725214605a05..21e13bb4335b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -280,7 +280,7 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(u->v); break; case BCH_FS_USAGE_inodes: - c->usage_base->nr_inodes = le64_to_cpu(u->v); + c->usage_base->b.nr_inodes = le64_to_cpu(u->v); break; case BCH_FS_USAGE_key_version: atomic64_set(&c->key_version, @@ -577,8 +577,9 @@ u64 bch2_recovery_passes_from_stable(u64 v) static bool check_version_upgrade(struct bch_fs *c) { - unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version); unsigned latest_version 
= bcachefs_metadata_version_current; + unsigned latest_compatible = min(latest_version, + bch2_latest_compatible_version(c->sb.version)); unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; unsigned new_version = 0; @@ -597,7 +598,7 @@ static bool check_version_upgrade(struct bch_fs *c) new_version = latest_version; break; case BCH_VERSION_UPGRADE_none: - new_version = old_version; + new_version = min(old_version, latest_version); break; } } @@ -774,7 +775,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!(c->opts.nochanges && c->opts.norecovery)) { + if (!c->opts.nochanges) { mutex_lock(&c->sb_lock); bool write_sb = false; @@ -804,7 +805,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (bch2_check_version_downgrade(c)) { struct printbuf buf = PRINTBUF; - prt_str(&buf, "Version downgrade required:\n"); + prt_str(&buf, "Version downgrade required:"); __le64 passes = ext->recovery_passes_required[0]; bch2_sb_set_downgrade(c, @@ -812,7 +813,7 @@ int bch2_fs_recovery(struct bch_fs *c) BCH_VERSION_MINOR(c->sb.version)); passes = ext->recovery_passes_required[0] & ~passes; if (passes) { - prt_str(&buf, " running recovery passes: "); + prt_str(&buf, "\n running recovery passes: "); prt_bitflags(&buf, bch2_recovery_passes, bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index faa5d3670058..c47c66c2b394 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -292,10 +292,10 @@ static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *f } } -int bch2_trans_mark_reflink_v(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - unsigned flags) +int bch2_trigger_reflink_v(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) @@ -324,7 +324,7 
@@ void bch2_indirect_inline_data_to_text(struct printbuf *out, min(datalen, 32U), d.v->data); } -int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, +int bch2_trigger_indirect_inline_data(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, unsigned flags) @@ -486,6 +486,13 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + if (dst_inum.inum < src_inum.inum) { + /* Avoid some lock cycle transaction restarts */ + ret = bch2_btree_iter_traverse(&dst_iter); + if (ret) + continue; + } + dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); bch2_btree_iter_set_pos(&src_iter, src_want); @@ -538,9 +545,7 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, - opts.background_target, - opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?: bch2_extent_update(trans, dst_inum, &dst_iter, new_dst.k, &disk_res, new_i_size, i_sectors_delta, diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 8ee778ec0022..4d8867289717 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -24,14 +24,14 @@ int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, +int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ - .trigger = bch2_trans_mark_reflink_v, \ + .trigger = bch2_trigger_reflink_v, \ .min_val_size = 8, 
\ }) @@ -39,7 +39,7 @@ int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_indirect_inline_data(struct btree_trans *, +int bch2_trigger_indirect_inline_data(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, unsigned); @@ -47,7 +47,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *, #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ .key_invalid = bch2_indirect_inline_data_invalid, \ .val_to_text = bch2_indirect_inline_data_to_text, \ - .trigger = bch2_trans_mark_indirect_inline_data, \ + .trigger = bch2_trigger_indirect_inline_data, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h new file mode 100644 index 000000000000..6772eebb1fc6 --- /dev/null +++ b/fs/bcachefs/reflink_format.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_FORMAT_H +#define _BCACHEFS_REFLINK_FORMAT_H + +struct bch_reflink_p { + struct bch_val v; + __le64 idx; + /* + * A reflink pointer might point to an indirect extent which is then + * later split (by copygc or rebalance). If we only pointed to part of + * the original indirect extent, and then one of the fragments is + * outside the range we point to, we'd leak a refcount: so when creating + * reflink pointers, we need to store pad values to remember the full + * range we were taking a reference on. 
+ */ + __le32 front_pad; + __le32 back_pad; +} __packed __aligned(8); + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[]; +} __packed __aligned(8); + +struct bch_indirect_inline_data { + struct bch_val v; + __le64 refcount; + u8 data[]; +}; + +#endif /* _BCACHEFS_REFLINK_FORMAT_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 92ba56ef1fc8..cc2672c12031 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -9,6 +9,12 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); +/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ +static int bch2_memcmp(const void *l, const void *r, size_t size) +{ + return memcmp(l, r, size); +} + /* Replicas tracking - in memory: */ static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) @@ -33,21 +39,16 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { - eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); + eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, struct bch_replicas_entry_v0 *e) { - unsigned i; - - if (e->data_type < BCH_DATA_NR) - prt_printf(out, "%s", bch2_data_types[e->data_type]); - else - prt_printf(out, "(invalid data type %u)", e->data_type); + bch2_prt_data_type(out, e->data_type); prt_printf(out, ": %u [", e->nr_devs); - for (i = 0; i < e->nr_devs; i++) + for (unsigned i = 0; i < e->nr_devs; i++) prt_printf(out, i ? 
" %u" : "%u", e->devs[i]); prt_printf(out, "]"); } @@ -55,15 +56,10 @@ static void bch2_replicas_entry_v0_to_text(struct printbuf *out, void bch2_replicas_entry_to_text(struct printbuf *out, struct bch_replicas_entry_v1 *e) { - unsigned i; - - if (e->data_type < BCH_DATA_NR) - prt_printf(out, "%s", bch2_data_types[e->data_type]); - else - prt_printf(out, "(invalid data type %u)", e->data_type); + bch2_prt_data_type(out, e->data_type); prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); - for (i = 0; i < e->nr_devs; i++) + for (unsigned i = 0; i < e->nr_devs; i++) prt_printf(out, i ? " %u" : "%u", e->devs[i]); prt_printf(out, "]"); } @@ -831,7 +827,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, sort_cmp_size(cpu_r->entries, cpu_r->nr, cpu_r->entry_size, - memcmp, NULL); + bch2_memcmp, NULL); for (i = 0; i < cpu_r->nr; i++) { struct bch_replicas_entry_v1 *e = diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 9632f36f5f31..b6bf0ebe7e84 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -207,7 +207,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = BCH_FS_USAGE_inodes; - u->v = cpu_to_le64(c->usage_base->nr_inodes); + u->v = cpu_to_le64(c->usage_base->b.nr_inodes); } { diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/sb-counters.c index 02a996e06a64..7dc898761bb3 100644 --- a/fs/bcachefs/counters.c +++ b/fs/bcachefs/sb-counters.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "super-io.h" -#include "counters.h" +#include "sb-counters.h" /* BCH_SB_FIELD_counters */ diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/sb-counters.h index 4778aa19bf34..81f8aec9fcb1 100644 --- a/fs/bcachefs/counters.h +++ b/fs/bcachefs/sb-counters.h @@ -1,11 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_COUNTERS_H -#define _BCACHEFS_COUNTERS_H +#ifndef _BCACHEFS_SB_COUNTERS_H +#define 
_BCACHEFS_SB_COUNTERS_H #include "bcachefs.h" #include "super-io.h" - int bch2_sb_counters_to_cpu(struct bch_fs *); int bch2_sb_counters_from_cpu(struct bch_fs *); @@ -14,4 +13,4 @@ int bch2_fs_counters_init(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_counters; -#endif // _BCACHEFS_COUNTERS_H +#endif // _BCACHEFS_SB_COUNTERS_H diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h new file mode 100644 index 000000000000..62ea478215d0 --- /dev/null +++ b/fs/bcachefs/sb-counters_format.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H +#define _BCACHEFS_SB_COUNTERS_FORMAT_H + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0) \ + x(io_write, 1) \ + x(io_move, 2) \ + x(bucket_invalidate, 3) \ + x(bucket_discard, 4) \ + x(bucket_alloc, 5) \ + x(bucket_alloc_fail, 6) \ + x(btree_cache_scan, 7) \ + x(btree_cache_reap, 8) \ + x(btree_cache_cannibalize, 9) \ + x(btree_cache_cannibalize_lock, 10) \ + x(btree_cache_cannibalize_lock_fail, 11) \ + x(btree_cache_cannibalize_unlock, 12) \ + x(btree_node_write, 13) \ + x(btree_node_read, 14) \ + x(btree_node_compact, 15) \ + x(btree_node_merge, 16) \ + x(btree_node_split, 17) \ + x(btree_node_rewrite, 18) \ + x(btree_node_alloc, 19) \ + x(btree_node_free, 20) \ + x(btree_node_set_root, 21) \ + x(btree_path_relock_fail, 22) \ + x(btree_path_upgrade_fail, 23) \ + x(btree_reserve_get_fail, 24) \ + x(journal_entry_full, 25) \ + x(journal_full, 26) \ + x(journal_reclaim_finish, 27) \ + x(journal_reclaim_start, 28) \ + x(journal_write, 29) \ + x(read_promote, 30) \ + x(read_bounce, 31) \ + x(read_split, 33) \ + x(read_retry, 32) \ + x(read_reuse_race, 34) \ + x(move_extent_read, 35) \ + x(move_extent_write, 36) \ + x(move_extent_finish, 37) \ + x(move_extent_fail, 38) \ + x(move_extent_start_fail, 39) \ + x(copygc, 40) \ + x(copygc_wait, 41) \ + x(gc_gens_end, 42) \ + x(gc_gens_start, 43) \ + x(trans_blocked_journal_reclaim, 44) \ + 
x(trans_restart_btree_node_reused, 45) \ + x(trans_restart_btree_node_split, 46) \ + x(trans_restart_fault_inject, 47) \ + x(trans_restart_iter_upgrade, 48) \ + x(trans_restart_journal_preres_get, 49) \ + x(trans_restart_journal_reclaim, 50) \ + x(trans_restart_journal_res_get, 51) \ + x(trans_restart_key_cache_key_realloced, 52) \ + x(trans_restart_key_cache_raced, 53) \ + x(trans_restart_mark_replicas, 54) \ + x(trans_restart_mem_realloced, 55) \ + x(trans_restart_memory_allocation_failure, 56) \ + x(trans_restart_relock, 57) \ + x(trans_restart_relock_after_fill, 58) \ + x(trans_restart_relock_key_cache_fill, 59) \ + x(trans_restart_relock_next_node, 60) \ + x(trans_restart_relock_parent_for_fill, 61) \ + x(trans_restart_relock_path, 62) \ + x(trans_restart_relock_path_intent, 63) \ + x(trans_restart_too_many_iters, 64) \ + x(trans_restart_traverse, 65) \ + x(trans_restart_upgrade, 66) \ + x(trans_restart_would_deadlock, 67) \ + x(trans_restart_would_deadlock_write, 68) \ + x(trans_restart_injected, 69) \ + x(trans_restart_key_cache_upgrade, 70) \ + x(trans_traverse_all, 71) \ + x(transaction_commit, 72) \ + x(write_super, 73) \ + x(trans_restart_would_deadlock_recursion_limit, 74) \ + x(trans_restart_write_buffer_flush, 75) \ + x(trans_restart_split_race, 76) \ + x(write_buffer_flush_slowpath, 77) \ + x(write_buffer_flush_sync, 78) + +enum bch_persistent_counters { +#define x(t, n, ...) 
BCH_COUNTER_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_NR +}; + +struct bch_sb_field_counters { + struct bch_sb_field field; + __le64 d[]; +}; + +#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */ diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index a44a238bf8b5..eff5ce18c69c 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -251,7 +251,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Data allowed:"); prt_tab(out); if (BCH_MEMBER_DATA_ALLOWED(&m)) - prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); + prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); else prt_printf(out, "(none)"); prt_newline(out); @@ -259,7 +259,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Has data:"); prt_tab(out); if (data_have) - prt_bitflags(out, bch2_data_types, data_have); + prt_bitflags(out, __bch2_data_types, data_have); else prt_printf(out, "(none)"); prt_newline(out); @@ -421,7 +421,7 @@ void bch2_dev_errors_reset(struct bch_dev *ca) m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++) m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i])); - m->errors_reset_time = ktime_get_real_seconds(); + m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds()); bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 56af937523ff..ac6ba04d5521 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -728,7 +728,7 @@ static int check_snapshot(struct btree_trans *trans, return 0; memset(&s, 0, sizeof(s)); - memcpy(&s, k.v, bkey_val_bytes(k.k)); + memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); id = le32_to_cpu(s.parent); if (id) { @@ -1053,6 +1053,8 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, n->v.subvol = cpu_to_le32(snapshot_subvols[i]); n->v.tree = cpu_to_le32(tree); n->v.depth = 
cpu_to_le32(depth); + n->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + n->v.btime.hi = 0; for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); @@ -1681,5 +1683,5 @@ int bch2_snapshots_read(struct bch_fs *c) void bch2_fs_snapshots_exit(struct bch_fs *c) { - kfree(rcu_dereference_protected(c->snapshots, true)); + kvfree(rcu_dereference_protected(c->snapshots, true)); } diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h new file mode 100644 index 000000000000..aabcd3a74cd9 --- /dev/null +++ b/fs/bcachefs/snapshot_format.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H +#define _BCACHEFS_SNAPSHOT_FORMAT_H + +struct bch_snapshot { + struct bch_val v; + __le32 flags; + __le32 parent; + __le32 children[2]; + __le32 subvol; + /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ + __le32 tree; + __le32 depth; + __le32 skip[3]; + bch_le128 btime; +}; + +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) + +/* True if a subvolume points to this snapshot node: */ +LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + +/* + * Snapshot trees: + * + * The snapshot_trees btree gives us persistent indentifier for each tree of + * bch_snapshot nodes, and allow us to record and easily find the root/master + * subvolume that other snapshots were created from: + */ +struct bch_snapshot_tree { + struct bch_val v; + __le32 master_subvol; + __le32 root_snapshot; +}; + +#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 89fdb7c21134..fcaa5a888744 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -160,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s } static __always_inline int -bch2_hash_lookup(struct btree_trans *trans, +bch2_hash_lookup_in_snapshot(struct btree_trans *trans, struct 
btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, const void *key, - unsigned flags) + unsigned flags, u32 snapshot) { struct bkey_s_c k; - u32 snapshot; int ret; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), @@ -195,6 +190,19 @@ bch2_hash_lookup(struct btree_trans *trans, } static __always_inline int +bch2_hash_lookup(struct btree_trans *trans, + struct btree_iter *iter, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, const void *key, + unsigned flags) +{ + u32 snapshot; + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); +} + +static __always_inline int bch2_hash_hole(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h new file mode 100644 index 000000000000..af79134b07d6 --- /dev/null +++ b/fs/bcachefs/subvolume_format.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H +#define _BCACHEFS_SUBVOLUME_FORMAT_H + +#define SUBVOL_POS_MIN POS(0, 1) +#define SUBVOL_POS_MAX POS(0, S32_MAX) +#define BCACHEFS_ROOT_SUBVOL 1 + +struct bch_subvolume { + struct bch_val v; + __le32 flags; + __le32 snapshot; + __le64 inode; + /* + * Snapshot subvolumes form a tree, separate from the snapshot nodes + * tree - if this subvolume is a snapshot, this is the ID of the + * subvolume it was created from: + * + * This is _not_ necessarily the subvolume of the directory containing + * this subvolume: + */ + __le32 parent; + __le32 pad; + bch_le128 otime; +}; + +LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) +/* + * We need to know 
whether a subvolume is a snapshot so we can know whether we + * can delete it (or whether it should just be rm -rf'd) + */ +LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) +LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) + +#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6d3db5cce5f6..bd64eb68e84a 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -2,7 +2,6 @@ #include "bcachefs.h" #include "checksum.h" -#include "counters.h" #include "disk_groups.h" #include "ec.h" #include "error.h" @@ -13,6 +12,7 @@ #include "replicas.h" #include "quota.h" #include "sb-clean.h" +#include "sb-counters.h" #include "sb-downgrade.h" #include "sb-errors.h" #include "sb-members.h" @@ -142,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb, void bch2_free_super(struct bch_sb_handle *sb) { kfree(sb->bio); - if (!IS_ERR_OR_NULL(sb->bdev_handle)) - bdev_release(sb->bdev_handle); + if (!IS_ERR_OR_NULL(sb->s_bdev_file)) + fput(sb->s_bdev_file); kfree(sb->holder); kfree(sb->sb_name); @@ -704,22 +704,22 @@ retry: if (!opt_get(*opts, nochanges)) sb->mode |= BLK_OPEN_WRITE; - sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (IS_ERR(sb->bdev_handle) && - PTR_ERR(sb->bdev_handle) == -EACCES && + sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (IS_ERR(sb->s_bdev_file) && + PTR_ERR(sb->s_bdev_file) == -EACCES && opt_get(*opts, read_only)) { sb->mode &= ~BLK_OPEN_WRITE; - sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (!IS_ERR(sb->bdev_handle)) + sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (!IS_ERR(sb->s_bdev_file)) opt_set(*opts, nochanges, true); } - if (IS_ERR(sb->bdev_handle)) { - ret = PTR_ERR(sb->bdev_handle); - goto out; + if (IS_ERR(sb->s_bdev_file)) { + ret 
= PTR_ERR(sb->s_bdev_file); + goto err; } - sb->bdev = sb->bdev_handle->bdev; + sb->bdev = file_bdev(sb->s_bdev_file); ret = bch2_sb_realloc(sb, 0); if (ret) { @@ -1321,7 +1321,9 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, "Superblock size:"); prt_tab(out); - prt_printf(out, "%zu", vstruct_bytes(sb)); + prt_units_u64(out, vstruct_bytes(sb)); + prt_str(out, "/"); + prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits); prt_newline(out); prt_printf(out, "Clean:"); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9dbc35940197..6b23e11825e6 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -23,7 +23,6 @@ #include "checksum.h" #include "clock.h" #include "compress.h" -#include "counters.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" @@ -49,6 +48,7 @@ #include "recovery.h" #include "replicas.h" #include "sb-clean.h" +#include "sb-counters.h" #include "sb-errors.h" #include "sb-members.h" #include "snapshot.h" @@ -883,7 +883,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, - btree_bytes(c)) || + c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { @@ -1386,8 +1386,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) prt_bdevname(&name, ca->disk_sb.bdev); if (c->sb.nr_devices == 1) - strlcpy(c->name, name.buf, sizeof(c->name)); - strlcpy(ca->name, name.buf, sizeof(ca->name)); + strscpy(c->name, name.buf, sizeof(c->name)); + strscpy(ca->name, name.buf, sizeof(ca->name)); printbuf_exit(&name); @@ -1428,10 +1428,10 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) ? 
c->opts.metadata_replicas - : c->opts.metadata_replicas_required, + : metadata_replicas_required(c), !(flags & BCH_FORCE_IF_DATA_DEGRADED) ? c->opts.data_replicas - : c->opts.data_replicas_required); + : data_replicas_required(c)); return nr_rw >= required; case BCH_MEMBER_STATE_failed: @@ -1625,7 +1625,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) if (data) { struct printbuf data_has = PRINTBUF; - prt_bitflags(&data_has, bch2_data_types, data); + prt_bitflags(&data_has, __bch2_data_types, data); bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); printbuf_exit(&data_has); ret = -EBUSY; diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 0e5a14fc8e7f..ec784d975f66 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -4,7 +4,7 @@ struct bch_sb_handle { struct bch_sb *sb; - struct bdev_handle *bdev_handle; + struct file *s_bdev_file; struct block_device *bdev; char *sb_name; struct bio *bio; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 8ed52319ff68..cee80c47feea 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -21,6 +21,7 @@ #include "btree_gc.h" #include "buckets.h" #include "clock.h" +#include "compress.h" #include "disk_groups.h" #include "ec.h" #include "inode.h" @@ -247,7 +248,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) mutex_lock(&c->btree_cache.lock); list_for_each_entry(b, &c->btree_cache.live, list) - ret += btree_bytes(c); + ret += btree_buf_bytes(b); mutex_unlock(&c->btree_cache.lock); return ret; @@ -330,7 +331,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c prt_newline(out); for (unsigned i = 0; i < ARRAY_SIZE(s); i++) { - prt_str(out, bch2_compression_types[i]); + bch2_prt_compression_type(out, i); prt_tab(out); prt_human_readable_u64(out, s[i].sectors_compressed << 9); @@ -725,8 +726,10 @@ STORE(bch2_fs_opts_dir) bch2_opt_set_sb(c, opt, v); bch2_opt_set_by_id(&c->opts, id, v); - if ((id == 
Opt_background_target || - id == Opt_background_compression) && v) + if (v && + (id == Opt_background_target || + id == Opt_background_compression || + (id == Opt_compression && !c->opts.background_compression))) bch2_set_rebalance_needs_scan(c, 0); ret = size; @@ -883,7 +886,7 @@ static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca) for (i = 1; i < BCH_DATA_NR; i++) prt_printf(out, "%-12s:%12llu\n", - bch2_data_types[i], + bch2_data_type_str(i), percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); } } @@ -908,7 +911,7 @@ SHOW(bch2_dev) } if (attr == &sysfs_has_data) { - prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca)); + prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca)); prt_char(out, '\n'); } diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index b1c867aa2b58..9220d7de10db 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -53,9 +53,9 @@ int bch2_run_thread_with_file(struct thread_with_file *thr, if (ret) goto err; - fd_install(fd, file); get_task_struct(thr->task); wake_up_process(thr->task); + fd_install(fd, file); return fd; err: if (fd >= 0) diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index c94876b3bb06..293b90d704fb 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -46,7 +46,7 @@ DECLARE_EVENT_CLASS(fs_str, __assign_str(str, str); ), - TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) + TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) ); DECLARE_EVENT_CLASS(trans_str, @@ -273,28 +273,14 @@ DEFINE_EVENT(bch_fs, journal_full, TP_ARGS(c) ); -DEFINE_EVENT(bch_fs, journal_entry_full, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(fs_str, journal_entry_full, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -TRACE_EVENT(journal_entry_close, - TP_PROTO(struct bch_fs *c, unsigned bytes), - TP_ARGS(c, bytes), - - TP_STRUCT__entry( - __field(dev_t, dev ) 
- __field(u32, bytes ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->bytes = bytes; - ), - - TP_printk("%d,%d entry bytes %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->bytes) +DEFINE_EVENT(fs_str, journal_entry_close, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(bio, journal_write, @@ -542,7 +528,7 @@ TRACE_EVENT(btree_path_relock_fail, __entry->level = path->level; TRACE_BPOS_assign(pos, path->pos); - c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), + c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level); __entry->self_read_count = c.n[SIX_LOCK_read]; __entry->self_intent_count = c.n[SIX_LOCK_intent]; @@ -827,40 +813,28 @@ TRACE_EVENT(bucket_evacuate, ); DEFINE_EVENT(fs_str, move_extent, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_read, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_write, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_finish, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -TRACE_EVENT(move_extent_fail, - TP_PROTO(struct bch_fs *c, const char *msg), - TP_ARGS(c, msg), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __string(msg, msg ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __assign_str(msg, msg); - ), - - TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) +DEFINE_EVENT(fs_str, move_extent_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_start_fail, @@ -1039,7 +1013,7 @@ TRACE_EVENT(trans_restart_split_race, 
__entry->level = b->c.level; __entry->written = b->written; __entry->blocks = btree_blocks(trans->c); - __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b); + __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b); ), TP_printk("%s %pS l=%u written %u/%u u64s remaining %u", @@ -1146,8 +1120,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans, caller_ip, path) ); -struct get_locks_fail; - TRACE_EVENT(trans_restart_upgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1195,11 +1167,9 @@ TRACE_EVENT(trans_restart_upgrade, __entry->node_seq) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) +DEFINE_EVENT(trans_str, trans_restart_relock, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str), + TP_ARGS(trans, caller_ip, str) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index c2ef7cddaa4f..3a32faa86b5c 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -241,12 +241,17 @@ bool bch2_is_zero(const void *_p, size_t n) return true; } -void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) +void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits) { while (nr_bits) prt_char(out, '0' + ((v >> --nr_bits) & 1)); } +void bch2_prt_u64_base2(struct printbuf *out, u64 v) +{ + bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); +} + void bch2_print_string_as_lines(const char *prefix, const char *lines) { const char *p; @@ -267,14 +272,14 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) console_unlock(); } -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr) +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, + gfp_t gfp) 
{ #ifdef CONFIG_STACKTRACE unsigned nr_entries = 0; - int ret = 0; stack->nr = 0; - ret = darray_make_room(stack, 32); + int ret = darray_make_room_gfp(stack, 32, gfp); if (ret) return ret; @@ -284,7 +289,7 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigne do { nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1); } while (nr_entries == stack->size && - !(ret = darray_make_room(stack, stack->size * 2))); + !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp))); stack->nr = nr_entries; up_read(&task->signal->exec_update_lock); @@ -303,10 +308,10 @@ void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) } } -int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr) +int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp) { bch_stacktrace stack = { 0 }; - int ret = bch2_save_backtrace(&stack, task, skipnr + 1); + int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp); bch2_prt_backtrace(out, &stack); darray_exit(&stack); @@ -413,14 +418,15 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, bch2_quantiles_update(&stats->quantiles, duration); } - if (time_after64(end, stats->last_event)) { + if (stats->last_event && time_after64(end, stats->last_event)) { freq = end - stats->last_event; mean_and_variance_update(&stats->freq_stats, freq); mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); stats->max_freq = max(stats->max_freq, freq); stats->min_freq = min(stats->min_freq, freq); - stats->last_event = end; } + + stats->last_event = end; } static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, @@ -1186,7 +1192,9 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret) { darray_init(ret); - char *dev_name = kstrdup(_dev_name, GFP_KERNEL), *s = dev_name; + char *dev_name, *s, *orig; + + dev_name = orig = kstrdup(_dev_name, GFP_KERNEL); if 
(!dev_name) return -ENOMEM; @@ -1201,10 +1209,10 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret) } } - kfree(dev_name); + kfree(orig); return 0; err: bch2_darray_str_exit(ret); - kfree(dev_name); + kfree(orig); return -ENOMEM; } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index c75fc31915d3..b414736d59a5 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -342,14 +342,15 @@ bool bch2_is_zero(const void *, size_t); u64 bch2_read_flag_list(char *, const char * const[]); -void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); +void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); +void bch2_prt_u64_base2(struct printbuf *, u64); void bch2_print_string_as_lines(const char *prefix, const char *lines); typedef DARRAY(unsigned long) bch_stacktrace; -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned); +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); -int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned); +int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t); static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) { diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 5a1858fb9879..9c0d2316031b 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -590,8 +590,9 @@ err: mutex_unlock(&inode->ei_update_lock); if (value && - (opt_id == Opt_background_compression || - opt_id == Opt_background_target)) + (opt_id == Opt_background_target || + opt_id == Opt_background_compression || + (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); return bch2_err_class(ret); diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h new file mode 100644 index 000000000000..e9f810539552 --- /dev/null +++ 
b/fs/bcachefs/xattr_format.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_XATTR_FORMAT_H +#define _BCACHEFS_XATTR_FORMAT_H + +#define KEY_TYPE_XATTR_INDEX_USER 0 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 +#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 +#define KEY_TYPE_XATTR_INDEX_SECURITY 4 + +struct bch_xattr { + struct bch_val v; + __u8 x_type; + __u8 x_name_len; + __le16 x_val_len; + __u8 x_name[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_XATTR_FORMAT_H */ diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a9be9ac99222..378d9103a207 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1455,6 +1455,7 @@ out: */ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) { + LIST_HEAD(retry_list); struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; @@ -1476,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { + u64 used; int trimming; block_group = list_first_entry(&fs_info->unused_bgs, @@ -1511,9 +1513,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) goto next; } + spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (block_group->reserved || block_group->pinned || - block_group->used || block_group->ro || + if (btrfs_is_block_group_used(block_group) || block_group->ro || list_is_singular(&block_group->list)) { /* * We want to bail if we made new allocations or have @@ -1523,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); up_write(&space_info->groups_sem); goto next; } + + /* + * The block group may be unused but there may be space reserved + * accounting with the existence of that block group, that is, + * 
space_info->bytes_may_use was incremented by a task but no + * space was yet allocated from the block group by the task. + * That space may or may not be allocated, as we are generally + * pessimistic about space reservation for metadata as well as + * for data when using compression (as we reserve space based on + * the worst case, when data can't be compressed, and before + * actually attempting compression, before starting writeback). + * + * So check if the total space of the space_info minus the size + * of this block group is less than the used space of the + * space_info - if that's the case, then it means we have tasks + * that might be relying on the block group in order to allocate + * extents, and add back the block group to the unused list when + * we finish, so that we retry later in case no tasks ended up + * needing to allocate extents from the block group. + */ + used = btrfs_space_info_used(space_info, true); + if (space_info->total_bytes - block_group->length < used) { + /* + * Add a reference for the list, compensate for the ref + * drop under the "next" label for the + * fs_info->unused_bgs list. + */ + btrfs_get_block_group(block_group); + list_add_tail(&block_group->bg_list, &retry_list); + + trace_btrfs_skip_unused_block_group(block_group); + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); /* We don't want to force the issue, only flip if it's ok. 
*/ ret = inc_block_group_ro(block_group, 0); @@ -1650,12 +1691,16 @@ next: btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } + list_splice_tail(&retry_list, &fs_info->unused_bgs); spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); return; flip_async: btrfs_end_transaction(trans); + spin_lock(&fs_info->unused_bgs_lock); + list_splice_tail(&retry_list, &fs_info->unused_bgs); + spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_put_block_group(block_group); btrfs_discard_punt_unused_bgs_list(fs_info); @@ -2684,6 +2729,37 @@ next: btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); + + /* + * If the block group is still unused, add it to the list of + * unused block groups. The block group may have been created in + * order to satisfy a space reservation, in which case the + * extent allocation only happens later. But often we don't + * actually need to allocate space that we previously reserved, + * so the block group may become unused for a long time. For + * example for metadata we generally reserve space for a worst + * possible scenario, but then don't end up allocating all that + * space or none at all (due to no need to COW, extent buffers + * were already COWed in the current transaction and still + * unwritten, tree heights lower than the maximum possible + * height, etc). For data we generally reserve the exact amount + * of space we are going to allocate later, the exception is + * when using compression, as we must reserve space based on the + * uncompressed data size, because the compression is only done + * when writeback triggered and we don't know how much space we + * are actually going to need, so we reserve the uncompressed + * size because the data may be uncompressible in the worst case.
+ */ + if (ret == 0) { + bool used; + + spin_lock(&block_group->lock); + used = btrfs_is_block_group_used(block_group); + spin_unlock(&block_group->lock); + + if (!used) + btrfs_mark_bg_unused(block_group); + } } btrfs_trans_release_chunk_metadata(trans); } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index c4a1f01cc1c2..962b11983901 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -257,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) return (block_group->start + block_group->length); } +static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) +{ + lockdep_assert_held(&bg->lock); + + return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); +} + static inline bool btrfs_is_block_group_data_only( struct btrfs_block_group *block_group) { diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index ceb5f586a2d5..1043a8142351 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -494,7 +494,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); - if (unlikely(block_rsv->size == 0)) + if (unlikely(btrfs_block_rsv_size(block_rsv) == 0)) goto try_reserve; again: ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index b0bd12b8652f..43a9a6b5a79f 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -101,4 +101,36 @@ static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv) return data_race(rsv->full); } +/* + * Get the reserved amount of a block reserve in a context where getting a stale + * value is acceptable, instead of accessing it directly and trigger data race + * warning from KCSAN.
+ */ +static inline u64 btrfs_block_rsv_reserved(struct btrfs_block_rsv *rsv) +{ + u64 ret; + + spin_lock(&rsv->lock); + ret = rsv->reserved; + spin_unlock(&rsv->lock); + + return ret; +} + +/* + * Get the size of a block reserve in a context where getting a stale value is + * acceptable, instead of accessing it directly and trigger data race warning + * from KCSAN. + */ +static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv) +{ + u64 ret; + + spin_lock(&rsv->lock); + ret = rsv->size; + spin_unlock(&rsv->lock); + + return ret; +} + #endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 193168214eeb..68345f73d429 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -141,16 +141,16 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + const u8 *data_in, struct page *dest_page, + unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default: /* @@ -1037,14 +1037,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * start_byte tells us the offset into the compressed data we're interested in */ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + unsigned long dest_pgoff, size_t srclen, size_t destlen) { + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); 
struct list_head *workspace; + const u32 sectorsize = fs_info->sectorsize; int ret; + /* + * The full destination page range should not exceed the page size. + * And the @destlen should not exceed sectorsize, as this is only called for + * inline file extents, which should not exceed sectorsize. + */ + ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); + workspace = get_workspace(type, 0); ret = compression_decompress(type, workspace, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); put_workspace(type, workspace); return ret; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 93cc92974dee..afd7e50d073d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -148,7 +148,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); @@ -159,7 +159,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index c276b136ab63..5b0b64571418 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1046,7 +1046,7 @@ static int 
defrag_collect_targets(struct btrfs_inode *inode, goto add; /* Skip too large extent */ - if (range_len >= extent_thresh) + if (em->len >= extent_thresh) goto next; /* diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 2833e8ef4c09..acf9f4b6c044 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 reserve_size = 0; u64 qgroup_rsv_size = 0; - u64 csum_leaves; unsigned outstanding_extents; lockdep_assert_held(&inode->lock); @@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, outstanding_extents); reserve_size += btrfs_calc_metadata_size(fs_info, 1); } - csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, - inode->csum_bytes); - reserve_size += btrfs_calc_insert_metadata_size(fs_info, - csum_leaves); + if (!(inode->flags & BTRFS_INODE_NODATASUM)) { + u64 csum_leaves; + + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); + reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves); + } /* * For qgroup rsv, the calculation is very simple: * account one nodesize for each outstanding extent @@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); } -static void calc_inode_reservations(struct btrfs_fs_info *fs_info, +static void calc_inode_reservations(struct btrfs_inode *inode, u64 num_bytes, u64 disk_num_bytes, u64 *meta_reserve, u64 *qgroup_reserve) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 nr_extents = count_max_extents(fs_info, num_bytes); - u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); + u64 csum_leaves; u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); + if (inode->flags & BTRFS_INODE_NODATASUM) + csum_leaves = 0; + else + csum_leaves = 
btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); + *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, nr_extents + csum_leaves); @@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, * everything out and try again, which is bad. This way we just * over-reserve slightly, and clean up the mess when we are done. */ - calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, + calc_inode_reservations(inode, num_bytes, disk_num_bytes, &meta_reserve, &qgroup_reserve); ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true, noflush); @@ -359,7 +366,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, nr_extents = count_max_extents(fs_info, num_bytes); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, nr_extents); - inode->csum_bytes += disk_num_bytes; + if (!(inode->flags & BTRFS_INODE_NODATASUM)) + inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); @@ -393,7 +401,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, num_bytes = ALIGN(num_bytes, fs_info->sectorsize); spin_lock(&inode->lock); - inode->csum_bytes -= num_bytes; + if (!(inode->flags & BTRFS_INODE_NODATASUM)) + inode->csum_bytes -= num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1502d664c892..fb33027e5a4c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -246,7 +246,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -257,13 +257,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - 
bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); - return PTR_ERR(bdev_handle); + return PTR_ERR(bdev_file); } - bdev = bdev_handle->bdev; + bdev = file_bdev(bdev_file); if (!btrfs_check_device_zone_type(fs_info, bdev)) { btrfs_err(fs_info, @@ -314,7 +314,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; device->bdev = bdev; - device->bdev_handle = bdev_handle; + device->bdev_file = bdev_file; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->dev_stats_valid = 1; @@ -335,7 +335,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - bdev_release(bdev_handle); + fput(bdev_file); return ret; } @@ -725,6 +725,23 @@ leave: return ret; } +static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args) +{ + if (args->start.srcdevid == 0) { + if (memchr(args->start.srcdev_name, 0, + sizeof(args->start.srcdev_name)) == NULL) + return -ENAMETOOLONG; + } else { + args->start.srcdev_name[0] = 0; + } + + if (memchr(args->start.tgtdev_name, 0, + sizeof(args->start.tgtdev_name)) == NULL) + return -ENAMETOOLONG; + + return 0; +} + int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { @@ -737,10 +754,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, default: return -EINVAL; } - - if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || - args->start.tgtdev_name[0] == '\0') - return -EINVAL; + ret = btrfs_check_replace_dev_names(args); + if (ret < 0) + return ret; ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name, 
args->start.srcdevid, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c6907d533fe8..c843563914ca 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1307,12 +1307,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) * * @objectid: root id * @anon_dev: preallocated anonymous block device number for new roots, - * pass 0 for new allocation. + * pass NULL for a new allocation. * @check_ref: whether to check root item references, If true, return -ENOENT * for orphan roots */ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, - u64 objectid, dev_t anon_dev, + u64 objectid, dev_t *anon_dev, bool check_ref) { struct btrfs_root *root; @@ -1336,8 +1336,17 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, again: root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { - /* Shouldn't get preallocated anon_dev for cached roots */ - ASSERT(!anon_dev); + /* + * Some other caller may have read out the newly inserted + * subvolume already (for things like backref walk etc). Not + * that common but still possible. In that case, we just need + * to free the anon_dev. + */ + if (unlikely(anon_dev && *anon_dev)) { + free_anon_bdev(*anon_dev); + *anon_dev = 0; + } + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { btrfs_put_root(root); return ERR_PTR(-ENOENT); @@ -1357,7 +1366,7 @@ again: goto fail; } - ret = btrfs_init_fs_root(root, anon_dev); + ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0); if (ret) goto fail; @@ -1393,7 +1402,7 @@ fail: * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() * and once again by our caller. 
*/ - if (anon_dev) + if (anon_dev && *anon_dev) root->anon_dev = 0; btrfs_put_root(root); return ERR_PTR(ret); @@ -1409,7 +1418,7 @@ fail: struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref) { - return btrfs_get_root_ref(fs_info, objectid, 0, check_ref); + return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref); } /* @@ -1417,11 +1426,11 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, * the anonymous block device id * * @objectid: tree objectid - * @anon_dev: if zero, allocate a new anonymous block device or use the - * parameter value + * @anon_dev: if NULL, allocate a new anonymous block device or use the + * parameter value if not NULL */ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, - u64 objectid, dev_t anon_dev) + u64 objectid, dev_t *anon_dev) { return btrfs_get_root_ref(fs_info, objectid, anon_dev, true); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 9413726b329b..eb3473d1c1ac 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -61,7 +61,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref); struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, - u64 objectid, dev_t anon_dev); + u64 objectid, dev_t *anon_dev); struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 objectid); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f396aba92c57..8e8cc1111277 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1260,7 +1260,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 bytes_left, end; u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); - if (WARN_ON(start != aligned_start)) { + /* Adjust the range to be aligned to 512B sectors if necessary. 
*/ + if (start != aligned_start) { len -= aligned_start - start; len = round_down(len, 1 << SECTOR_SHIFT); start = aligned_start; @@ -4298,6 +4299,42 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, return 0; } +static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } else if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + struct btrfs_block_group *block_group; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + /* + * No lock is OK here because avail is monotonically + * decreasing, and this is just a hint.
+ */ + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; + } + } + spin_unlock(&fs_info->zone_active_bgs_lock); + } + + return 0; +} + static int prepare_allocation(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4308,19 +4345,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - if (ffe_ctl->for_treelog) { - spin_lock(&fs_info->treelog_bg_lock); - if (fs_info->treelog_bg) - ffe_ctl->hint_byte = fs_info->treelog_bg; - spin_unlock(&fs_info->treelog_bg_lock); - } - if (ffe_ctl->for_data_reloc) { - spin_lock(&fs_info->relocation_bg_lock); - if (fs_info->data_reloc_bg) - ffe_ctl->hint_byte = fs_info->data_reloc_bg; - spin_unlock(&fs_info->relocation_bg_lock); - } - return 0; + return prepare_allocation_zoned(fs_info, ffe_ctl); default: BUG(); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cfd2967f04a2..8b4bef05e222 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2480,6 +2480,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, u64 offset, u64 phys, u64 len, u32 flags) { + u64 cache_end; int ret = 0; /* Set at the end of extent_fiemap(). */ @@ -2489,15 +2490,102 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, goto assign; /* - * Sanity check, extent_fiemap() should have ensured that new - * fiemap extent won't overlap with cached one. - * Not recoverable. + * When iterating the extents of the inode, at extent_fiemap(), we may + * find an extent that starts at an offset behind the end offset of the + * previous extent we processed. 
This happens if fiemap is called + * without FIEMAP_FLAG_SYNC and there are ordered extents completing + * while we call btrfs_next_leaf() (through fiemap_next_leaf_item()). * - * NOTE: Physical address can overlap, due to compression + * For example we are in leaf X processing its last item, which is the + * file extent item for file range [512K, 1M[, and after + * btrfs_next_leaf() releases the path, there's an ordered extent that + * completes for the file range [768K, 2M[, and that results in trimming + * the file extent item so that it now corresponds to the file range + * [512K, 768K[ and a new file extent item is inserted for the file + * range [768K, 2M[, which may end up as the last item of leaf X or as + * the first item of the next leaf - in either case btrfs_next_leaf() + * will leave us with a path pointing to the new extent item, for the + * file range [768K, 2M[, since that's the first key that follows the + * last one we processed. So in order not to report overlapping extents + * to user space, we trim the length of the previously cached extent and + * emit it. + * + * Upon calling btrfs_next_leaf() we may also find an extent with an + * offset smaller than or equal to cache->offset, and this happens + * when we had a hole or prealloc extent with several delalloc ranges in + * it, but after btrfs_next_leaf() released the path, delalloc was + * flushed and the resulting ordered extents were completed, so we can + * now have found a file extent item for an offset that is smaller than + * or equal to what we have in cache->offset. We deal with this as + * described below. */ - if (cache->offset + cache->len > offset) { - WARN_ON(1); - return -EINVAL; + cache_end = cache->offset + cache->len; + if (cache_end > offset) { + if (offset == cache->offset) { + /* + * We cached a delalloc range (found in the io tree) for + * a hole or prealloc extent and we have now found a + * file extent item for the same offset.
What we have + * now is more recent and up to date, so discard what + * we had in the cache and use what we have just found. + */ + goto assign; + } else if (offset > cache->offset) { + /* + * The extent range we previously found ends after the + * offset of the file extent item we found and that + * offset falls somewhere in the middle of that previous + * extent range. So adjust the range we previously found + * to end at the offset of the file extent item we have + * just found, since this extent is more up to date. + * Emit that adjusted range and cache the file extent + * item we have just found. This corresponds to the case + * where a previously found file extent item was split + * due to an ordered extent completing. + */ + cache->len = offset - cache->offset; + goto emit; + } else { + const u64 range_end = offset + len; + + /* + * The offset of the file extent item we have just found + * is behind the cached offset. This means we were + * processing a hole or prealloc extent for which we + * have found delalloc ranges (in the io tree), so what + * we have in the cache is the last delalloc range we + * found while the file extent item we found can be + * either for a whole delalloc range we previously + * emitted or only a part of that range. + * + * We have two cases here: + * + * 1) The file extent item's range ends at or behind the + * cached extent's end. In this case just ignore the + * current file extent item because we don't want to + * overlap with previous ranges that may have been + * emitted already; + * + * 2) The file extent item starts behind the currently + * cached extent but its end offset goes beyond the + * end offset of the cached extent.
We don't want to + * overlap with a previous range that may have been + * emitted already, so we emit the currently cached + * extent and then partially store the current file + * extent item's range in the cache, for the subrange + * going from the cached extent's end to the end of the + * file extent item. + */ + if (range_end <= cache_end) + return 0; + + if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC))) + phys += cache_end - offset; + + offset = cache_end; + len = range_end - cache_end; + goto emit; + } } /* @@ -2517,6 +2605,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, return 0; } +emit: /* Not mergeable, need to submit cached one */ ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, cache->len, cache->flags); @@ -2689,16 +2778,34 @@ static int fiemap_process_hole(struct btrfs_inode *inode, * it beyond i_size. */ while (cur_offset < end && cur_offset < i_size) { + struct extent_state *cached_state = NULL; u64 delalloc_start; u64 delalloc_end; u64 prealloc_start; + u64 lockstart; + u64 lockend; u64 prealloc_len = 0; bool delalloc; + lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize); + lockend = round_up(end, inode->root->fs_info->sectorsize); + + /* + * We are only locking for the delalloc range because that's the + * only thing that can change here. With fiemap we have a lock + * on the inode, so no buffered or direct writes can happen. + * + * However mmaps and normal page writeback will cause this to + * change arbitrarily. We have to lock the extent lock here to + * make sure that nobody messes with the tree while we're doing + * btrfs_find_delalloc_in_range.
+ */ + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, delalloc_cached_state, &delalloc_start, &delalloc_end); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); if (!delalloc) break; @@ -2866,15 +2973,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { const u64 ino = btrfs_ino(inode); - struct extent_state *cached_state = NULL; struct extent_state *delalloc_cached_state = NULL; struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; u64 last_extent_end; u64 prev_extent_end; - u64 lockstart; - u64 lockend; + u64 range_start; + u64 range_end; + const u64 sectorsize = inode->root->fs_info->sectorsize; bool stopped = false; int ret; @@ -2885,22 +2992,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - lockstart = round_down(start, inode->root->fs_info->sectorsize); - lockend = round_up(start + len, inode->root->fs_info->sectorsize); - prev_extent_end = lockstart; - - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + range_start = round_down(start, sectorsize); + range_end = round_up(start + len, sectorsize); + prev_extent_end = range_start; ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); if (ret < 0) - goto out_unlock; + goto out; btrfs_release_path(path); path->reada = READA_FORWARD; - ret = fiemap_search_slot(inode, path, lockstart); + ret = fiemap_search_slot(inode, path, range_start); if (ret < 0) { - goto out_unlock; + goto out; } else if (ret > 0) { /* * No file extent item found, but we may have delalloc between @@ -2910,7 +3014,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto check_eof_delalloc; } - while (prev_extent_end < lockend) { + while (prev_extent_end < range_end) { 
struct extent_buffer *leaf = path->nodes[0]; struct btrfs_file_extent_item *ei; struct btrfs_key key; @@ -2933,21 +3037,21 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, * The first iteration can leave us at an extent item that ends * before our range's start. Move to the next item. */ - if (extent_end <= lockstart) + if (extent_end <= range_start) goto next_item; backref_ctx->curr_leaf_bytenr = leaf->start; /* We have in implicit hole (NO_HOLES feature enabled). */ if (prev_extent_end < key.offset) { - const u64 range_end = min(key.offset, lockend) - 1; + const u64 hole_end = min(key.offset, range_end) - 1; ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, 0, 0, 0, - prev_extent_end, range_end); + prev_extent_end, hole_end); if (ret < 0) { - goto out_unlock; + goto out; } else if (ret > 0) { /* fiemap_fill_next_extent() told us to stop. */ stopped = true; @@ -2955,7 +3059,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } /* We've reached the end of the fiemap range, stop. */ - if (key.offset >= lockend) { + if (key.offset >= range_end) { stopped = true; break; } @@ -3003,7 +3107,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_gen, backref_ctx); if (ret < 0) - goto out_unlock; + goto out; else if (ret > 0) flags |= FIEMAP_EXTENT_SHARED; } @@ -3014,7 +3118,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } if (ret < 0) { - goto out_unlock; + goto out; } else if (ret > 0) { /* fiemap_fill_next_extent() told us to stop. 
*/ stopped = true; @@ -3025,12 +3129,12 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, next_item: if (fatal_signal_pending(current)) { ret = -EINTR; - goto out_unlock; + goto out; } ret = fiemap_next_leaf_item(inode, path); if (ret < 0) { - goto out_unlock; + goto out; } else if (ret > 0) { /* No more file extent items for this inode. */ break; @@ -3049,29 +3153,41 @@ check_eof_delalloc: btrfs_free_path(path); path = NULL; - if (!stopped && prev_extent_end < lockend) { + if (!stopped && prev_extent_end < range_end) { ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, - 0, 0, 0, prev_extent_end, lockend - 1); + 0, 0, 0, prev_extent_end, range_end - 1); if (ret < 0) - goto out_unlock; - prev_extent_end = lockend; + goto out; + prev_extent_end = range_end; } if (cache.cached && cache.offset + cache.len >= last_extent_end) { const u64 i_size = i_size_read(&inode->vfs_inode); if (prev_extent_end < i_size) { + struct extent_state *cached_state = NULL; u64 delalloc_start; u64 delalloc_end; + u64 lockstart; + u64 lockend; bool delalloc; + lockstart = round_down(prev_extent_end, sectorsize); + lockend = round_up(i_size, sectorsize); + + /* + * See the comment in fiemap_process_hole as to why + * we're doing the locking here. 
+ */ + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); delalloc = btrfs_find_delalloc_in_range(inode, prev_extent_end, i_size - 1, &delalloc_cached_state, &delalloc_start, &delalloc_end); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); if (!delalloc) cache.flags |= FIEMAP_EXTENT_LAST; } else { @@ -3080,10 +3196,6 @@ check_eof_delalloc: } ret = emit_last_fiemap_cache(fieinfo, &cache); - -out_unlock: - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); out: free_extent_state(delalloc_cached_state); btrfs_free_backref_share_ctx(backref_ctx); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 809b11472a80..4795738d5785 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3184,8 +3184,23 @@ out: unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); - /* Drop extent maps for the part of the extent we didn't write. */ - btrfs_drop_extent_map_range(inode, unwritten_start, end, false); + /* + * Drop extent maps for the part of the extent we didn't write. + * + * We have an exception here for the free_space_inode, this is + * because when we do btrfs_get_extent() on the free space inode + * we will search the commit root. If this is a new block group + * we won't find anything, and we will trip over the assert in + * writepage where we do ASSERT(em->block_start != + * EXTENT_MAP_HOLE). + * + * Theoretically we could also skip this for any NOCOW extent as + * we don't mess with the extent map tree in the NOCOW case, but + * for now simply skip this if we are the free space inode. 
+ */ + if (!btrfs_is_free_space_inode(inode)) + btrfs_drop_extent_map_range(inode, unwritten_start, + end, false); /* * If the ordered extent had an IOERR or something else went @@ -4458,6 +4473,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) u64 root_flags; int ret; + down_write(&fs_info->subvol_sem); + /* * Don't allow to delete a subvolume with send in progress. This is * inside the inode lock so the error handling that has to drop the bit @@ -4469,25 +4486,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } if (atomic_read(&dest->nr_swapfiles)) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", root->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - down_write(&fs_info->subvol_sem); - ret = may_destroy_subvol(dest); if (ret) - goto out_up_write; + goto out_undead; btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -4497,7 +4514,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) - goto out_up_write; + goto out_undead; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -4563,15 +4580,17 @@ out_end_trans: inode->i_flags |= S_DEAD; out_release: btrfs_subvolume_release_metadata(root, &block_rsv); -out_up_write: - up_write(&fs_info->subvol_sem); +out_undead: if (ret) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - } else { + } 
+out_up_write: + up_write(&fs_info->subvol_sem); + if (!ret) { d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); @@ -7816,6 +7835,7 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); int ret; ret = fiemap_prep(inode, fieinfo, start, &len, 0); @@ -7841,7 +7861,26 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); + btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED); + + /* + * We did an initial flush to avoid holding the inode's lock while + * triggering writeback and waiting for the completion of IO and ordered + * extents. Now after we locked the inode we do it again, because it's + * possible a new write may have happened in between those two steps. + */ + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { + ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); + if (ret) { + btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); + return ret; + } + } + + ret = extent_fiemap(btrfs_inode, fieinfo, start, len); + btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); + + return ret; } static int btrfs_writepages(struct address_space *mapping, @@ -10269,6 +10308,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) return -EINVAL; + /* + * Compressed extents should always have checksums, so error out if we + * have a NOCOW file or inode was created while mounted with NODATASUM. + */ + if (inode->flags & BTRFS_INODE_NODATASUM) + return -EINVAL; + orig_count = iov_iter_count(from); /* The extent size must be sane. 
*/ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 41b479861b3c..9876ee27f069 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -721,7 +721,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, free_extent_buffer(leaf); leaf = NULL; - new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); + new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); @@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, return -EOPNOTSUPP; } + if (btrfs_root_refs(&root->root_item) == 0) + return -ENOENT; + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; @@ -2608,6 +2611,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EFAULT; goto out; } + if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) { + ret = -EOPNOTSUPP; + goto out; + } /* compression requires us to start the IO */ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { range.flags |= BTRFS_DEFRAG_RANGE_START_IO; @@ -2691,7 +2698,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; - struct bdev_handle *bdev_handle = NULL; + struct file *bdev_file = NULL; int ret; bool cancel = false; @@ -2728,7 +2735,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; /* Exclusive operation is now claimed */ - ret = btrfs_rm_device(fs_info, &args, &bdev_handle); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); btrfs_exclop_finish(fs_info); @@ -2742,8 +2749,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } err_drop: mnt_drop_write_file(file); - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2756,7 +2763,7 @@ static long 
btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; - struct bdev_handle *bdev_handle = NULL; + struct file *bdev_file = NULL; int ret; bool cancel = false; @@ -2783,15 +2790,15 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, &args, &bdev_handle); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } mnt_drop_write_file(file); - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -3808,6 +3815,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto out; } + if (sa->create && is_fstree(sa->qgroupid)) { + ret = -EINVAL; + goto out; + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 1131d5a29d61..e43bc0fdc74e 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; size_t max_segment_len = WORKSPACE_BUF_LENGTH; int ret = 0; - char *kaddr; - unsigned long bytes; if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) return -EUCLEAN; @@ -451,7 +451,7 @@ int 
lzo_decompress(struct list_head *ws, const u8 *data_in, } data_in += LZO_LEN; - out_len = PAGE_SIZE; + out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { pr_warn("BTRFS: decompress failed!\n"); @@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, goto out; } - if (out_len < start_byte) { + ASSERT(out_len <= sectorsize); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len); + /* Early end, considered as an error. */ + if (unlikely(out_len < destlen)) { ret = -EIO; - goto out; + memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len); } - - /* - * the caller is already checking against PAGE_SIZE, but lets - * move this check closer to the memcpy/memset - */ - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes = min_t(unsigned long, destlen, out_len - start_byte); - - kaddr = kmap_local_page(dest_page); - memcpy(kaddr, workspace->buf + start_byte, bytes); - - /* - * btrfs_getblock is doing a zero on the tail of the page too, - * but this will cover anything missing from the decompressed - * data. 
- */ - if (bytes < destlen) - memset(kaddr+bytes, 0, destlen-bytes); - kunmap_local(kaddr); out: return ret; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 63b426cc7798..5470e1cdf10c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1736,6 +1736,15 @@ out: return ret; } +static bool qgroup_has_usage(struct btrfs_qgroup *qgroup) +{ + return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 || + qgroup->excl > 0 || qgroup->excl_cmpr > 0 || + qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0); +} + int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1755,6 +1764,11 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) goto out; } + if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) { + ret = -EBUSY; + goto out; + } + /* Check if there are no children of this qgroup */ if (!list_empty(&qgroup->members)) { ret = -EBUSY; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 6486f0d7e993..8c4fc98ca9ce 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, out_unlock: spin_unlock(&fs_info->ref_verify_lock); out: - if (ret) + if (ret) { + btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + } return ret; } @@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) } } if (ret) { - btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); btrfs_free_ref_cache(fs_info); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } btrfs_free_path(path); return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a01807cbd4d4..0123d2728923 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1098,12 +1098,22 @@ out: static void scrub_read_endio(struct btrfs_bio *bbio) { struct scrub_stripe *stripe = bbio->private; + 
struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + int num_sectors; + u32 bio_size = 0; + int i; + + ASSERT(sector_nr < stripe->nr_sectors); + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; + num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors); - bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors); + bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); + bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); } else { - bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors); + bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1636,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; @@ -1646,6 +1659,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, struct page *page = scrub_stripe_get_page(stripe, i); unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); + /* We're beyond the chunk boundary, no need to read anymore. */ + if (i >= nr_sectors) + break; + /* The current sector cannot be merged, submit the bio. 
*/ if (bbio && ((i > 0 && @@ -1701,6 +1718,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; int mirror = stripe->mirror_num; ASSERT(stripe->bg); @@ -1715,14 +1735,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, scrub_read_endio, stripe); - /* Read the whole stripe. */ bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; - for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) { + /* Read the whole range inside the chunk boundary. */ + for (unsigned int cur = 0; cur < nr_sectors; cur++) { + struct page *page = scrub_stripe_get_page(stripe, cur); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); int ret; - ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0); + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); /* We should have allocated enough bio vectors. */ - ASSERT(ret == PAGE_SIZE); + ASSERT(ret == fs_info->sectorsize); } atomic_inc(&stripe->pending_io); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4e36550618e5..e48a063ef085 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6705,11 +6705,20 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (ret) goto out; } - if (sctx->cur_inode_last_extent < - sctx->cur_inode_size) { - ret = send_hole(sctx, sctx->cur_inode_size); - if (ret) + if (sctx->cur_inode_last_extent < sctx->cur_inode_size) { + ret = range_is_hole_in_parent(sctx, + sctx->cur_inode_last_extent, + sctx->cur_inode_size); + if (ret < 0) { goto out; + } else if (ret == 0) { + ret = send_hole(sctx, sctx->cur_inode_size); + if (ret < 0) + goto out; + } else { + /* Range is already a hole, skip. 
*/ + ret = 0; + } } } if (need_truncate) { @@ -8111,7 +8120,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) } if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { - ret = -EINVAL; + ret = -EOPNOTSUPP; goto out; } @@ -8205,8 +8214,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), - arg->clone_sources_count + 1, + sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1, + sizeof(*sctx->clone_roots), GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 571bb13587d5..3b54eb583474 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -856,7 +856,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { - u64 global_rsv_size = fs_info->global_block_rsv.reserved; + const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); u64 ordered, delalloc; u64 thresh; u64 used; @@ -956,8 +956,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1; delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); if (ordered >= delalloc) - used += fs_info->delayed_refs_rsv.reserved + - fs_info->delayed_block_rsv.reserved; + used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) + + btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv); else used += space_info->bytes_may_use - global_rsv_size; @@ -1173,7 +1173,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) enum btrfs_flush_state flush; u64 delalloc_size = 0; u64 to_reclaim, block_rsv_size; - u64 global_rsv_size = global_rsv->reserved; + const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv); loops++; @@ -1185,9 +1185,9 @@ static void 
btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * assume it's tied up in delalloc reservations. */ block_rsv_size = global_rsv_size + - delayed_block_rsv->reserved + - delayed_refs_rsv->reserved + - trans_rsv->reserved; + btrfs_block_rsv_reserved(delayed_block_rsv) + + btrfs_block_rsv_reserved(delayed_refs_rsv) + + btrfs_block_rsv_reserved(trans_rsv); if (block_rsv_size < space_info->bytes_may_use) delalloc_size = space_info->bytes_may_use - block_rsv_size; @@ -1207,16 +1207,16 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) to_reclaim = delalloc_size; flush = FLUSH_DELALLOC; } else if (space_info->bytes_pinned > - (delayed_block_rsv->reserved + - delayed_refs_rsv->reserved)) { + (btrfs_block_rsv_reserved(delayed_block_rsv) + + btrfs_block_rsv_reserved(delayed_refs_rsv))) { to_reclaim = space_info->bytes_pinned; flush = COMMIT_TRANS; - } else if (delayed_block_rsv->reserved > - delayed_refs_rsv->reserved) { - to_reclaim = delayed_block_rsv->reserved; + } else if (btrfs_block_rsv_reserved(delayed_block_rsv) > + btrfs_block_rsv_reserved(delayed_refs_rsv)) { + to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv); flush = FLUSH_DELAYED_ITEMS_NR; } else { - to_reclaim = delayed_refs_rsv->reserved; + to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv); flush = FLUSH_DELAYED_REFS_NR; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 93511d54abf8..0e49dab8dad2 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -475,7 +475,8 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - folio_start_writeback(folio); + if (!folio_test_writeback(folio)) + folio_start_writeback(folio); spin_unlock_irqrestore(&subpage->lock, flags); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 896acfda1789..101f786963d4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1457,6 
+1457,14 @@ static int btrfs_reconfigure(struct fs_context *fc) btrfs_info_to_ctx(fs_info, &old_ctx); + /* + * This is our "bind mount" trick, we don't want to allow the user to do + * anything other than mount a different ro/rw and a different subvol, + * all of the mount options should be maintained. + */ + if (mount_reconfigure) + ctx->mount_opt = old_ctx.mount_opt; + sync_filesystem(sb); set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5b3333ceef04..bf8e64c766b6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 *delayed_refs_bytes) { - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info; - u64 extra_delayed_refs_bytes = 0; - u64 bytes; + u64 bytes = num_bytes + *delayed_refs_bytes; int ret; /* - * If there's a gap between the size of the delayed refs reserve and - * its reserved space, than some tasks have added delayed refs or bumped - * its size otherwise (due to block group creation or removal, or block - * group item update). Also try to allocate that gap in order to prevent - * using (and possibly abusing) the global reserve when committing the - * transaction. - */ - if (flush == BTRFS_RESERVE_FLUSH_ALL && - !btrfs_block_rsv_full(delayed_refs_rsv)) { - spin_lock(&delayed_refs_rsv->lock); - if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) - extra_delayed_refs_bytes = delayed_refs_rsv->size - - delayed_refs_rsv->reserved; - spin_unlock(&delayed_refs_rsv->lock); - } - - bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes; - - /* * We want to reserve all the bytes we may need all at once, so we only * do 1 enospc flushing cycle per transaction start. 
*/ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); - if (ret == 0) { - if (extra_delayed_refs_bytes > 0) - btrfs_migrate_to_delayed_refs_rsv(fs_info, - extra_delayed_refs_bytes); - return 0; - } - - if (extra_delayed_refs_bytes > 0) { - bytes -= extra_delayed_refs_bytes; - ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); - if (ret == 0) - return 0; - } /* * If we are an emergency flush, which can steal from the global block * reserve, then attempt to not reserve space for the delayed refs, as * we will consume space for them from the global block reserve. */ - if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { bytes -= *delayed_refs_bytes; *delayed_refs_bytes = 0; ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); @@ -1868,7 +1834,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.offset = (u64)-1; - pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); + pending->snap = btrfs_get_new_fs_root(fs_info, objectid, &pending->anon_dev); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); pending->snap = NULL; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 50fdc69fdddf..6eccf8496486 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1436,7 +1436,7 @@ static int check_extent_item(struct extent_buffer *leaf, if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", - ptr, inline_type, end); + ptr, btrfs_extent_inline_ref_size(inline_type), end); return -EUCLEAN; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4c32497311d2..e180da4cc227 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -468,39 +468,39 @@ static noinline struct btrfs_fs_devices *find_fsid( static int btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, - int 
flush, struct bdev_handle **bdev_handle, + int flush, struct file **bdev_file, struct btrfs_super_block **disk_super) { struct block_device *bdev; int ret; - *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL); + *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL); - if (IS_ERR(*bdev_handle)) { - ret = PTR_ERR(*bdev_handle); + if (IS_ERR(*bdev_file)) { + ret = PTR_ERR(*bdev_file); goto error; } - bdev = (*bdev_handle)->bdev; + bdev = file_bdev(*bdev_file); if (flush) sync_blockdev(bdev); ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { - bdev_release(*bdev_handle); + fput(*bdev_file); goto error; } invalidate_bdev(bdev); *disk_super = btrfs_read_dev_super(bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - bdev_release(*bdev_handle); + fput(*bdev_file); goto error; } return 0; error: - *bdev_handle = NULL; + *bdev_file = NULL; return ret; } @@ -643,7 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, blk_mode_t flags, void *holder) { - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct btrfs_super_block *disk_super; u64 devid; int ret; @@ -654,7 +654,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev_handle, &disk_super); + &bdev_file, &disk_super); if (ret) return ret; @@ -678,20 +678,20 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = true; } else { - if (bdev_read_only(bdev_handle->bdev)) + if (bdev_read_only(file_bdev(bdev_file))) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); else set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!bdev_nonrot(bdev_handle->bdev)) + if (!bdev_nonrot(file_bdev(bdev_file))) fs_devices->rotating = true; - if (bdev_max_discard_sectors(bdev_handle->bdev)) + if 
(bdev_max_discard_sectors(file_bdev(bdev_file))) fs_devices->discardable = true; - device->bdev_handle = bdev_handle; - device->bdev = bdev_handle->bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); fs_devices->open_devices++; @@ -706,7 +706,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - bdev_release(bdev_handle); + fput(bdev_file); return -EINVAL; } @@ -1015,10 +1015,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, if (device->devid == BTRFS_DEV_REPLACE_DEVID) continue; - if (device->bdev_handle) { - bdev_release(device->bdev_handle); + if (device->bdev_file) { + fput(device->bdev_file); device->bdev = NULL; - device->bdev_handle = NULL; + device->bdev_file = NULL; fs_devices->open_devices--; } if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -1063,7 +1063,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - bdev_release(device->bdev_handle); + fput(device->bdev_file); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1316,7 +1316,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, struct btrfs_super_block *disk_super; bool new_device_added = false; struct btrfs_device *device = NULL; - struct bdev_handle *bdev_handle; + struct file *bdev_file; u64 bytenr, bytenr_orig; int ret; @@ -1339,18 +1339,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. 
*/ - bdev_handle = bdev_open_by_path(path, flags, NULL, NULL); - if (IS_ERR(bdev_handle)) - return ERR_CAST(bdev_handle); + bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file); bytenr_orig = btrfs_sb_offset(0); - ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr); + ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); if (ret) { device = ERR_PTR(ret); goto error_bdev_put; } - disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr, + disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, bytenr_orig); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); @@ -1381,7 +1381,7 @@ free_disk_super: btrfs_release_disk_super(disk_super); error_bdev_put: - bdev_release(bdev_handle); + fput(bdev_file); return device; } @@ -2057,7 +2057,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct bdev_handle **bdev_handle) + struct file **bdev_file) { struct btrfs_trans_handle *trans; struct btrfs_device *device; @@ -2166,7 +2166,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, btrfs_assign_next_active_device(device, NULL); - if (device->bdev_handle) { + if (device->bdev_file) { cur_devices->open_devices--; /* remove sysfs entry */ btrfs_sysfs_remove_device(device); @@ -2182,9 +2182,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb - * write lock, and bdev_release() will pull in the ->open_mutex on - * the block device and it's dependencies. Instead just flush the - * device and let the caller do the final bdev_release. + * write lock, and fput() on the block device will pull in the + * ->open_mutex on the block device and its dependencies. Instead + * just flush the device and let the caller do the final fput().
*/ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_scratch_superblocks(fs_info, device->bdev, @@ -2195,7 +2195,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } } - *bdev_handle = device->bdev_handle; + *bdev_file = device->bdev_file; synchronize_rcu(); btrfs_free_device(device); @@ -2332,7 +2332,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, const char *path) { struct btrfs_super_block *disk_super; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int ret; if (!path || !path[0]) @@ -2350,7 +2350,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, } ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, - &bdev_handle, &disk_super); + &bdev_file, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); return ret; @@ -2363,7 +2363,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - bdev_release(bdev_handle); + fput(bdev_file); return 0; } @@ -2583,7 +2583,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct btrfs_device *device; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices = NULL; @@ -2596,12 +2596,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); - if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) { + if 
(!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) { ret = -EINVAL; goto error; } @@ -2613,11 +2613,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path locked = true; } - sync_blockdev(bdev_handle->bdev); + sync_blockdev(file_bdev(bdev_file)); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { - if (device->bdev == bdev_handle->bdev) { + if (device->bdev == file_bdev(bdev_file)) { ret = -EEXIST; rcu_read_unlock(); goto error; @@ -2633,8 +2633,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } device->fs_info = fs_info; - device->bdev_handle = bdev_handle; - device->bdev = bdev_handle->bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error_free_device; @@ -2817,7 +2817,7 @@ error_free_zone: error_free_device: btrfs_free_device(device); error: - bdev_release(bdev_handle); + fput(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -3087,7 +3087,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, map = btrfs_find_chunk_map(fs_info, logical, length); if (unlikely(!map)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); @@ -3095,7 +3094,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, } if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", logical, logical + length, map->start, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 53f87f398da7..a11854912d53 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -90,7 +90,7 @@ struct btrfs_device { u64 generation; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct 
block_device *bdev; struct btrfs_zoned_device_info *zone_info; @@ -661,7 +661,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct bdev_handle **bdev_handle); + struct file **bdev_file); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 36cf1f0e338e..8da66ea699e8 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -354,18 +354,13 @@ done: } int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; int wbits = MAX_WBITS; - unsigned long bytes_left; - unsigned long total_out = 0; - unsigned long pg_offset = 0; - - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes_left = destlen; + unsigned long to_copy; workspace->strm.next_in = data_in; workspace->strm.avail_in = srclen; @@ -390,60 +385,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, return -EIO; } - while (bytes_left > 0) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); - if (ret != Z_OK && ret != Z_STREAM_END) - break; - - buf_start = total_out; - total_out = workspace->strm.total_out; - - if (total_out == buf_start) { - ret = -EIO; - break; - } - - if (total_out <= start_byte) - goto next; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min(PAGE_SIZE - pg_offset, - PAGE_SIZE - (buf_offset % PAGE_SIZE)); - bytes = 
min(bytes, bytes_left); + /* + * Everything (in/out buf) should be at most one sector, there should + * be no need to switch any input/output buffer. + */ + ret = zlib_inflate(&workspace->strm, Z_FINISH); + to_copy = min(workspace->strm.total_out, destlen); + if (ret != Z_STREAM_END) + goto out; - memcpy_to_page(dest_page, pg_offset, - workspace->buf + buf_offset, bytes); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy); - pg_offset += bytes; - bytes_left -= bytes; -next: - workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = workspace->buf_size; - } - - if (ret != Z_STREAM_END && bytes_left != 0) +out: + if (unlikely(to_copy != destlen)) { + pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n", + to_copy, destlen); ret = -EIO; - else + } else { ret = 0; + } zlib_inflateEnd(&workspace->strm); - /* - * this should only happen if zlib returned fewer bytes than we - * expected. btrfs_get_block is responsible for zeroing from the - * end of the inline extent (destlen) to the end of the page - */ - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); - } + if (unlikely(to_copy < destlen)) + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 5bd76813b23f..aea51fd850cd 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -824,11 +824,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, reset = &zones[1]; if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { + unsigned int nofs_flags; + ASSERT(sb_zone_is_full(reset)); + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - reset->start, reset->len, - GFP_NOFS); + reset->start, reset->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -974,11 +977,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) * explicit ZONE_FINISH is not necessary. 
*/ if (zone->wp != zone->start + zone->capacity) { + unsigned int nofs_flags; int ret; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, zone->start, - zone->len, GFP_NOFS); + zone->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; } @@ -996,11 +1002,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) { + unsigned int nofs_flags; sector_t zone_sectors; sector_t nr_sectors; u8 zone_sectors_shift; u32 sb_zone; u32 nr_zones; + int ret; zone_sectors = bdev_zone_sectors(bdev); zone_sectors_shift = ilog2(zone_sectors); @@ -1011,9 +1019,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) if (sb_zone + 1 >= nr_zones) return -ENOENT; - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - zone_start_sector(sb_zone, bdev), - zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + zone_start_sector(sb_zone, bdev), + zone_sectors * BTRFS_NR_SB_LOG_ZONES); + memalloc_nofs_restore(nofs_flags); + return ret; } /* @@ -1124,12 +1135,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos) int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes) { + unsigned int nofs_flags; int ret; *bytes = 0; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, - physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, - GFP_NOFS); + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -1639,6 +1652,15 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } out: + /* Reject non SINGLE data profiles without RST */ + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && + (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && + !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s 
needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + if (cache->alloc_offset > cache->zone_capacity) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", @@ -1670,6 +1692,7 @@ out: } bitmap_free(active); kfree(zone_info); + btrfs_free_chunk_map(map); return ret; } @@ -2055,6 +2078,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; + spin_lock(&fs_info->zone_active_bgs_lock); spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; @@ -2067,7 +2091,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } - spin_lock(&fs_info->zone_active_bgs_lock); for (i = 0; i < map->num_stripes; i++) { struct btrfs_zoned_device_info *zinfo; int reserved = 0; @@ -2087,20 +2110,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) */ if (atomic_read(&zinfo->active_zones_left) <= reserved) { ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!is_data) zinfo->reserved_active_zones--; } - spin_unlock(&fs_info->zone_active_bgs_lock); /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); @@ -2108,8 +2128,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) /* For the active block group list */ btrfs_get_block_group(block_group); - - spin_lock(&fs_info->zone_active_bgs_lock); list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); @@ -2117,6 +2135,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&fs_info->zone_active_bgs_lock); 
return ret; } @@ -2238,14 +2257,16 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_device *device = map->stripes[i].dev; const u64 physical = map->stripes[i].physical; struct btrfs_zoned_device_info *zinfo = device->zone_info; + unsigned int nofs_flags; if (zinfo->max_active_zones == 0) continue; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, - zinfo->zone_size >> SECTOR_SHIFT, - GFP_NOFS); + zinfo->zone_size >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; diff --git a/fs/buffer.c b/fs/buffer.c index d3bcf601d3e5..4f73d23c2c46 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -55,7 +55,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, - struct writeback_control *wbc); + enum rw_hint hint, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -464,7 +464,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); * a successful fsync(). For example, ext2 indirect blocks need to be * written back and waited upon before fsync() returns. * - * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), + * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the * management of a list of dependent buffers at ->i_mapping->i_private_list. 
* @@ -1889,7 +1889,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -1944,7 +1945,8 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -2756,6 +2758,7 @@ static void end_bio_bh_io_sync(struct bio *bio) } static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, + enum rw_hint write_hint, struct writeback_control *wbc) { const enum req_op op = opf & REQ_OP_MASK; @@ -2783,6 +2786,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_write_hint = write_hint; __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); @@ -2802,7 +2806,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, void submit_bh(blk_opf_t opf, struct buffer_head *bh) { - submit_bh_wbc(opf, bh, NULL); + submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL); } EXPORT_SYMBOL(submit_bh); @@ -3121,12 +3125,8 @@ void __init buffer_init(void) unsigned long nrpages; int ret; - bh_cachep = kmem_cache_create("buffer_head", - sizeof(struct buffer_head), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD), - NULL); - + bh_cachep = KMEM_CACHE(buffer_head, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC); /* * Limit the bh occupancy to 10% of ZONE_NORMAL */ diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index 8df715640a48..c5a070550ee3 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -2,7 +2,7 @@ config CACHEFILES tristate "Filesystem caching on files" - 
depends on FSCACHE && BLOCK + depends on NETFS_SUPPORT && FSCACHE && BLOCK help This permits use of a mounted filesystem as a cache for other filesystems - primarily networking filesystems - thus allowing fast diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c index 7077f72e6f47..f449f7340aad 100644 --- a/fs/cachefiles/cache.c +++ b/fs/cachefiles/cache.c @@ -168,6 +168,8 @@ error_unsupported: dput(root); error_open_root: cachefiles_end_secure(cache, saved_cred); + put_cred(cache->cache_cred); + cache->cache_cred = NULL; error_getsec: fscache_relinquish_cache(cache_cookie); cache->cache = NULL; diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 3f24905f4066..6465e2574230 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -816,6 +816,7 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache) cachefiles_put_directory(cache->graveyard); cachefiles_put_directory(cache->store); mntput(cache->mnt); + put_cred(cache->cache_cred); kfree(cache->rootdirname); kfree(cache->secctx); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 4a87c9d714a9..d33169f0018b 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -246,7 +246,7 @@ extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres, enum fscache_want_state want_state); extern int __cachefiles_prepare_write(struct cachefiles_object *object, struct file *file, - loff_t *_start, size_t *_len, + loff_t *_start, size_t *_len, size_t upper_len, bool no_space_allocated_yet); extern int __cachefiles_write(struct cachefiles_object *object, struct file *file, diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 5857241c5918..1d685357e67f 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -517,18 +517,26 @@ cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres, */ int __cachefiles_prepare_write(struct cachefiles_object *object, struct file *file, - loff_t *_start, size_t *_len, + loff_t *_start, 
size_t *_len, size_t upper_len, bool no_space_allocated_yet) { struct cachefiles_cache *cache = object->volume->cache; loff_t start = *_start, pos; - size_t len = *_len, down; + size_t len = *_len; int ret; /* Round to DIO size */ - down = start - round_down(start, PAGE_SIZE); - *_start = start - down; - *_len = round_up(down + len, PAGE_SIZE); + start = round_down(*_start, PAGE_SIZE); + if (start != *_start || *_len > upper_len) { + /* Probably asked to cache a streaming write written into the + * pagecache when the cookie was temporarily out of service to + * culling. + */ + fscache_count_dio_misfit(); + return -ENOBUFS; + } + + *_len = round_up(len, PAGE_SIZE); /* We need to work out whether there's sufficient disk space to perform * the write - but we can skip that check if we have space already @@ -539,7 +547,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, pos = cachefiles_inject_read_error(); if (pos == 0) - pos = vfs_llseek(file, *_start, SEEK_DATA); + pos = vfs_llseek(file, start, SEEK_DATA); if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { if (pos == -ENXIO) goto check_space; /* Unallocated tail */ @@ -547,7 +555,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, cachefiles_trace_seek_error); return pos; } - if ((u64)pos >= (u64)*_start + *_len) + if ((u64)pos >= (u64)start + *_len) goto check_space; /* Unallocated region */ /* We have a block that's at least partially filled - if we're low on @@ -560,13 +568,13 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, pos = cachefiles_inject_read_error(); if (pos == 0) - pos = vfs_llseek(file, *_start, SEEK_HOLE); + pos = vfs_llseek(file, start, SEEK_HOLE); if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { trace_cachefiles_io_error(object, file_inode(file), pos, cachefiles_trace_seek_error); return pos; } - if ((u64)pos >= (u64)*_start + *_len) + if ((u64)pos >= (u64)start + *_len) return 0; /* Fully allocated */ /* Partially allocated, but insufficient space: cull. 
*/ @@ -574,7 +582,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, ret = cachefiles_inject_remove_error(); if (ret == 0) ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - *_start, *_len); + start, *_len); if (ret < 0) { trace_cachefiles_io_error(object, file_inode(file), ret, cachefiles_trace_fallocate_error); @@ -591,8 +599,8 @@ check_space: } static int cachefiles_prepare_write(struct netfs_cache_resources *cres, - loff_t *_start, size_t *_len, loff_t i_size, - bool no_space_allocated_yet) + loff_t *_start, size_t *_len, size_t upper_len, + loff_t i_size, bool no_space_allocated_yet) { struct cachefiles_object *object = cachefiles_cres_object(cres); struct cachefiles_cache *cache = object->volume->cache; @@ -608,7 +616,7 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres, cachefiles_begin_secure(cache, &saved_cred); ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres), - _start, _len, + _start, _len, upper_len, no_space_allocated_yet); cachefiles_end_secure(cache, saved_cred); return ret; diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index b8fbbb1961bb..4ba42f1fa3b4 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -50,7 +50,7 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, return -ENOBUFS; cachefiles_begin_secure(cache, &saved_cred); - ret = __cachefiles_prepare_write(object, file, &pos, &len, true); + ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true); cachefiles_end_secure(cache, saved_cred); if (ret < 0) return ret; @@ -539,6 +539,9 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object) struct fscache_volume *volume = object->volume->vcookie; size_t volume_key_size, cookie_key_size, data_len; + if (!object->ondemand) + return 0; + /* * CacheFiles will firstly check the cache file under the root cache * directory. 
If the coherency check failed, it will fallback to diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 94df854147d3..7249d70e1a43 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -7,6 +7,7 @@ config CEPH_FS select CRYPTO_AES select CRYPTO select NETFS_SUPPORT + select FS_ENCRYPTION_ALGS if FS_ENCRYPTION default n help Choose Y or M here to include support for mounting the diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 13af429ab030..1340d77124ae 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -159,27 +159,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset, ceph_put_snap_context(snapc); } - folio_wait_fscache(folio); -} - -static bool ceph_release_folio(struct folio *folio, gfp_t gfp) -{ - struct inode *inode = folio->mapping->host; - struct ceph_client *cl = ceph_inode_to_client(inode); - - doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode), - folio->index, folio_test_dirty(folio) ? "" : "not "); - - if (folio_test_private(folio)) - return false; - - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - ceph_fscache_note_page_release(inode); - return true; + netfs_invalidate_folio(folio, offset, length); } static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) @@ -357,6 +337,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) u64 len = subreq->len; bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); u64 off = subreq->start; + int extent_cnt; if (ceph_inode_is_shutdown(inode)) { err = -EIO; @@ -370,8 +351,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, off, &len, 0, 1, sparse ? 
CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ, - CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, - NULL, ci->i_truncate_seq, ci->i_truncate_size, false); + CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); req = NULL; @@ -379,7 +360,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) } if (sparse) { - err = ceph_alloc_sparse_ext_map(&req->r_ops[0]); + extent_cnt = __ceph_sparse_read_ext_count(inode, len); + err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt); if (err) goto out; } @@ -509,7 +491,6 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq) const struct netfs_request_ops ceph_netfs_ops = { .init_request = ceph_init_request, .free_request = ceph_netfs_free_request, - .begin_cache_operation = ceph_begin_cache_operation, .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, @@ -1586,7 +1567,7 @@ const struct address_space_operations ceph_aops = { .write_end = ceph_write_end, .dirty_folio = ceph_dirty_folio, .invalidate_folio = ceph_invalidate_folio, - .release_folio = ceph_release_folio, + .release_folio = netfs_release_folio, .direct_IO = noop_direct_IO, }; diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index dc502daac49a..20efac020394 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -43,38 +43,19 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to) } } -static inline void ceph_fscache_unpin_writeback(struct inode *inode, +static inline int ceph_fscache_unpin_writeback(struct inode *inode, struct writeback_control *wbc) { - fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode))); + return netfs_unpin_writeback(inode, wbc); } -static inline int ceph_fscache_dirty_folio(struct address_space *mapping, - struct folio *folio) -{ - struct ceph_inode_info *ci = ceph_inode(mapping->host); - - return 
fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci)); -} - -static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) -{ - struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode)); - - return fscache_begin_read_operation(&rreq->cache_resources, cookie); -} +#define ceph_fscache_dirty_folio netfs_dirty_folio static inline bool ceph_is_cache_enabled(struct inode *inode) { return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode))); } -static inline void ceph_fscache_note_page_release(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - - fscache_note_page_release(ceph_fscache_cookie(ci)); -} #else /* CONFIG_CEPH_FSCACHE */ static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) @@ -119,30 +100,18 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to) { } -static inline void ceph_fscache_unpin_writeback(struct inode *inode, - struct writeback_control *wbc) +static inline int ceph_fscache_unpin_writeback(struct inode *inode, + struct writeback_control *wbc) { + return 0; } -static inline int ceph_fscache_dirty_folio(struct address_space *mapping, - struct folio *folio) -{ - return filemap_dirty_folio(mapping, folio); -} +#define ceph_fscache_dirty_folio filemap_dirty_folio static inline bool ceph_is_cache_enabled(struct inode *inode) { return false; } - -static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) -{ - return -ENOBUFS; -} - -static inline void ceph_fscache_note_page_release(struct inode *inode) -{ -} #endif /* CONFIG_CEPH_FSCACHE */ #endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2c0b8dc3dd0d..7fb4aae97412 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1452,7 +1452,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, if (flushing & CEPH_CAP_XATTR_EXCL) { arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); arg->xattr_version = ci->i_xattrs.version; - arg->xattr_buf = 
ci->i_xattrs.blob; + arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob); } else { arg->xattr_buf = NULL; arg->old_xattr_buf = NULL; @@ -1553,6 +1553,7 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) encode_cap_msg(msg, arg); ceph_con_send(&arg->session->s_con, msg); ceph_buffer_put(arg->old_xattr_buf); + ceph_buffer_put(arg->xattr_buf); if (arg->wake) wake_up_all(&ci->i_cap_wq); } @@ -2155,6 +2156,30 @@ retry: ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); + /* completed revocation? going down and there are no caps? */ + if (revoking) { + if ((revoking & cap_used) == 0) { + doutc(cl, "completed revocation of %s\n", + ceph_cap_string(cap->implemented & ~cap->issued)); + goto ack; + } + + /* + * If the "i_wrbuffer_ref" was increased by mmap or generic + * cache write just before the ceph_check_caps() is called, + * the Fb capability revoking will fail this time. Then we + * must wait for the BDI's delayed work to flush the dirty + * pages and to release the "i_wrbuffer_ref", which will cost + * at most 5 seconds. That means the MDS needs to wait at + * most 5 seconds to finished the Fb capability's revocation. + * + * Let's queue a writeback for it. + */ + if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && + (revoking & CEPH_CAP_FILE_BUFFER)) + queue_writeback = true; + } + if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { /* request larger max_size from MDS? */ @@ -2182,30 +2207,6 @@ retry: } } - /* completed revocation? going down and there are no caps? */ - if (revoking) { - if ((revoking & cap_used) == 0) { - doutc(cl, "completed revocation of %s\n", - ceph_cap_string(cap->implemented & ~cap->issued)); - goto ack; - } - - /* - * If the "i_wrbuffer_ref" was increased by mmap or generic - * cache write just before the ceph_check_caps() is called, - * the Fb capability revoking will fail this time. 
Then we - * must wait for the BDI's delayed work to flush the dirty - * pages and to release the "i_wrbuffer_ref", which will cost - * at most 5 seconds. That means the MDS needs to wait at - * most 5 seconds to finished the Fb capability's revocation. - * - * Let's queue a writeback for it. - */ - if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && - (revoking & CEPH_CAP_FILE_BUFFER)) - queue_writeback = true; - } - /* want more caps from mds? */ if (want & ~cap->mds_wanted) { if (want & ~(cap->mds_wanted | cap->issued)) @@ -3215,7 +3216,6 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, enum put_cap_refs_mode { PUT_CAP_REFS_SYNC = 0, - PUT_CAP_REFS_NO_CHECK, PUT_CAP_REFS_ASYNC, }; @@ -3331,11 +3331,6 @@ void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had) __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC); } -void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) -{ - __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK); -} - /* * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap * context. Adjust per-snap dirty page accounting as appropriate. @@ -4777,7 +4772,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode) if (__ceph_caps_dirty(ci)) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; - __cap_delay_requeue_front(mdsc, ci); + + doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, + ceph_vinop(inode)); + spin_lock(&mdsc->cap_unlink_delay_lock); + ci->i_ceph_flags |= CEPH_I_FLUSH; + if (!list_empty(&ci->i_cap_delay_list)) + list_del_init(&ci->i_cap_delay_list); + list_add_tail(&ci->i_cap_delay_list, + &mdsc->cap_unlink_delay_list); + spin_unlock(&mdsc->cap_unlink_delay_lock); + + /* + * Fire the work immediately, because the MDS maybe + * waiting for caps release. 
+ */ + ceph_queue_cap_unlink_work(mdsc); } } spin_unlock(&ci->i_ceph_lock); @@ -4887,13 +4897,15 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, struct inode *dir, int mds, int drop, int unless) { - struct dentry *parent = NULL; struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); struct ceph_client *cl; int force = 0; int ret; + /* This shouldn't happen */ + BUG_ON(!dir); + /* * force an record for the directory caps if we have a dentry lease. * this is racy (can't take i_ceph_lock and d_lock together), but it @@ -4903,14 +4915,9 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, spin_lock(&dentry->d_lock); if (di->lease_session && di->lease_session->s_mds == mds) force = 1; - if (!dir) { - parent = dget(dentry->d_parent); - dir = d_inode(parent); - } spin_unlock(&dentry->d_lock); ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); - dput(parent); cl = ceph_inode_to_client(dir); spin_lock(&dentry->d_lock); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 678596684596..0e9f56eaba1e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1593,10 +1593,12 @@ struct ceph_lease_walk_control { unsigned long dir_lease_ttl; }; +static int __dir_lease_check(const struct dentry *, struct ceph_lease_walk_control *); +static int __dentry_lease_check(const struct dentry *); + static unsigned long __dentry_leases_walk(struct ceph_mds_client *mdsc, - struct ceph_lease_walk_control *lwc, - int (*check)(struct dentry*, void*)) + struct ceph_lease_walk_control *lwc) { struct ceph_dentry_info *di, *tmp; struct dentry *dentry, *last = NULL; @@ -1624,7 +1626,10 @@ __dentry_leases_walk(struct ceph_mds_client *mdsc, goto next; } - ret = check(dentry, lwc); + if (lwc->dir_lease) + ret = __dir_lease_check(dentry, lwc); + else + ret = __dentry_lease_check(dentry); if (ret & TOUCH) { /* move it into tail of dir lease list */ __dentry_dir_lease_touch(mdsc, di); @@ -1681,7 +1686,7 @@ next: return freed; } 
-static int __dentry_lease_check(struct dentry *dentry, void *arg) +static int __dentry_lease_check(const struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); int ret; @@ -1696,9 +1701,9 @@ static int __dentry_lease_check(struct dentry *dentry, void *arg) return DELETE; } -static int __dir_lease_check(struct dentry *dentry, void *arg) +static int __dir_lease_check(const struct dentry *dentry, + struct ceph_lease_walk_control *lwc) { - struct ceph_lease_walk_control *lwc = arg; struct ceph_dentry_info *di = ceph_dentry(dentry); int ret = __dir_lease_try_check(dentry); @@ -1737,7 +1742,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc) lwc.dir_lease = false; lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2; - freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check); + freed = __dentry_leases_walk(mdsc, &lwc); if (!lwc.nr_to_scan) /* more invalid leases */ return -EAGAIN; @@ -1747,7 +1752,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc) lwc.dir_lease = true; lwc.expire_dir_lease = freed < count; lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ; - freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check); + freed +=__dentry_leases_walk(mdsc, &lwc); if (!lwc.nr_to_scan) /* more to check */ return -EAGAIN; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 726af69d4d62..a79f163ae4ed 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -286,8 +286,6 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err); } - if (IS_ERR(inode)) - return ERR_CAST(inode); /* see comments in ceph_get_parent() */ return unlinked ? 
d_obtain_root(inode) : d_obtain_alias(inode); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d380d9dad0e0..abe8028d95bf 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1029,6 +1029,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, struct ceph_osd_req_op *op; u64 read_off = off; u64 read_len = len; + int extent_cnt; /* determine new offset/length if encrypted */ ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len); @@ -1068,7 +1069,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, op = &req->r_ops[0]; if (sparse) { - ret = ceph_alloc_sparse_ext_map(op); + extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); if (ret) { ceph_osdc_put_request(req); break; @@ -1465,6 +1467,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ssize_t len; struct ceph_osd_req_op *op; int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ; + int extent_cnt; if (write) size = min_t(u64, size, fsc->mount_options->wsize); @@ -1528,7 +1531,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); op = &req->r_ops[0]; if (sparse) { - ret = ceph_alloc_sparse_ext_map(op); + extent_cnt = __ceph_sparse_read_ext_count(inode, size); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); if (ret) { ceph_osdc_put_request(req); break; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0679240f06db..7b2e77517f23 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -78,6 +78,8 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, if (!inode) return ERR_PTR(-ENOMEM); + inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT; + if (!S_ISLNK(*mode)) { err = ceph_pre_init_acls(dir, mode, as_ctx); if (err < 0) @@ -574,7 +576,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) doutc(fsc->client, "%p\n", &ci->netfs.inode); /* Set parameters for the netfs library */ - 
netfs_inode_init(&ci->netfs, &ceph_netfs_ops); + netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); spin_lock_init(&ci->i_ceph_lock); @@ -694,7 +696,7 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_FSCACHE_WB) + if (inode->i_state & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); clear_inode(inode); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index e07ad29ff8b9..ebf4ac0055dd 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -33,7 +33,7 @@ void __init ceph_flock_init(void) static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) { - struct inode *inode = file_inode(dst->fl_file); + struct inode *inode = file_inode(dst->c.flc_file); atomic_inc(&ceph_inode(inode)->i_filelock_ref); dst->fl_u.ceph.inode = igrab(inode); } @@ -110,17 +110,18 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, else length = fl->fl_end - fl->fl_start + 1; - owner = secure_addr(fl->fl_owner); + owner = secure_addr(fl->c.flc_owner); doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, " "start: %llu, length: %llu, wait: %d, type: %d\n", - (int)lock_type, (int)operation, owner, (u64)fl->fl_pid, - fl->fl_start, length, wait, fl->fl_type); + (int)lock_type, (int)operation, owner, + (u64) fl->c.flc_pid, + fl->fl_start, length, wait, fl->c.flc_type); req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; req->r_args.filelock_change.owner = cpu_to_le64(owner); - req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); + req->r_args.filelock_change.pid = cpu_to_le64((u64) fl->c.flc_pid); req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; @@ -130,13 +131,13 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, err = 
ceph_mdsc_wait_request(mdsc, req, wait ? ceph_lock_wait_for_completion : NULL); if (!err && operation == CEPH_MDS_OP_GETFILELOCK) { - fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); + fl->c.flc_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) - fl->fl_type = F_RDLCK; + fl->c.flc_type = F_RDLCK; else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) - fl->fl_type = F_WRLCK; + fl->c.flc_type = F_WRLCK; else - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start); length = le64_to_cpu(req->r_reply_info.filelock_reply->start) + @@ -150,8 +151,8 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, ceph_mdsc_put_request(req); doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, " "length: %llu, wait: %d, type: %d, err code %d\n", - (int)lock_type, (int)operation, (u64)fl->fl_pid, - fl->fl_start, length, wait, fl->fl_type, err); + (int)lock_type, (int)operation, (u64) fl->c.flc_pid, + fl->fl_start, length, wait, fl->c.flc_type, err); return err; } @@ -227,10 +228,10 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, static int try_unlock_file(struct file *file, struct file_lock *fl) { int err; - unsigned int orig_flags = fl->fl_flags; - fl->fl_flags |= FL_EXISTS; + unsigned int orig_flags = fl->c.flc_flags; + fl->c.flc_flags |= FL_EXISTS; err = locks_lock_file_wait(file, fl); - fl->fl_flags = orig_flags; + fl->c.flc_flags = orig_flags; if (err == -ENOENT) { if (!(orig_flags & FL_EXISTS)) err = 0; @@ -253,13 +254,13 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) u8 wait = 0; u8 lock_cmd; - if (!(fl->fl_flags & FL_POSIX)) + if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; if (ceph_inode_is_shutdown(inode)) return -ESTALE; - doutc(cl, "fl_owner: %p\n", fl->fl_owner); + doutc(cl, "fl_owner: %p\n", fl->c.flc_owner); /* set wait bit as 
appropriate, then make command as Ceph expects it*/ if (IS_GETLK(cmd)) @@ -273,19 +274,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) } spin_unlock(&ci->i_ceph_lock); if (err < 0) { - if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) + if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) posix_lock_file(file, fl, NULL); return err; } - if (F_RDLCK == fl->fl_type) + if (lock_is_read(fl)) lock_cmd = CEPH_LOCK_SHARED; - else if (F_WRLCK == fl->fl_type) + else if (lock_is_write(fl)) lock_cmd = CEPH_LOCK_EXCL; else lock_cmd = CEPH_LOCK_UNLOCK; - if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) { + if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) { err = try_unlock_file(file, fl); if (err <= 0) return err; @@ -293,7 +294,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); if (!err) { - if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) { + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->c.flc_type) { doutc(cl, "locking locally\n"); err = posix_lock_file(file, fl, NULL); if (err) { @@ -319,13 +320,13 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) u8 wait = 0; u8 lock_cmd; - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; if (ceph_inode_is_shutdown(inode)) return -ESTALE; - doutc(cl, "fl_file: %p\n", fl->fl_file); + doutc(cl, "fl_file: %p\n", fl->c.flc_file); spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { @@ -333,7 +334,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) } spin_unlock(&ci->i_ceph_lock); if (err < 0) { - if (F_UNLCK == fl->fl_type) + if (lock_is_unlock(fl)) locks_lock_file_wait(file, fl); return err; } @@ -341,14 +342,14 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) if (IS_SETLKW(cmd)) wait = 1; - if (F_RDLCK == fl->fl_type) + if (lock_is_read(fl)) lock_cmd = 
CEPH_LOCK_SHARED; - else if (F_WRLCK == fl->fl_type) + else if (lock_is_write(fl)) lock_cmd = CEPH_LOCK_EXCL; else lock_cmd = CEPH_LOCK_UNLOCK; - if (F_UNLCK == fl->fl_type) { + if (lock_is_unlock(fl)) { err = try_unlock_file(file, fl); if (err <= 0) return err; @@ -356,7 +357,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, inode, lock_cmd, wait, fl); - if (!err && F_UNLCK != fl->fl_type) { + if (!err && F_UNLCK != fl->c.flc_type) { err = locks_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, @@ -385,9 +386,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) ctx = locks_inode_context(inode); if (ctx) { spin_lock(&ctx->flc_lock); - list_for_each_entry(lock, &ctx->flc_posix, fl_list) + for_each_file_lock(lock, &ctx->flc_posix) ++(*fcntl_count); - list_for_each_entry(lock, &ctx->flc_flock, fl_list) + for_each_file_lock(lock, &ctx->flc_flock) ++(*flock_count); spin_unlock(&ctx->flc_lock); } @@ -408,10 +409,10 @@ static int lock_to_ceph_filelock(struct inode *inode, cephlock->start = cpu_to_le64(lock->fl_start); cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); cephlock->client = cpu_to_le64(0); - cephlock->pid = cpu_to_le64((u64)lock->fl_pid); - cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); + cephlock->pid = cpu_to_le64((u64) lock->c.flc_pid); + cephlock->owner = cpu_to_le64(secure_addr(lock->c.flc_owner)); - switch (lock->fl_type) { + switch (lock->c.flc_type) { case F_RDLCK: cephlock->type = CEPH_LOCK_SHARED; break; @@ -422,7 +423,8 @@ static int lock_to_ceph_filelock(struct inode *inode, cephlock->type = CEPH_LOCK_UNLOCK; break; default: - doutc(cl, "Have unknown lock type %d\n", lock->fl_type); + doutc(cl, "Have unknown lock type %d\n", + lock->c.flc_type); err = -EINVAL; } @@ -453,7 +455,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, return 0; spin_lock(&ctx->flc_lock); - 
list_for_each_entry(lock, &ctx->flc_posix, fl_list) { + for_each_file_lock(lock, &ctx->flc_posix) { ++seen_fcntl; if (seen_fcntl > num_fcntl_locks) { err = -ENOSPC; @@ -464,7 +466,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, goto fail; ++l; } - list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + for_each_file_lock(lock, &ctx->flc_flock) { ++seen_flock; if (seen_flock > num_flock_locks) { err = -ENOSPC; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 02ebfabfc8ee..3ab9c268a8bb 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1089,7 +1089,7 @@ void ceph_mdsc_release_request(struct kref *kref) struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); - ceph_mdsc_release_dir_caps_no_check(req); + ceph_mdsc_release_dir_caps_async(req); destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); @@ -1534,7 +1534,8 @@ static int encode_metric_spec(void **p, void *end) * session message, specialization for CEPH_SESSION_REQUEST_OPEN * to include additional client metadata fields. 
*/ -static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq) +static struct ceph_msg * +create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq) { struct ceph_msg *msg; struct ceph_mds_session_head *h; @@ -1578,6 +1579,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 size = METRIC_BYTES(count); extra_bytes += 2 + 4 + 4 + size; + /* flags, mds auth caps and oldest_client_tid */ + extra_bytes += 4 + 4 + 8; + /* Allocate the message */ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, GFP_NOFS, false); @@ -1589,16 +1593,16 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 end = p + msg->front.iov_len; h = p; - h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN); + h->op = cpu_to_le32(op); h->seq = cpu_to_le64(seq); /* * Serialize client metadata into waiting buffer space, using * the format that userspace expects for map<string, string> * - * ClientSession messages with metadata are v4 + * ClientSession messages with metadata are v7 */ - msg->hdr.version = cpu_to_le16(4); + msg->hdr.version = cpu_to_le16(7); msg->hdr.compat_version = cpu_to_le16(1); /* The write pointer, following the session_head structure */ @@ -1634,6 +1638,15 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 return ERR_PTR(ret); } + /* version == 5, flags */ + ceph_encode_32(&p, 0); + + /* version == 6, mds auth caps */ + ceph_encode_32(&p, 0); + + /* version == 7, oldest_client_tid */ + ceph_encode_64(&p, mdsc->oldest_tid); + msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -1663,7 +1676,8 @@ static int __open_session(struct ceph_mds_client *mdsc, session->s_renew_requested = jiffies; /* send connect message */ - msg = create_session_open_msg(mdsc, session->s_seq); + msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_OPEN, + session->s_seq); if (IS_ERR(msg)) return 
PTR_ERR(msg); ceph_con_send(&session->s_con, msg); @@ -2028,10 +2042,10 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, doutc(cl, "to mds%d (%s)\n", session->s_mds, ceph_mds_state_name(state)); - msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, + msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_RENEWCAPS, ++session->s_renew_seq); - if (!msg) - return -ENOMEM; + if (IS_ERR(msg)) + return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); return 0; } @@ -2470,6 +2484,50 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) } } +void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc) +{ + struct ceph_client *cl = mdsc->fsc->client; + if (mdsc->stopping) + return; + + if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_unlink_work)) { + doutc(cl, "caps unlink work queued\n"); + } else { + doutc(cl, "failed to queue caps unlink work\n"); + } +} + +static void ceph_cap_unlink_work(struct work_struct *work) +{ + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, cap_unlink_work); + struct ceph_client *cl = mdsc->fsc->client; + + doutc(cl, "begin\n"); + spin_lock(&mdsc->cap_unlink_delay_lock); + while (!list_empty(&mdsc->cap_unlink_delay_list)) { + struct ceph_inode_info *ci; + struct inode *inode; + + ci = list_first_entry(&mdsc->cap_unlink_delay_list, + struct ceph_inode_info, + i_cap_delay_list); + list_del_init(&ci->i_cap_delay_list); + + inode = igrab(&ci->netfs.inode); + if (inode) { + spin_unlock(&mdsc->cap_unlink_delay_lock); + doutc(cl, "on %p %llx.%llx\n", inode, + ceph_vinop(inode)); + ceph_check_caps(ci, CHECK_CAPS_FLUSH); + iput(inode); + spin_lock(&mdsc->cap_unlink_delay_lock); + } + } + spin_unlock(&mdsc->cap_unlink_delay_lock); + doutc(cl, "done\n"); +} + /* * requests */ @@ -4128,12 +4186,12 @@ static void handle_session(struct ceph_mds_session *session, pr_info_client(cl, "mds%d reconnect success\n", session->s_mds); + session->s_features = features; if (session->s_state == 
CEPH_MDS_SESSION_OPEN) { pr_notice_client(cl, "mds%d is already opened\n", session->s_mds); } else { session->s_state = CEPH_MDS_SESSION_OPEN; - session->s_features = features; renewed_caps(mdsc, session, 0); if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) @@ -4247,7 +4305,7 @@ void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req) } } -void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req) +void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req) { struct ceph_client *cl = req->r_mdsc->fsc->client; int dcaps; @@ -4255,8 +4313,7 @@ void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req) dcaps = xchg(&req->r_dir_caps, 0); if (dcaps) { doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); - ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent), - dcaps); + ceph_put_cap_refs_async(ceph_inode(req->r_parent), dcaps); } } @@ -4292,7 +4349,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, if (req->r_session->s_mds != session->s_mds) continue; - ceph_mdsc_release_dir_caps_no_check(req); + ceph_mdsc_release_dir_caps_async(req); __send_request(session, req, true); } @@ -5346,6 +5403,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_LIST_HEAD(&mdsc->cap_delay_list); INIT_LIST_HEAD(&mdsc->cap_wait_list); spin_lock_init(&mdsc->cap_delay_lock); + INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); + spin_lock_init(&mdsc->cap_unlink_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); mdsc->last_cap_flush_tid = 1; @@ -5354,6 +5413,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); + INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work); err = ceph_metric_init(&mdsc->metric); if (err) goto err_mdsmap; @@ -5627,6 +5687,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) 
ceph_cleanup_global_and_empty_realms(mdsc); cancel_work_sync(&mdsc->cap_reclaim_work); + cancel_work_sync(&mdsc->cap_unlink_work); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ doutc(cl, "done\n"); @@ -5870,7 +5931,8 @@ static void mds_peer_reset(struct ceph_connection *con) pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n", s->s_mds); - if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO) + if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO && + ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT) send_mds_reconnect(mdsc, s); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 2e6ddaa13d72..03f8ff00874f 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -462,6 +462,8 @@ struct ceph_mds_client { unsigned long last_renew_caps; /* last time we renewed our caps */ struct list_head cap_delay_list; /* caps with delayed release */ spinlock_t cap_delay_lock; /* protects cap_delay_list */ + struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */ + spinlock_t cap_unlink_delay_lock; /* protects cap_unlink_delay_list */ struct list_head snap_flush_list; /* cap_snaps ready to flush */ spinlock_t snap_flush_lock; @@ -475,6 +477,8 @@ struct ceph_mds_client { struct work_struct cap_reclaim_work; atomic_t cap_reclaim_pending; + struct work_struct cap_unlink_work; + /* * Cap reservations * @@ -552,7 +556,7 @@ extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req); -extern void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req); +extern void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req); static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) { kref_get(&req->r_kref); @@ -574,6 +578,7 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); 
extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); +extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc); extern int ceph_iterate_session_caps(struct ceph_mds_session *session, int (*cb)(struct inode *, int mds, void *), void *arg); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index fae97c25ce58..8109aba66e02 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -380,10 +380,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p, ceph_decode_skip_8(p, end, bad_ext); /* required_client_features */ ceph_decode_skip_set(p, end, 64, bad_ext); + /* bal_rank_mask */ + ceph_decode_skip_string(p, end, bad_ext); + } + if (mdsmap_ev >= 18) { ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext); - } else { - /* This forces the usage of the (sync) SETXATTR Op */ - m->m_max_xattr_size = 0; } bad_ext: doutc(cl, "m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h index 89f1931f1ba6..1f2171dd01bf 100644 --- a/fs/ceph/mdsmap.h +++ b/fs/ceph/mdsmap.h @@ -27,7 +27,11 @@ struct ceph_mdsmap { u32 m_session_timeout; /* seconds */ u32 m_session_autoclose; /* seconds */ u64 m_max_file_size; - u64 m_max_xattr_size; /* maximum size for xattrs blob */ + /* + * maximum size for xattrs blob. + * Zeroed by default to force the usage of the (sync) SETXATTR Op. 
+ */ + u64 m_max_xattr_size; u32 m_max_mds; /* expected up:active mds number */ u32 m_num_active_mds; /* actual up:active mds number */ u32 possible_max_rank; /* possible max rank index */ diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 9d36c3532de1..06ee397e0c3a 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -197,10 +197,10 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) } /* - * This function walks through the snaprealm for an inode and returns the - * ceph_snap_realm for the first snaprealm that has quotas set (max_files, + * This function walks through the snaprealm for an inode and set the + * realmp with the first snaprealm that has quotas set (max_files, * max_bytes, or any, depending on the 'which_quota' argument). If the root is - * reached, return the root ceph_snap_realm instead. + * reached, set the realmp with the root ceph_snap_realm instead. * * Note that the caller is responsible for calling ceph_put_snap_realm() on the * returned realm. @@ -211,10 +211,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) * this function will return -EAGAIN; otherwise, the snaprealms walk-through * will be restarted. 
*/ -static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, - struct inode *inode, - enum quota_get_realm which_quota, - bool retry) +static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode, + enum quota_get_realm which_quota, + struct ceph_snap_realm **realmp, bool retry) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci = NULL; @@ -222,8 +221,10 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, struct inode *in; bool has_quota; + if (realmp) + *realmp = NULL; if (ceph_snap(inode) != CEPH_NOSNAP) - return NULL; + return 0; restart: realm = ceph_inode(inode)->i_snap_realm; @@ -250,7 +251,7 @@ restart: break; ceph_put_snap_realm(mdsc, realm); if (!retry) - return ERR_PTR(-EAGAIN); + return -EAGAIN; goto restart; } @@ -259,8 +260,11 @@ restart: iput(in); next = realm->parent; - if (has_quota || !next) - return realm; + if (has_quota || !next) { + if (realmp) + *realmp = realm; + return 0; + } ceph_get_snap_realm(mdsc, next); ceph_put_snap_realm(mdsc, realm); @@ -269,7 +273,7 @@ restart: if (realm) ceph_put_snap_realm(mdsc, realm); - return NULL; + return 0; } bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) @@ -277,6 +281,7 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb); struct ceph_snap_realm *old_realm, *new_realm; bool is_same; + int ret; restart: /* @@ -286,9 +291,9 @@ restart: * dropped and we can then restart the whole operation. 
*/ down_read(&mdsc->snap_rwsem); - old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true); - new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false); - if (PTR_ERR(new_realm) == -EAGAIN) { + get_quota_realm(mdsc, old, QUOTA_GET_ANY, &old_realm, true); + ret = get_quota_realm(mdsc, new, QUOTA_GET_ANY, &new_realm, false); + if (ret == -EAGAIN) { up_read(&mdsc->snap_rwsem); if (old_realm) ceph_put_snap_realm(mdsc, old_realm); @@ -492,8 +497,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) bool is_updated = false; down_read(&mdsc->snap_rwsem); - realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), - QUOTA_GET_MAX_BYTES, true); + get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES, + &realm, true); up_read(&mdsc->snap_rwsem); if (!realm) return false; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index fe0f64a0acb2..b63b4cd9b5b6 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -3,6 +3,7 @@ #define _FS_CEPH_SUPER_H #include <linux/ceph/ceph_debug.h> +#include <linux/ceph/osd_client.h> #include <asm/unaligned.h> #include <linux/backing-dev.h> @@ -1254,8 +1255,6 @@ extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had); -extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, - int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); extern void __ceph_remove_capsnap(struct inode *inode, @@ -1407,6 +1406,19 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci, ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota); } +static inline int __ceph_sparse_read_ext_count(struct inode *inode, u64 len) +{ + int cnt = 0; + + if (IS_ENCRYPTED(inode)) { + cnt = len >> CEPH_FSCRYPT_BLOCK_SHIFT; + if 
(cnt > CEPH_SPARSE_EXT_ARRAY_INITIAL) + cnt = 0; + } + + return cnt; +} + extern void ceph_handle_quota(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 0c7c2528791e..a50356c541f6 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -24,6 +24,8 @@ #include <linux/pid_namespace.h> #include <linux/uaccess.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/vmalloc.h> #include <linux/coda.h> @@ -87,10 +89,10 @@ void coda_destroy_inodecache(void) kmem_cache_destroy(coda_inode_cachep); } -static int coda_remount(struct super_block *sb, int *flags, char *data) +static int coda_reconfigure(struct fs_context *fc) { - sync_filesystem(sb); - *flags |= SB_NOATIME; + sync_filesystem(fc->root->d_sb); + fc->sb_flags |= SB_NOATIME; return 0; } @@ -102,78 +104,102 @@ static const struct super_operations coda_super_operations = .evict_inode = coda_evict_inode, .put_super = coda_put_super, .statfs = coda_statfs, - .remount_fs = coda_remount, }; -static int get_device_index(struct coda_mount_data *data) +struct coda_fs_context { + int idx; +}; + +enum { + Opt_fd, +}; + +static const struct fs_parameter_spec coda_param_specs[] = { + fsparam_fd ("fd", Opt_fd), + {} +}; + +static int coda_parse_fd(struct fs_context *fc, int fd) { + struct coda_fs_context *ctx = fc->fs_private; struct fd f; struct inode *inode; int idx; - if (data == NULL) { - pr_warn("%s: Bad mount data\n", __func__); - return -1; - } - - if (data->version != CODA_MOUNT_VERSION) { - pr_warn("%s: Bad mount version\n", __func__); - return -1; - } - - f = fdget(data->fd); + f = fdget(fd); if (!f.file) - goto Ebadf; + return -EBADF; inode = file_inode(f.file); if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) { fdput(f); - goto Ebadf; + return invalf(fc, "code: Not coda psdev"); } idx = iminor(inode); fdput(f); - if (idx < 0 || idx >= MAX_CODADEVS) { - pr_warn("%s: Bad 
minor number\n", __func__); - return -1; + if (idx < 0 || idx >= MAX_CODADEVS) + return invalf(fc, "coda: Bad minor number"); + ctx->idx = idx; + return 0; +} + +static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, coda_param_specs, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_fd: + return coda_parse_fd(fc, result.uint_32); } - return idx; -Ebadf: - pr_warn("%s: Bad file\n", __func__); - return -1; + return 0; +} + +/* + * Parse coda's binary mount data form. We ignore any errors and go with index + * 0 if we get one for backward compatibility. + */ +static int coda_parse_monolithic(struct fs_context *fc, void *_data) +{ + struct coda_mount_data *data = _data; + + if (!data) + return invalf(fc, "coda: Bad mount data"); + + if (data->version != CODA_MOUNT_VERSION) + return invalf(fc, "coda: Bad mount version"); + + coda_parse_fd(fc, data->fd); + return 0; } -static int coda_fill_super(struct super_block *sb, void *data, int silent) +static int coda_fill_super(struct super_block *sb, struct fs_context *fc) { + struct coda_fs_context *ctx = fc->fs_private; struct inode *root = NULL; struct venus_comm *vc; struct CodaFid fid; int error; - int idx; - - if (task_active_pid_ns(current) != &init_pid_ns) - return -EINVAL; - - idx = get_device_index((struct coda_mount_data *) data); - /* Ignore errors in data, for backward compatibility */ - if(idx == -1) - idx = 0; - - pr_info("%s: device index: %i\n", __func__, idx); + infof(fc, "coda: device index: %i\n", ctx->idx); - vc = &coda_comms[idx]; + vc = &coda_comms[ctx->idx]; mutex_lock(&vc->vc_mutex); if (!vc->vc_inuse) { - pr_warn("%s: No pseudo device\n", __func__); + errorf(fc, "coda: No pseudo device"); error = -EINVAL; goto unlock_out; } if (vc->vc_sb) { - pr_warn("%s: Device already mounted\n", __func__); + errorf(fc, "coda: Device already mounted"); error = -EBUSY; goto unlock_out; } @@ -313,18 
+339,45 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -/* init_coda: used by filesystems.c to register coda */ +static int coda_get_tree(struct fs_context *fc) +{ + if (task_active_pid_ns(current) != &init_pid_ns) + return -EINVAL; -static struct dentry *coda_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) + return get_tree_nodev(fc, coda_fill_super); +} + +static void coda_free_fc(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, coda_fill_super); + kfree(fc->fs_private); +} + +static const struct fs_context_operations coda_context_ops = { + .free = coda_free_fc, + .parse_param = coda_parse_param, + .parse_monolithic = coda_parse_monolithic, + .get_tree = coda_get_tree, + .reconfigure = coda_reconfigure, +}; + +static int coda_init_fs_context(struct fs_context *fc) +{ + struct coda_fs_context *ctx; + + ctx = kzalloc(sizeof(struct coda_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + fc->fs_private = ctx; + fc->ops = &coda_context_ops; + return 0; } struct file_system_type coda_fs_type = { .owner = THIS_MODULE, .name = "coda", - .mount = coda_mount, + .init_fs_context = coda_init_fs_context, + .parameters = coda_param_specs, .kill_sb = kill_anon_super, .fs_flags = FS_BINARY_MOUNTDATA, }; diff --git a/fs/coredump.c b/fs/coredump.c index f258c17c1841..be6403b4b14b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -872,6 +872,9 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) loff_t pos; ssize_t n; + if (!page) + return 0; + if (cprm->to_skip) { if (!__dump_skip(cprm, cprm->to_skip)) return 0; @@ -884,7 +887,6 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) pos = file->f_pos; bvec_set_page(&bvec, page, PAGE_SIZE, 0); iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); - iov_iter_set_copy_mc(&iter); n = __kernel_write_iter(cprm->file, &iter, &pos); if (n != PAGE_SIZE) return 0; @@ -895,10 +897,44 @@ 
static int dump_emit_page(struct coredump_params *cprm, struct page *page) return 1; } +/* + * If we might get machine checks from kernel accesses during the + * core dump, let's get those errors early rather than during the + * IO. This is not performance-critical enough to warrant having + * all the machine check logic in the iovec paths. + */ +#ifdef copy_mc_to_kernel + +#define dump_page_alloc() alloc_page(GFP_KERNEL) +#define dump_page_free(x) __free_page(x) +static struct page *dump_page_copy(struct page *src, struct page *dst) +{ + void *buf = kmap_local_page(src); + size_t left = copy_mc_to_kernel(page_address(dst), buf, PAGE_SIZE); + kunmap_local(buf); + return left ? NULL : dst; +} + +#else + +/* We just want to return non-NULL; it's never used. */ +#define dump_page_alloc() ERR_PTR(-EINVAL) +#define dump_page_free(x) ((void)(x)) +static inline struct page *dump_page_copy(struct page *src, struct page *dst) +{ + return src; +} +#endif + int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len) { unsigned long addr; + struct page *dump_page; + + dump_page = dump_page_alloc(); + if (!dump_page) + return 0; for (addr = start; addr < start + len; addr += PAGE_SIZE) { struct page *page; @@ -912,14 +948,17 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, */ page = get_dump_page(addr); if (page) { - int stop = !dump_emit_page(cprm, page); + int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page)); put_page(page); - if (stop) + if (stop) { + dump_page_free(dump_page); return 0; + } } else { dump_skip(cprm, PAGE_SIZE); } } + dump_page_free(dump_page); return 1; } #endif diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 60dbfa0f8805..39e75131fd5a 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb) sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { sync_blockdev(sb->s_bdev); - 
bdev_release(sb->s_bdev_handle); + fput(sb->s_bdev_file); } kfree(sbi); } diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 7b3fc189593a..0ad52fbe51c9 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -74,13 +74,7 @@ struct fscrypt_nokey_name { static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { - if (str->len == 1 && str->name[0] == '.') - return true; - - if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') - return true; - - return false; + return is_dot_dotdot(str->name, str->len); } /** diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 52504dd478d3..104771c3d3f6 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -102,11 +102,8 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, if (err && err != -ENOENT) return err; - if (fname->is_nokey_name) { - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_NOKEY_NAME; - spin_unlock(&dentry->d_lock); - } + fscrypt_prepare_dentry(dentry, fname->is_nokey_name); + return err; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); @@ -131,12 +128,10 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { int err = fscrypt_get_encryption_info(dir, true); + bool is_nokey_name = (!err && !fscrypt_has_encryption_key(dir)); + + fscrypt_prepare_dentry(dentry, is_nokey_name); - if (!err && !fscrypt_has_encryption_key(dir)) { - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_NOKEY_NAME; - spin_unlock(&dentry->d_lock); - } return err; } EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial); diff --git a/fs/dcache.c b/fs/dcache.c index b813528fb147..71a8e943a0fa 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3061,7 +3061,10 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) if (d_unhashed(dentry) || !dentry->d_inode) return D_WALK_SKIP; - dentry->d_lockref.count--; + if (!(dentry->d_flags & DCACHE_GENOCIDE)) { + dentry->d_flags |= DCACHE_GENOCIDE; + 
dentry->d_lockref.count--; + } } return D_WALK_CONTINUE; } @@ -3136,7 +3139,7 @@ static void __init dcache_init(void) * of the dcache. */ dentry_cache = KMEM_CACHE_USERCOPY(dentry, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, d_iname); /* Hash may have been set up in dcache_init_early */ diff --git a/fs/direct-io.c b/fs/direct-io.c index 60456263a338..62c97ff9e852 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -410,6 +410,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_end_io = dio_bio_end_io; if (dio->is_pinned) bio_set_flag(bio, BIO_PAGE_PINNED); + bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint; + sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index d814c5121367..9ca83ef70ed1 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -138,14 +138,14 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, } op->info.optype = DLM_PLOCK_OP_LOCK; - op->info.pid = fl->fl_pid; - op->info.ex = (fl->fl_type == F_WRLCK); - op->info.wait = !!(fl->fl_flags & FL_SLEEP); + op->info.pid = fl->c.flc_pid; + op->info.ex = lock_is_write(fl); + op->info.wait = !!(fl->c.flc_flags & FL_SLEEP); op->info.fsid = ls->ls_global_id; op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - op->info.owner = (__u64)(long)fl->fl_owner; + op->info.owner = (__u64)(long) fl->c.flc_owner; /* async handling */ if (fl->fl_lmops && fl->fl_lmops->lm_grant) { op_data = kzalloc(sizeof(*op_data), GFP_NOFS); @@ -258,7 +258,7 @@ static int dlm_plock_callback(struct plock_op *op) } /* got fs lock; bookkeep locally as well: */ - flc->fl_flags &= ~FL_SLEEP; + flc->c.flc_flags &= ~FL_SLEEP; if (posix_lock_file(file, flc, NULL)) { /* * This can only happen in the case of kmalloc() failure. 
@@ -291,7 +291,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, struct dlm_ls *ls; struct plock_op *op; int rv; - unsigned char fl_flags = fl->fl_flags; + unsigned char saved_flags = fl->c.flc_flags; ls = dlm_find_lockspace_local(lockspace); if (!ls) @@ -304,7 +304,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, } /* cause the vfs unlock to return ENOENT if lock is not found */ - fl->fl_flags |= FL_EXISTS; + fl->c.flc_flags |= FL_EXISTS; rv = locks_lock_file_wait(file, fl); if (rv == -ENOENT) { @@ -317,14 +317,14 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, } op->info.optype = DLM_PLOCK_OP_UNLOCK; - op->info.pid = fl->fl_pid; + op->info.pid = fl->c.flc_pid; op->info.fsid = ls->ls_global_id; op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - op->info.owner = (__u64)(long)fl->fl_owner; + op->info.owner = (__u64)(long) fl->c.flc_owner; - if (fl->fl_flags & FL_CLOSE) { + if (fl->c.flc_flags & FL_CLOSE) { op->info.flags |= DLM_PLOCK_FL_CLOSE; send_op(op); rv = 0; @@ -345,7 +345,7 @@ out_free: dlm_release_plock_op(op); out: dlm_put_lockspace(ls); - fl->fl_flags = fl_flags; + fl->c.flc_flags = saved_flags; return rv; } EXPORT_SYMBOL_GPL(dlm_posix_unlock); @@ -375,14 +375,14 @@ int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file, return -EINVAL; memset(&info, 0, sizeof(info)); - info.pid = fl->fl_pid; - info.ex = (fl->fl_type == F_WRLCK); + info.pid = fl->c.flc_pid; + info.ex = lock_is_write(fl); info.fsid = ls->ls_global_id; dlm_put_lockspace(ls); info.number = number; info.start = fl->fl_start; info.end = fl->fl_end; - info.owner = (__u64)(long)fl->fl_owner; + info.owner = (__u64)(long) fl->c.flc_owner; rv = do_lock_cancel(&info); switch (rv) { @@ -437,13 +437,13 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, } op->info.optype = DLM_PLOCK_OP_GET; - op->info.pid = 
fl->fl_pid; - op->info.ex = (fl->fl_type == F_WRLCK); + op->info.pid = fl->c.flc_pid; + op->info.ex = lock_is_write(fl); op->info.fsid = ls->ls_global_id; op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - op->info.owner = (__u64)(long)fl->fl_owner; + op->info.owner = (__u64)(long) fl->c.flc_owner; send_op(op); wait_event(recv_wq, (op->done != 0)); @@ -455,16 +455,16 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, rv = op->info.rv; - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; if (rv == -ENOENT) rv = 0; else if (rv > 0) { locks_init_lock(fl); - fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; - fl->fl_flags = FL_POSIX; - fl->fl_pid = op->info.pid; + fl->c.flc_type = (op->info.ex) ? F_WRLCK : F_RDLCK; + fl->c.flc_flags = FL_POSIX; + fl->c.flc_pid = op->info.pid; if (op->info.nodeid != dlm_our_nodeid()) - fl->fl_pid = -fl->fl_pid; + fl->c.flc_pid = -fl->c.flc_pid; fl->fl_start = op->info.start; fl->fl_end = op->info.end; rv = 0; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 03bd55069d86..2fe0f3af1a08 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1949,16 +1949,6 @@ out: return rc; } -static bool is_dot_dotdot(const char *name, size_t name_size) -{ - if (name_size == 1 && name[0] == '.') - return true; - else if (name_size == 2 && name[0] == '.' 
&& name[1] == '.') - return true; - - return false; -} - /** * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext * @plaintext_name: The plaintext name diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index 169252e6dc46..f7206158ee81 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -38,7 +38,7 @@ struct efivar_entry { int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, struct list_head *), - void *data, bool duplicates, struct list_head *head); + void *data, struct list_head *head); int efivar_entry_add(struct efivar_entry *entry, struct list_head *head); void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 6038dd39367a..bb14462f6d99 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -343,12 +343,7 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - err = efivar_init(efivarfs_callback, (void *)sb, true, - &sfi->efivarfs_list); - if (err) - efivar_entry_iter(efivarfs_destroy, &sfi->efivarfs_list, NULL); - - return err; + return efivar_init(efivarfs_callback, sb, &sfi->efivarfs_list); } static int efivarfs_get_tree(struct fs_context *fc) diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index 114ff0fd4e55..4d722af1014f 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -361,7 +361,6 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, * efivar_init - build the initial list of EFI variables * @func: callback function to invoke for every variable * @data: function-specific data to pass to @func - * @duplicates: error if we encounter duplicates on @head? * @head: initialised head of variable list * * Get every EFI variable from the firmware and invoke @func. 
@func @@ -371,9 +370,9 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, */ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, struct list_head *), - void *data, bool duplicates, struct list_head *head) + void *data, struct list_head *head) { - unsigned long variable_name_size = 1024; + unsigned long variable_name_size = 512; efi_char16_t *variable_name; efi_status_t status; efi_guid_t vendor_guid; @@ -390,12 +389,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, goto free; /* - * Per EFI spec, the maximum storage allocated for both - * the variable name and variable data is 1024 bytes. + * A small set of old UEFI implementations reject sizes + * above a certain threshold, the lowest seen in the wild + * is 512. */ do { - variable_name_size = 1024; + variable_name_size = 512; status = efivar_get_next_variable(&variable_name_size, variable_name, @@ -413,8 +413,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, * we'll ever see a different variable name, * and may end up looping here forever. 
*/ - if (duplicates && - variable_is_present(variable_name, &vendor_guid, + if (variable_is_present(variable_name, &vendor_guid, head)) { dup_variable_bug(variable_name, &vendor_guid, variable_name_size); @@ -432,9 +431,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, break; case EFI_NOT_FOUND: break; + case EFI_BUFFER_TOO_SMALL: + pr_warn("efivars: Variable name size exceeds maximum (%lu > 512)\n", + variable_name_size); + status = EFI_NOT_FOUND; + break; default: - printk(KERN_WARNING "efivars: get_next_variable: status=%lx\n", - status); + pr_warn("efivars: get_next_variable: status=%lx\n", status); status = EFI_NOT_FOUND; break; } diff --git a/fs/efs/super.c b/fs/efs/super.c index f17fdac76b2e..e4421c10caeb 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -14,19 +14,14 @@ #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/blkdev.h> - +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include "efs.h" #include <linux/efs_vh.h> #include <linux/efs_fs_sb.h> static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); -static int efs_fill_super(struct super_block *s, void *d, int silent); - -static struct dentry *efs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); -} +static int efs_init_fs_context(struct fs_context *fc); static void efs_kill_sb(struct super_block *s) { @@ -35,15 +30,6 @@ static void efs_kill_sb(struct super_block *s) kfree(sbi); } -static struct file_system_type efs_fs_type = { - .owner = THIS_MODULE, - .name = "efs", - .mount = efs_mount, - .kill_sb = efs_kill_sb, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("efs"); - static struct pt_types sgi_pt_types[] = { {0x00, "SGI vh"}, {0x01, "SGI trkrepl"}, @@ -63,6 +49,27 @@ static struct pt_types sgi_pt_types[] = { {0, NULL} }; +enum { + Opt_explicit_open, +}; + +static const struct fs_parameter_spec 
efs_param_spec[] = { + fsparam_flag ("explicit-open", Opt_explicit_open), + {} +}; + +/* + * File system definition and registration. + */ +static struct file_system_type efs_fs_type = { + .owner = THIS_MODULE, + .name = "efs", + .kill_sb = efs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = efs_init_fs_context, + .parameters = efs_param_spec, +}; +MODULE_ALIAS_FS("efs"); static struct kmem_cache * efs_inode_cachep; @@ -91,8 +98,8 @@ static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", sizeof(struct efs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT, init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + init_once); if (efs_inode_cachep == NULL) return -ENOMEM; return 0; @@ -108,18 +115,10 @@ static void destroy_inodecache(void) kmem_cache_destroy(efs_inode_cachep); } -static int efs_remount(struct super_block *sb, int *flags, char *data) -{ - sync_filesystem(sb); - *flags |= SB_RDONLY; - return 0; -} - static const struct super_operations efs_superblock_operations = { .alloc_inode = efs_alloc_inode, .free_inode = efs_free_inode, .statfs = efs_statfs, - .remount_fs = efs_remount, }; static const struct export_operations efs_export_ops = { @@ -249,26 +248,26 @@ static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) { return 0; } -static int efs_fill_super(struct super_block *s, void *d, int silent) +static int efs_fill_super(struct super_block *s, struct fs_context *fc) { struct efs_sb_info *sb; struct buffer_head *bh; struct inode *root; - sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); + sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); if (!sb) return -ENOMEM; s->s_fs_info = sb; s->s_time_min = 0; s->s_time_max = U32_MAX; - + s->s_magic = EFS_SUPER_MAGIC; if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { pr_err("device does not support %d byte blocks\n", EFS_BLOCKSIZE); return -EINVAL; } - + /* read the vh (volume header) block */ bh = sb_bread(s, 0); @@ 
-294,7 +293,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) pr_err("cannot read superblock\n"); return -EIO; } - + if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { #ifdef DEBUG pr_warn("invalid superblock at block %u\n", @@ -328,6 +327,61 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) return 0; } +static void efs_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static int efs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, efs_fill_super); +} + +static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + int token; + struct fs_parse_result result; + + token = fs_parse(fc, efs_param_spec, param, &result); + if (token < 0) + return token; + return 0; +} + +static int efs_reconfigure(struct fs_context *fc) +{ + sync_filesystem(fc->root->d_sb); + + return 0; +} + +struct efs_context { + unsigned long s_mount_opts; +}; + +static const struct fs_context_operations efs_context_opts = { + .parse_param = efs_parse_param, + .get_tree = efs_get_tree, + .reconfigure = efs_reconfigure, + .free = efs_free_fc, +}; + +/* + * Set up the filesystem mount context. 
+ */ +static int efs_init_fs_context(struct fs_context *fc) +{ + struct efs_context *ctx; + + ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + fc->fs_private = ctx; + fc->ops = &efs_context_opts; + + return 0; +} + static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct efs_sb_info *sbi = SUPER_INFO(sb); diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 1d318f85232d..fffd3919343e 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -114,8 +114,11 @@ config EROFS_FS_ZIP_DEFLATE config EROFS_FS_ONDEMAND bool "EROFS fscache-based on-demand read support" - depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y) - default n + depends on EROFS_FS + select NETFS_SUPPORT + select FSCACHE + select CACHEFILES + select CACHEFILES_ONDEMAND help This permits EROFS to use fscache-backed data blobs with on-demand read support. diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 279933e007d2..7cc5841577b2 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -11,13 +11,12 @@ struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; - unsigned short pageofs_in, pageofs_out; unsigned int inputsize, outputsize; - /* indicate the algorithm will be used for decompression */ - unsigned int alg; + unsigned int alg; /* the algorithm for decompression */ bool inplace_io, partial_decoding, fillgaps; + gfp_t gfp; /* allocation flags for extra temporary buffers */ }; struct z_erofs_decompressor { diff --git a/fs/erofs/data.c b/fs/erofs/data.c index c98aeda8abb2..52524bd9698b 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -220,7 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL; + map->m_bdev = dif->bdev_file ? 
file_bdev(dif->bdev_file) : NULL; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; @@ -238,8 +238,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - map->m_bdev = dif->bdev_handle ? - dif->bdev_handle->bdev : NULL; + map->m_bdev = dif->bdev_file ? + file_bdev(dif->bdev_file) : NULL; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; @@ -447,5 +447,6 @@ const struct file_operations erofs_file_fops = { .llseek = generic_file_llseek, .read_iter = erofs_file_read_iter, .mmap = erofs_file_mmap, + .get_unmapped_area = thp_get_unmapped_area, .splice_read = filemap_splice_read, }; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 1d65b9f60a39..2ec9b2bb628d 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -111,8 +111,9 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, victim = availables[--top]; get_page(victim); } else { - victim = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + victim = erofs_allocpage(pagepool, rq->gfp); + if (!victim) + return -ENOMEM; set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; @@ -322,7 +323,8 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; u8 *kin; - DBG_BUGON(rq->outputsize > rq->inputsize); + if (rq->outputsize > rq->inputsize) + return -EOPNOTSUPP; if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) { cur = bs - (rq->pageofs_out & (bs - 1)); pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK; @@ -408,7 +410,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) int size, ret = 0; if (!erofs_sb_has_compr_cfgs(sbi)) { - sbi->available_compr_algs = Z_EROFS_COMPRESSION_LZ4; + sbi->available_compr_algs = 1 << 
Z_EROFS_COMPRESSION_LZ4; return z_erofs_load_lz4_config(sb, dsb, NULL, 0); } diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c index 4a64a9c91dd3..b98872058abe 100644 --- a/fs/erofs/decompressor_deflate.c +++ b/fs/erofs/decompressor_deflate.c @@ -95,7 +95,7 @@ int z_erofs_load_deflate_config(struct super_block *sb, } int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) + struct page **pgpl) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -158,8 +158,12 @@ again: strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs); outsz -= strm->z.avail_out; if (!rq->out[no]) { - rq->out[no] = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + rq->out[no] = erofs_allocpage(pgpl, rq->gfp); + if (!rq->out[no]) { + kout = NULL; + err = -ENOMEM; + break; + } set_page_private(rq->out[no], Z_EROFS_SHORTLIVED_PAGE); } @@ -211,8 +215,11 @@ again: DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb), rq->in[j])); - tmppage = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + tmppage = erofs_allocpage(pgpl, rq->gfp); + if (!tmppage) { + err = -ENOMEM; + goto failed; + } set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); copy_highpage(tmppage, rq->in[j]); rq->in[j] = tmppage; @@ -230,7 +237,7 @@ again: break; } } - +failed: if (zlib_inflateEnd(&strm->z) != Z_OK && !err) err = -EIO; if (kout) diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 2dd14f99c1dc..6ca357d83cfa 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -148,7 +148,7 @@ again: } int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) + struct page **pgpl) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -215,8 +215,11 @@ again: PAGE_SIZE - pageofs); outlen -= strm->buf.out_size; if (!rq->out[no] && rq->fillgaps) { /* deduped */ - rq->out[no] = 
erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + rq->out[no] = erofs_allocpage(pgpl, rq->gfp); + if (!rq->out[no]) { + err = -ENOMEM; + break; + } set_page_private(rq->out[no], Z_EROFS_SHORTLIVED_PAGE); } @@ -258,8 +261,11 @@ again: DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb), rq->in[j])); - tmppage = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + tmppage = erofs_allocpage(pgpl, rq->gfp); + if (!tmppage) { + err = -ENOMEM; + goto failed; + } set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); copy_highpage(tmppage, rq->in[j]); rq->in[j] = tmppage; @@ -277,6 +283,7 @@ again: break; } } +failed: if (no < nrpages_out && strm->buf.out) kunmap(rq->out[no]); if (ni < nrpages_in) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 87ff35bff8d5..89a7c2453aae 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -165,10 +165,10 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) { int ret; - struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private; + struct erofs_fscache *ctx = folio->mapping->host->i_private; struct erofs_fscache_request *req; - req = erofs_fscache_req_alloc(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio->mapping, folio_pos(folio), folio_size(folio)); if (IS_ERR(req)) { folio_unlock(folio); @@ -276,7 +276,7 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio) struct erofs_fscache_request *req; int ret; - req = erofs_fscache_req_alloc(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio->mapping, folio_pos(folio), folio_size(folio)); if (IS_ERR(req)) { folio_unlock(folio); @@ -381,11 +381,12 @@ static int erofs_fscache_init_domain(struct super_block *sb) goto out; if (!erofs_pseudo_mnt) { - erofs_pseudo_mnt = kern_mount(&erofs_fs_type); - if (IS_ERR(erofs_pseudo_mnt)) { - err = PTR_ERR(erofs_pseudo_mnt); + struct vfsmount *mnt = kern_mount(&erofs_fs_type); + if 
(IS_ERR(mnt)) { + err = PTR_ERR(mnt); goto out; } + erofs_pseudo_mnt = mnt; } domain->volume = sbi->volume; @@ -459,7 +460,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &erofs_fscache_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); inode->i_blkbits = EROFS_SB(sb)->blkszbits; inode->i_private = ctx; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 3d616dea55dc..36e638e8b53a 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -60,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, } else { const unsigned int gotten = sb->s_blocksize - *ofs; - copied = kmalloc(vi->inode_isize, GFP_NOFS); + copied = kmalloc(vi->inode_isize, GFP_KERNEL); if (!copied) { err = -ENOMEM; goto err_out; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index b0409badb017..0f0706325b7b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -49,7 +49,7 @@ typedef u32 erofs_blk_t; struct erofs_device_info { char *path; struct erofs_fscache *fscache; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct dax_device *dax_dev; u64 dax_part_off; diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index d4f631d39f0f..f0110a78acb2 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -130,24 +130,24 @@ static void *erofs_find_target_block(struct erofs_buf *target, /* string comparison without already matched prefix */ diff = erofs_dirnamecmp(name, &dname, &matched); - if (!diff) { - *_ndirents = 0; - goto out; - } else if (diff > 0) { - head = mid + 1; - startprfx = matched; - - if (!IS_ERR(candidate)) - erofs_put_metabuf(target); - *target = buf; - candidate = de; - *_ndirents = ndirents; - } else { + if (diff < 0) { erofs_put_metabuf(&buf); - back = mid - 1; endprfx = matched; + continue; + } + + if (!IS_ERR(candidate)) + erofs_put_metabuf(target); + *target = buf; + if (!diff) { + *_ndirents = 
0; + return de; } + head = mid + 1; + startprfx = matched; + candidate = de; + *_ndirents = ndirents; continue; } out: /* free if the candidate is valid */ diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 5f60f163bd56..9b4b66dcdd4f 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -177,7 +177,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; - struct bdev_handle *bdev_handle; + struct file *bdev_file; void *ptr; ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP); @@ -201,12 +201,12 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { - bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ, + bdev_file = bdev_file_open_by_path(dif->path, BLK_OPEN_READ, sb->s_type, NULL); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); - dif->bdev_handle = bdev_handle; - dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); + dif->bdev_file = bdev_file; + dif->dax_dev = fs_dax_get_by_bdev(file_bdev(bdev_file), &dif->dax_part_off, NULL, NULL); } @@ -754,8 +754,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); - if (dif->bdev_handle) - bdev_release(dif->bdev_handle); + if (dif->bdev_file) + fput(dif->bdev_file); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 5dea308764b4..e146d09151af 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -81,7 +81,7 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, repeat: xa_lock(&sbi->managed_pslots); pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, - NULL, grp, GFP_NOFS); + NULL, grp, GFP_KERNEL); if 
(pre) { if (xa_is_err(pre)) { pre = ERR_PTR(xa_err(pre)); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 692c0c39be63..ff0aa72b0db3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -82,6 +82,9 @@ struct z_erofs_pcluster { /* L: indicate several pageofs_outs or not */ bool multibases; + /* L: whether extra buffer allocations are best-effort */ + bool besteffort; + /* A: compressed bvecs (can be cached or inplaced pages) */ struct z_erofs_bvec compressed_bvecs[]; }; @@ -230,7 +233,7 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct page *nextpage = *candidate_bvpage; if (!nextpage) { - nextpage = erofs_allocpage(pagepool, GFP_NOFS); + nextpage = erofs_allocpage(pagepool, GFP_KERNEL); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); @@ -302,7 +305,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) if (nrpages > pcs->maxpages) continue; - pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); + pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL); if (!pcl) return ERR_PTR(-ENOMEM); pcl->pclustersize = size; @@ -563,21 +566,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; - if (i_blocksize(fe->inode) != PAGE_SIZE) - return; - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) + if (i_blocksize(fe->inode) != PAGE_SIZE || + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; for (i = 0; i < pclusterpages; ++i) { struct page *page, *newpage; void *t; /* mark pages just found for debugging */ - /* the compressed page was loaded before */ + /* Inaccurate check w/o locking to avoid unneeded lookups */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; page = find_get_page(mc, pcl->obj.index + i); - if (page) { t = (void *)((unsigned long)page | 1); newpage = NULL; @@ -597,9 +598,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); t 
= (void *)((unsigned long)newpage | 1); } - - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) + spin_lock(&pcl->obj.lockref.lock); + if (!pcl->compressed_bvecs[i].page) { + pcl->compressed_bvecs[i].page = t; + spin_unlock(&pcl->obj.lockref.lock); continue; + } + spin_unlock(&pcl->obj.lockref.lock); if (page) put_page(page); @@ -694,7 +699,7 @@ static void z_erofs_cache_invalidate_folio(struct folio *folio, DBG_BUGON(stop > folio_size(folio) || stop < length); if (offset == 0 && stop == folio_size(folio)) - while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + while (!z_erofs_cache_release_folio(folio, 0)) cond_resched(); } @@ -713,36 +718,30 @@ int erofs_init_managed_cache(struct super_block *sb) set_nlink(inode, 1); inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &z_erofs_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); EROFS_SB(sb)->managed_cache = inode; return 0; } -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, - struct z_erofs_bvec *bvec) -{ - struct z_erofs_pcluster *const pcl = fe->pcl; - - while (fe->icur > 0) { - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, - NULL, bvec->page)) { - pcl->compressed_bvecs[fe->icur] = *bvec; - return true; - } - } - return false; -} - /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec, bool exclusive) { + struct z_erofs_pcluster *pcl = fe->pcl; int ret; if (exclusive) { /* give priority for inplaceio to use file pages first */ - if (z_erofs_try_inplace_io(fe, bvec)) + spin_lock(&pcl->obj.lockref.lock); + while (fe->icur > 0) { + if (pcl->compressed_bvecs[--fe->icur].page) + continue; + pcl->compressed_bvecs[fe->icur] = *bvec; + spin_unlock(&pcl->obj.lockref.lock); return 0; + } + spin_unlock(&pcl->obj.lockref.lock); + /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= 
Z_EROFS_PCLUSTER_FOLLOWED && !fe->candidate_bvpage) @@ -964,7 +963,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page) + struct page *page, bool ra) { struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; @@ -1014,6 +1013,7 @@ repeat: err = z_erofs_pcluster_begin(fe); if (err) goto out; + fe->pcl->besteffort |= !ra; } /* @@ -1280,6 +1280,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, .inplace_io = overlapped, .partial_decoding = pcl->partial, .fillgaps = pcl->multibases, + .gfp = pcl->besteffort ? + GFP_KERNEL | __GFP_NOFAIL : + GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); /* must handle all compressed pages before actual file pages */ @@ -1322,6 +1325,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, pcl->length = 0; pcl->partial = true; pcl->multibases = false; + pcl->besteffort = false; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; @@ -1423,23 +1427,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, { gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; - struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr; + struct z_erofs_bvec zbv; struct address_space *mapping; - struct page *page, *oldpage; + struct page *page; int justfound, bs = i_blocksize(f->inode); /* Except for inplace pages, the entire page can be used for I/Os */ bvec->bv_offset = 0; bvec->bv_len = PAGE_SIZE; repeat: - oldpage = READ_ONCE(zbv->page); - if (!oldpage) + spin_lock(&pcl->obj.lockref.lock); + zbv = pcl->compressed_bvecs[nr]; + page = zbv.page; + justfound = (unsigned long)page & 1UL; + page = (struct page *)((unsigned long)page & ~1UL); + pcl->compressed_bvecs[nr].page = page; + spin_unlock(&pcl->obj.lockref.lock); + if (!page) goto out_allocpage; - justfound = (unsigned long)oldpage & 1UL; - page = (struct page *)((unsigned long)oldpage & ~1UL); 
bvec->bv_page = page; - DBG_BUGON(z_erofs_is_shortlived_page(page)); /* * Handle preallocated cached pages. We tried to allocate such pages @@ -1448,7 +1455,6 @@ repeat: */ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { set_page_private(page, 0); - WRITE_ONCE(zbv->page, page); tocache = true; goto out_tocache; } @@ -1459,9 +1465,9 @@ repeat: * therefore it is impossible for `mapping` to be NULL. */ if (mapping && mapping != mc) { - if (zbv->offset < 0) - bvec->bv_offset = round_up(-zbv->offset, bs); - bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset; + if (zbv.offset < 0) + bvec->bv_offset = round_up(-zbv.offset, bs); + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset; return; } @@ -1471,7 +1477,6 @@ repeat: /* the cached page is still in managed cache */ if (page->mapping == mc) { - WRITE_ONCE(zbv->page, page); /* * The cached page is still available but without a valid * `->private` pcluster hint. Let's reconnect them. @@ -1503,11 +1508,15 @@ repeat: put_page(page); out_allocpage: page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&zbv->page, oldpage, page)) { + spin_lock(&pcl->obj.lockref.lock); + if (pcl->compressed_bvecs[nr].page) { erofs_pagepool_add(&f->pagepool, page); + spin_unlock(&pcl->obj.lockref.lock); cond_resched(); goto repeat; } + pcl->compressed_bvecs[nr].page = page; + spin_unlock(&pcl->obj.lockref.lock); bvec->bv_page = page; out_tocache: if (!tocache || bs != PAGE_SIZE || @@ -1685,6 +1694,7 @@ submit_bio_retry: if (cur + bvec.bv_len > end) bvec.bv_len = end - cur; + DBG_BUGON(bvec.bv_len < sb->s_blocksize); if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, bvec.bv_offset)) goto submit_bio_retry; @@ -1785,7 +1795,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, if (PageUptodate(page)) unlock_page(page); else - (void)z_erofs_do_read_page(f, page); + (void)z_erofs_do_read_page(f, page, !!rac); put_page(page); } @@ -1806,7 +1816,7 @@ static int 
z_erofs_read_folio(struct file *file, struct folio *folio) f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; z_erofs_pcluster_readmore(&f, NULL, true); - err = z_erofs_do_read_page(&f, &folio->page); + err = z_erofs_do_read_page(&f, &folio->page, false); z_erofs_pcluster_readmore(&f, NULL, false); z_erofs_pcluster_end(&f); @@ -1847,7 +1857,7 @@ static void z_erofs_readahead(struct readahead_control *rac) folio = head; head = folio_get_private(folio); - err = z_erofs_do_read_page(&f, &folio->page); + err = z_erofs_do_read_page(&f, &folio->page, true); if (err && err != -EINTR) erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", folio->index, EROFS_I(inode)->nid); diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 9753875e41cb..e313c936351d 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -454,7 +454,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, .map = map, }; int err = 0; - unsigned int lclusterbits, endoff; + unsigned int lclusterbits, endoff, afmt; unsigned long initial_lcn; unsigned long long ofs, end; @@ -543,17 +543,20 @@ static int z_erofs_do_map_blocks(struct inode *inode, err = -EFSCORRUPTED; goto unmap_out; } - if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) - map->m_algorithmformat = - Z_EROFS_COMPRESSION_INTERLACED; - else - map->m_algorithmformat = - Z_EROFS_COMPRESSION_SHIFTED; - } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) { - map->m_algorithmformat = vi->z_algorithmtype[1]; + afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ? + Z_EROFS_COMPRESSION_INTERLACED : + Z_EROFS_COMPRESSION_SHIFTED; } else { - map->m_algorithmformat = vi->z_algorithmtype[0]; + afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? 
+ vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; + if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { + erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + afmt, vi->nid); + err = -EFSCORRUPTED; + goto unmap_out; + } } + map->m_algorithmformat = afmt; if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && diff --git a/fs/eventfd.c b/fs/eventfd.c index ad8186d47ba7..9afdb722fa92 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -251,7 +251,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c ssize_t res; __u64 ucnt; - if (count < sizeof(ucnt)) + if (count != sizeof(ucnt)) return -EINVAL; if (copy_from_user(&ucnt, buf, sizeof(ucnt))) return -EFAULT; @@ -283,13 +283,18 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) { struct eventfd_ctx *ctx = f->private_data; + __u64 cnt; spin_lock_irq(&ctx->wqh.lock); - seq_printf(m, "eventfd-count: %16llx\n", - (unsigned long long)ctx->count); + cnt = ctx->count; spin_unlock_irq(&ctx->wqh.lock); - seq_printf(m, "eventfd-id: %d\n", ctx->id); - seq_printf(m, "eventfd-semaphore: %d\n", + + seq_printf(m, + "eventfd-count: %16llx\n" + "eventfd-id: %d\n" + "eventfd-semaphore: %d\n", + cnt, + ctx->id, !!(ctx->flags & EFD_SEMAPHORE)); } #endif @@ -383,6 +388,7 @@ static int do_eventfd(unsigned int count, int flags) /* Check the EFD_* constants for consistency. 
*/ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); + BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0)); if (flags & ~EFD_FLAGS_SET) return -EINVAL; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3534d36a1474..39ac6fdf8bca 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -206,7 +206,7 @@ struct eventpoll { */ struct epitem *ovflist; - /* wakeup_source used when ep_scan_ready_list is running */ + /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ @@ -678,12 +678,6 @@ static void ep_done_scan(struct eventpoll *ep, write_unlock_irq(&ep->lock); } -static void epi_rcu_free(struct rcu_head *head) -{ - struct epitem *epi = container_of(head, struct epitem, rcu); - kmem_cache_free(epi_cache, epi); -} - static void ep_get(struct eventpoll *ep) { refcount_inc(&ep->refcount); @@ -767,7 +761,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make * use of the rbn field. */ - call_rcu(&epi->rcu, epi_rcu_free); + kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); return ep_refcount_dec_and_test(ep); @@ -1153,7 +1147,7 @@ static inline bool chain_epi_lockless(struct epitem *epi) * This callback takes a read lock in order not to contend with concurrent * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from - * ep_scan_ready_list(), which stops all list modifications and guarantees + * ep_start/done_scan(), which stops all list modifications and guarantees * that lists state is seen correctly. * * Another thing worth to mention is that ep_poll_callback() can be called @@ -1751,7 +1745,7 @@ static int ep_send_events(struct eventpoll *ep, * availability. At this point, no one can insert * into ep->rdllist besides us. 
The epoll_ctl() * callers are locked out by - * ep_scan_ready_list() holding "mtx" and the + * ep_send_events() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1904,7 +1898,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, __set_current_state(TASK_INTERRUPTIBLE); /* - * Do the final check under the lock. ep_scan_ready_list() + * Do the final check under the lock. ep_start/done_scan() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is diff --git a/fs/exec.c b/fs/exec.c index 73e4045df271..ece3ab0998e1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -128,7 +128,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { - .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .open_flag = O_LARGEFILE | O_RDONLY, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, @@ -904,6 +904,10 @@ EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ +/* + * On success, caller must call do_close_execat() on the returned + * struct file to close it. + */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; @@ -948,6 +952,17 @@ exit: return ERR_PTR(err); } +/** + * open_exec - Open a path name for execution + * + * @name: path name to open with the intent of executing it. + * + * Returns ERR_PTR on failure or allocated struct file on success. + * + * As this is a wrapper for the internal do_open_execat(), callers + * must call allow_write_access() before fput() on release. Also see + * do_close_execat(). 
+ */ struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); @@ -1143,7 +1158,6 @@ static int de_thread(struct task_struct *tsk) BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; - /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees @@ -1409,6 +1423,9 @@ int begin_new_exec(struct linux_binprm * bprm) out_unlock: up_write(&me->signal->exec_update_lock); + if (!bprm->cred) + mutex_unlock(&me->signal->cred_guard_mutex); + out: return retval; } @@ -1484,6 +1501,15 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) return -ENOMEM; } +/* Matches do_open_execat() */ +static void do_close_execat(struct file *file) +{ + if (!file) + return; + allow_write_access(file); + fput(file); +} + static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { @@ -1495,10 +1521,7 @@ static void free_bprm(struct linux_binprm *bprm) mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } + do_close_execat(bprm->file); if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. 
*/ @@ -1508,12 +1531,23 @@ static void free_bprm(struct linux_binprm *bprm) kfree(bprm); } -static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) +static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) { - struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + struct linux_binprm *bprm; + struct file *file; int retval = -ENOMEM; - if (!bprm) - goto out; + + file = do_open_execat(fd, filename, flags); + if (IS_ERR(file)) + return ERR_CAST(file); + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) { + do_close_execat(file); + return ERR_PTR(-ENOMEM); + } + + bprm->file = file; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; @@ -1526,18 +1560,28 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) if (!bprm->fdpath) goto out_free; + /* + * Record that a name derived from an O_CLOEXEC fd will be + * inaccessible after exec. This allows the code in exec to + * choose to fail when the executable is not mmaped into the + * interpreter and an open file descriptor is not passed to + * the interpreter. This makes for a better user experience + * than having the interpreter start and then immediately fail + * when it finds the executable is inaccessible. + */ + if (get_close_on_exec(fd)) + bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; + bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); - if (retval) - goto out_free; - return bprm; + if (!retval) + return bprm; out_free: free_bprm(bprm); -out: return ERR_PTR(retval); } @@ -1588,6 +1632,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) } rcu_read_unlock(); + /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else @@ -1804,13 +1849,8 @@ static int exec_binprm(struct linux_binprm *bprm) return 0; } -/* - * sys_execve() executes a new program. 
- */ -static int bprm_execve(struct linux_binprm *bprm, - int fd, struct filename *filename, int flags) +static int bprm_execve(struct linux_binprm *bprm) { - struct file *file; int retval; retval = prepare_bprm_creds(bprm); @@ -1826,26 +1866,8 @@ static int bprm_execve(struct linux_binprm *bprm, current->in_execve = 1; sched_mm_cid_before_execve(current); - file = do_open_execat(fd, filename, flags); - retval = PTR_ERR(file); - if (IS_ERR(file)) - goto out_unmark; - sched_exec(); - bprm->file = file; - /* - * Record that a name derived from an O_CLOEXEC fd will be - * inaccessible after exec. This allows the code in exec to - * choose to fail when the executable is not mmaped into the - * interpreter and an open file descriptor is not passed to - * the interpreter. This makes for a better user experience - * than having the interpreter start and then immediately fail - * when it finds the executable is inaccessible. - */ - if (bprm->fdpath && get_close_on_exec(fd)) - bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; - /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval) @@ -1875,7 +1897,6 @@ out: if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); -out_unmark: sched_mm_cid_after_execve(current); current->fs->in_exec = 0; current->in_execve = 0; @@ -1910,7 +1931,7 @@ static int do_execveat_common(int fd, struct filename *filename, * further execve() calls fail. 
*/ current->flags &= ~PF_NPROC_EXCEEDED; - bprm = alloc_bprm(fd, filename); + bprm = alloc_bprm(fd, filename, flags); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; @@ -1959,7 +1980,7 @@ static int do_execveat_common(int fd, struct filename *filename, bprm->argc = 1; } - retval = bprm_execve(bprm, fd, filename, flags); + retval = bprm_execve(bprm); out_free: free_bprm(bprm); @@ -1984,7 +2005,7 @@ int kernel_execve(const char *kernel_filename, if (IS_ERR(filename)) return PTR_ERR(filename); - bprm = alloc_bprm(fd, filename); + bprm = alloc_bprm(fd, filename, 0); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; @@ -2019,7 +2040,7 @@ int kernel_execve(const char *kernel_filename, if (retval < 0) goto out_free; - retval = bprm_execve(bprm, fd, filename, 0); + retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 9474cd50da6d..361595433480 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -275,6 +275,7 @@ struct exfat_sb_info { spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[EXFAT_HASH_SIZE]; + struct rcu_head rcu; }; #define EXFAT_CACHE_VALID 0 diff --git a/fs/exfat/file.c b/fs/exfat/file.c index d25a96a148af..cc00f1a7a1e1 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -35,13 +35,18 @@ static int exfat_cont_expand(struct inode *inode, loff_t size) if (new_num_clusters == num_clusters) goto out; - exfat_chain_set(&clu, ei->start_clu, num_clusters, ei->flags); - ret = exfat_find_last_cluster(sb, &clu, &last_clu); - if (ret) - return ret; + if (num_clusters) { + exfat_chain_set(&clu, ei->start_clu, num_clusters, ei->flags); + ret = exfat_find_last_cluster(sb, &clu, &last_clu); + if (ret) + return ret; + + clu.dir = last_clu + 1; + } else { + last_clu = EXFAT_EOF_CLUSTER; + clu.dir = EXFAT_EOF_CLUSTER; + } - clu.dir = (last_clu == EXFAT_EOF_CLUSTER) ? 
- EXFAT_EOF_CLUSTER : last_clu + 1; clu.size = 0; clu.flags = ei->flags; @@ -51,17 +56,19 @@ static int exfat_cont_expand(struct inode *inode, loff_t size) return ret; /* Append new clusters to chain */ - if (clu.flags != ei->flags) { - exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters); - ei->flags = ALLOC_FAT_CHAIN; - } - if (clu.flags == ALLOC_FAT_CHAIN) - if (exfat_ent_set(sb, last_clu, clu.dir)) - goto free_clu; - - if (num_clusters == 0) + if (num_clusters) { + if (clu.flags != ei->flags) + if (exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters)) + goto free_clu; + + if (clu.flags == ALLOC_FAT_CHAIN) + if (exfat_ent_set(sb, last_clu, clu.dir)) + goto free_clu; + } else ei->start_clu = clu.dir; + ei->flags = clu.flags; + out: inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); /* Expanded range not zeroed, do not update valid_size */ diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 522edcbb2ce4..0687f952956c 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -501,7 +501,7 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = mapping->host; struct exfat_inode_info *ei = EXFAT_I(inode); loff_t pos = iocb->ki_pos; - loff_t size = iocb->ki_pos + iov_iter_count(iter); + loff_t size = pos + iov_iter_count(iter); int rw = iov_iter_rw(iter); ssize_t ret; @@ -525,11 +525,10 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) */ ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); if (ret < 0) { - if (rw == WRITE) + if (rw == WRITE && ret != -EIOCBQUEUED) exfat_write_failed(mapping, size); - if (ret != -EIOCBQUEUED) - return ret; + return ret; } else size = pos + ret; diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 705710f93e2d..afdf13c34ff5 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -655,7 +655,6 @@ static int exfat_load_upcase_table(struct super_block *sb, unsigned int sect_size = sb->s_blocksize; unsigned int i, index = 0; u32 chksum = 0; - int 
ret; unsigned char skip = false; unsigned short *upcase_table; @@ -673,8 +672,7 @@ static int exfat_load_upcase_table(struct super_block *sb, if (!bh) { exfat_err(sb, "failed to read sector(0x%llx)", (unsigned long long)sector); - ret = -EIO; - goto free_table; + return -EIO; } sector++; for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) { @@ -701,15 +699,12 @@ static int exfat_load_upcase_table(struct super_block *sb, exfat_err(sb, "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)", index, chksum, utbl_checksum); - ret = -EINVAL; -free_table: - exfat_free_upcase_table(sbi); - return ret; + return -EINVAL; } static int exfat_load_default_upcase_table(struct super_block *sb) { - int i, ret = -EIO; + int i; struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned char skip = false; unsigned short uni = 0, *upcase_table; @@ -740,8 +735,7 @@ static int exfat_load_default_upcase_table(struct super_block *sb) return 0; /* FATAL error: default upcase table has error */ - exfat_free_upcase_table(sbi); - return ret; + return -EIO; } int exfat_create_upcase_table(struct super_block *sb) diff --git a/fs/exfat/super.c b/fs/exfat/super.c index d9d4fa91010b..fcb658267765 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -39,9 +39,6 @@ static void exfat_put_super(struct super_block *sb) exfat_free_bitmap(sbi); brelse(sbi->boot_bh); mutex_unlock(&sbi->s_lock); - - unload_nls(sbi->nls_io); - exfat_free_upcase_table(sbi); } static int exfat_sync_fs(struct super_block *sb, int wait) @@ -600,7 +597,7 @@ static int __exfat_fill_super(struct super_block *sb) ret = exfat_load_bitmap(sb); if (ret) { exfat_err(sb, "failed to load alloc-bitmap"); - goto free_upcase_table; + goto free_bh; } ret = exfat_count_used_clusters(sb, &sbi->used_clusters); @@ -613,8 +610,6 @@ static int __exfat_fill_super(struct super_block *sb) free_alloc_bitmap: exfat_free_bitmap(sbi); -free_upcase_table: - exfat_free_upcase_table(sbi); free_bh: brelse(sbi->boot_bh); return ret; 
@@ -701,12 +696,10 @@ put_inode: sb->s_root = NULL; free_table: - exfat_free_upcase_table(sbi); exfat_free_bitmap(sbi); brelse(sbi->boot_bh); check_nls_io: - unload_nls(sbi->nls_io); return err; } @@ -771,13 +764,22 @@ static int exfat_init_fs_context(struct fs_context *fc) return 0; } +static void delayed_free(struct rcu_head *p) +{ + struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu); + + unload_nls(sbi->nls_io); + exfat_free_upcase_table(sbi); + exfat_free_sbi(sbi); +} + static void exfat_kill_sb(struct super_block *sb) { struct exfat_sb_info *sbi = sb->s_fs_info; kill_block_super(sb); if (sbi) - exfat_free_sbi(sbi); + call_rcu(&sbi->rcu, delayed_free); } static struct file_system_type exfat_fs_type = { diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 3ae0154c5680..07ea3d62b298 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -255,7 +255,7 @@ static bool filldir_one(struct dir_context *ctx, const char *name, int len, container_of(ctx, struct getdents_callback, ctx); buf->sequence++; - if (buf->ino == ino && len <= NAME_MAX) { + if (buf->ino == ino && len <= NAME_MAX && !is_dot_dotdot(name, len)) { memcpy(buf->name, name, len); buf->name[len] = '\0'; buf->found = 1; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a5d784872303..3c0d7d143036 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -252,8 +252,10 @@ struct ext4_allocation_request { #define EXT4_MAP_MAPPED BIT(BH_Mapped) #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) +#define EXT4_MAP_DELAYED BIT(BH_Delay) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ - EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ + EXT4_MAP_DELAYED) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -1548,7 +1550,7 @@ struct ext4_sb_info { unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; - struct bdev_handle *s_journal_bdev_handle; + struct file *s_journal_bdev_file; #ifdef 
CONFIG_QUOTA /* Names of quota files with journalled quota */ char __rcu *s_qf_names[EXT4_MAXQUOTAS]; @@ -2912,10 +2914,10 @@ extern const struct seq_operations ext4_mb_seq_groups_ops; extern const struct seq_operations ext4_mb_seq_structs_summary_ops; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); -extern int ext4_mb_release(struct super_block *); +extern void ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); -extern void ext4_discard_preallocations(struct inode *, unsigned int); +extern void ext4_discard_preallocations(struct inode *); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 01299b55a567..7669d154c05e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; @@ -2229,7 +2229,7 @@ static int ext4_fill_es_cache_info(struct inode *inode, /* - * ext4_ext_determine_hole - determine hole around given block + * ext4_ext_find_hole - find hole around given block according to the given path * @inode: inode we lookup in * @path: path in extent tree to @lblk * @lblk: pointer to logical block around which we want to determine hole @@ -2241,9 +2241,9 @@ static int ext4_fill_es_cache_info(struct inode *inode, * The function returns the length of a hole starting at @lblk. We update @lblk * to the beginning of the hole if we managed to find it. 
*/ -static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t *lblk) +static ext4_lblk_t ext4_ext_find_hole(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *lblk) { int depth = ext_depth(inode); struct ext4_extent *ex; @@ -2271,30 +2271,6 @@ static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, } /* - * ext4_ext_put_gap_in_cache: - * calculate boundaries of the gap that the requested block fits into - * and cache this gap - */ -static void -ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, - ext4_lblk_t hole_len) -{ - struct extent_status es; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, - hole_start + hole_len - 1, &es); - if (es.es_len) { - /* There's delayed extent containing lblock? */ - if (es.es_lblk <= hole_start) - return; - hole_len = min(es.es_lblk - hole_start, hole_len); - } - ext_debug(inode, " -> %u:%u\n", hole_start, hole_len); - ext4_es_insert_extent(inode, hole_start, hole_len, ~0, - EXTENT_STATUS_HOLE); -} - -/* * ext4_ext_rm_idx: * removes index from the index block. */ @@ -4062,6 +4038,72 @@ static int get_implied_cluster_alloc(struct super_block *sb, return 0; } +/* + * Determine hole length around the given logical block, first try to + * locate and expand the hole from the given @path, and then adjust it + * if it's partially or completely converted to delayed extents, insert + * it into the extent cache tree if it's indeed a hole, finally return + * the length of the determined extent. 
+ */ +static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t lblk) +{ + ext4_lblk_t hole_start, len; + struct extent_status es; + + hole_start = lblk; + len = ext4_ext_find_hole(inode, path, &hole_start); +again: + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, + hole_start + len - 1, &es); + if (!es.es_len) + goto insert_hole; + + /* + * There's a delalloc extent in the hole, handle it if the delalloc + * extent is in front of, behind and straddle the queried range. + */ + if (lblk >= es.es_lblk + es.es_len) { + /* + * The delalloc extent is in front of the queried range, + * find again from the queried start block. + */ + len -= lblk - hole_start; + hole_start = lblk; + goto again; + } else if (in_range(lblk, es.es_lblk, es.es_len)) { + /* + * The delalloc extent containing lblk, it must have been + * added after ext4_map_blocks() checked the extent status + * tree so we are not holding i_rwsem and delalloc info is + * only stabilized by i_data_sem we are going to release + * soon. Don't modify the extent status tree and report + * extent as a hole, just adjust the length to the delalloc + * extent's after lblk. + */ + len = es.es_lblk + es.es_len - lblk; + return len; + } else { + /* + * The delalloc extent is partially or completely behind + * the queried range, update hole length until the + * beginning of the delalloc extent. 
+ */ + len = min(es.es_lblk - hole_start, len); + } + +insert_hole: + /* Put just found gap into cache to speed up subsequent requests */ + ext_debug(inode, " -> %u:%u\n", hole_start, len); + ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE); + + /* Update hole_len to reflect hole size after lblk */ + if (hole_start != lblk) + len -= lblk - hole_start; + + return len; +} /* * Block allocation/map/preallocation routine for extents based files @@ -4179,22 +4221,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * we couldn't try to create block if create flag is zero */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { - ext4_lblk_t hole_start, hole_len; + ext4_lblk_t len; - hole_start = map->m_lblk; - hole_len = ext4_ext_determine_hole(inode, path, &hole_start); - /* - * put just found gap into cache to speed up - * subsequent requests - */ - ext4_ext_put_gap_in_cache(inode, hole_start, hole_len); + len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk); - /* Update hole_len to reflect hole size after map->m_lblk */ - if (hole_start != map->m_lblk) - hole_len -= map->m_lblk - hole_start; map->m_pblk = 0; - map->m_len = min_t(unsigned int, map->m_len, hole_len); - + map->m_len = min_t(unsigned int, map->m_len, len); goto out; } @@ -4313,7 +4345,7 @@ got_allocated_blocks: * not a good idea to call discard here directly, * but otherwise we'd need to call it every free(). 
*/ - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; ext4_free_blocks(handle, inode, NULL, newblock, @@ -5357,7 +5389,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); @@ -5365,7 +5397,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start, SHIFT_LEFT); @@ -5497,7 +5529,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); path = ext4_find_extent(inode, offset_lblk, NULL, 0); if (IS_ERR(path)) { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6aa15dafc677..54d6ff22585c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -174,7 +174,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp) (atomic_read(&inode->i_writecount) == 1) && !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index 11e6f33677a2..df853c4d3a8c 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -576,9 +576,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb, if 
(fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) return true; - if (EXT4_SB(sb)->s_journal_bdev_handle && + if (EXT4_SB(sb)->s_journal_bdev_file && fm->fmr_device == - new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev)) + new_encode_dev(file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev)) return true; return false; } @@ -648,9 +648,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, memset(handlers, 0, sizeof(handlers)); handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); handlers[0].gfd_fn = ext4_getfsmap_datadev; - if (EXT4_SB(sb)->s_journal_bdev_handle) { + if (EXT4_SB(sb)->s_journal_bdev_file) { handlers[1].gfd_dev = new_encode_dev( - EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev); + file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev); handlers[1].gfd_fn = ext4_getfsmap_logdev; } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index a9f3716119d3..d8ca7f64f952 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -714,7 +714,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, * i_rwsem. So we can safely drop the i_data_sem here. 
*/ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5af1b0b8680e..2ccf3b5e3a7c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -371,7 +371,7 @@ void ext4_da_update_reserve_space(struct inode *inode, */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); } static int __check_block_validity(struct inode *inode, const char *func, @@ -515,6 +515,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; + map->m_flags |= ext4_es_is_delayed(&es) ? + EXT4_MAP_DELAYED : 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; @@ -1703,11 +1705,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { - if (ext4_es_is_hole(&es)) { - retval = 0; - down_read(&EXT4_I(inode)->i_data_sem); + if (ext4_es_is_hole(&es)) goto add_delayed; - } /* * Delayed extent could be allocated by fallocate. @@ -1749,26 +1748,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, retval = ext4_ext_map_blocks(NULL, inode, map, 0); else retval = ext4_ind_map_blocks(NULL, inode, map, 0); - -add_delayed: - if (retval == 0) { - int ret; - - /* - * XXX: __block_prepare_write() unmaps passed block, - * is it OK? 
- */ - - ret = ext4_insert_delayed_block(inode, map->m_lblk); - if (ret != 0) { - retval = ret; - goto out_unlock; - } - - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); - } else if (retval > 0) { + if (retval < 0) { + up_read(&EXT4_I(inode)->i_data_sem); + return retval; + } + if (retval > 0) { unsigned int status; if (unlikely(retval != map->m_len)) { @@ -1783,11 +1767,21 @@ add_delayed: EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); + up_read(&EXT4_I(inode)->i_data_sem); + return retval; } + up_read(&EXT4_I(inode)->i_data_sem); -out_unlock: - up_read((&EXT4_I(inode)->i_data_sem)); +add_delayed: + down_write(&EXT4_I(inode)->i_data_sem); + retval = ext4_insert_delayed_block(inode, map->m_lblk); + up_write(&EXT4_I(inode)->i_data_sem); + if (retval) + return retval; + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); return retval; } @@ -3268,6 +3262,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; + } else if (map->m_flags & EXT4_MAP_DELAYED) { + iomap->type = IOMAP_DELALLOC; + iomap->addr = IOMAP_NULL_ADDR; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; @@ -3430,35 +3427,11 @@ const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_end = ext4_iomap_end, }; -static bool ext4_iomap_is_delalloc(struct inode *inode, - struct ext4_map_blocks *map) -{ - struct extent_status es; - ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, - map->m_lblk, end, &es); - - if (!es.es_len || es.es_lblk > end) - return false; - - if (es.es_lblk > map->m_lblk) { - map->m_len = es.es_lblk - map->m_lblk; - return false; - } - - offset = map->m_lblk - es.es_lblk; - map->m_len = es.es_len - offset; - - return 
true; -} - static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; - bool delalloc = false; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; @@ -3499,13 +3472,8 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; - if (ret == 0) - delalloc = ext4_iomap_is_delalloc(inode, &map); - set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); - if (delalloc && iomap->type == IOMAP_HOLE) - iomap->type = IOMAP_DELALLOC; return 0; } @@ -4015,12 +3983,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) /* If there are blocks to remove, do it */ if (stop_block > first_block) { + ext4_lblk_t hole_len = stop_block - first_block; down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, first_block, - stop_block - first_block); + ext4_es_remove_extent(inode, first_block, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, first_block, @@ -4029,6 +3997,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) ret = ext4_ind_remove_space(handle, inode, first_block, stop_block); + ext4_es_insert_extent(inode, first_block, hole_len, ~0, + EXTENT_STATUS_HOLE); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, first_block, stop_block); @@ -4170,7 +4140,7 @@ int ext4_truncate(struct inode *inode) down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index aa6be510eb8f..7160a71044c8 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -467,7 +467,7 @@ static long 
swap_inode_boot_loader(struct super_block *sb, ext4_reset_inode_seed(inode); ext4_reset_inode_seed(inode_bl); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f44f668e407f..e4f7cf9d89c4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -564,14 +564,14 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -677,7 +677,7 @@ do { \ } \ } while (0) -static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, +static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; @@ -696,7 +696,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, void *buddy2; if (e4b->bd_info->bb_check_counter++ % 10) - return 0; + return; while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); @@ -758,7 +758,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, grp = ext4_get_group_info(sb, e4b->bd_group); if (!grp) - return NULL; + return; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; @@ -768,7 +768,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } - return 0; } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ @@ -842,7 +841,7 @@ mb_update_avg_fragment_size(struct 
super_block *sb, struct ext4_group_info *grp) struct ext4_sb_info *sbi = EXT4_SB(sb); int new_order; - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0) return; new_order = mb_avg_fragment_size_order(sb, @@ -871,7 +870,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) * cr level needs an update. */ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *iter; @@ -945,7 +944,7 @@ ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int o * order. Updates *new_cr if cr level needs an update. */ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; @@ -990,7 +989,7 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * * much and fall to CR_GOAL_LEN_SLOW in that case. 
*/ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; @@ -1125,11 +1124,11 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, } if (*new_cr == CR_POWER2_ALIGNED) { - ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group); } else if (*new_cr == CR_GOAL_LEN_FAST) { - ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_goal_fast(ac, new_cr, group); } else if (*new_cr == CR_BEST_AVAIL_LEN) { - ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_best_avail(ac, new_cr, group); } else { /* * TODO: For CR=2, we can arrange groups in an rb tree sorted by @@ -1233,6 +1232,24 @@ void ext4_mb_generate_buddy(struct super_block *sb, atomic64_add(period, &sbi->s_mb_generation_time); } +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) + mb_set_bits(buddy, 0, count); + + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group, e4b->bd_info); +} + /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. 
The information involve @@ -1891,11 +1908,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); - this_cpu_inc(discard_pa_seq); - e4b->bd_info->bb_free += count; - if (first < e4b->bd_info->bb_first_free) - e4b->bd_info->bb_first_free = first; - /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ @@ -1909,21 +1921,31 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; + /* + * Fastcommit replay can free already freed blocks which + * corrupts allocation info. Regenerate it. + */ + if (sbi->s_mount_state & EXT4_FC_REPLAY) { + mb_regenerate_buddy(e4b); + goto check; + } + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); - if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { - ext4_grp_locked_error(sb, e4b->bd_group, - inode ? inode->i_ino : 0, - blocknr, - "freeing already freed block (bit %u); block bitmap corrupt.", - block); - ext4_mark_group_bitmap_corrupted( - sb, e4b->bd_group, + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); - } - goto done; + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? 
inode->i_ino : 0, blocknr, + "freeing already freed block (bit %u); block bitmap corrupt.", + block); + return; } + this_cpu_inc(discard_pa_seq); + e4b->bd_info->bb_free += count; + if (first < e4b->bd_info->bb_first_free) + e4b->bd_info->bb_first_free = first; + /* let's maintain fragments counter */ if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; @@ -1948,9 +1970,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first <= last) mb_buddy_mark_free(e4b, first >> 1, last >> 1); -done: mb_set_largest_free_order(sb, e4b->bd_info); mb_update_avg_fragment_size(sb, e4b->bd_info); +check: mb_check_buddy(e4b); } @@ -2276,6 +2298,9 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac, return; ext4_lock_group(ac->ac_sb, group); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + goto out; + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { @@ -2283,6 +2308,7 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac, ext4_mb_use_best_found(ac, e4b); } +out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); } @@ -2309,12 +2335,10 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, if (err) return err; - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { - ext4_mb_unload_buddy(e4b); - return 0; - } - ext4_lock_group(ac->ac_sb, group); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + goto out; + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ @@ -2347,6 +2371,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } +out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); @@ -2380,12 +2405,12 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { + ext4_mark_group_bitmap_corrupted(ac->ac_sb, + e4b->bd_group, + 
EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, "%d free clusters of order %d. But found 0", grp->bb_counters[i], i); - ext4_mark_group_bitmap_corrupted(ac->ac_sb, - e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } ac->ac_found++; @@ -2436,12 +2461,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * have free blocks */ + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But bitmap says 0", free); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } @@ -2467,12 +2492,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, if (WARN_ON(ex.fe_len <= 0)) break; if (free < ex.fe_len) { + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit @@ -3725,7 +3750,7 @@ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) return count; } -int ext4_mb_release(struct super_block *sb) +void ext4_mb_release(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; @@ -3801,8 +3826,6 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - - return 0; } static inline int ext4_issue_discard(struct super_block *sb, @@ -5284,7 +5307,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) * the caller MUST hold group/inode locks. 
* TODO: optimize the case when there are no in-core structures yet */ -static noinline_for_stack int +static noinline_for_stack void ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, struct ext4_prealloc_space *pa) { @@ -5334,11 +5357,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, */ } atomic_add(free, &sbi->s_mb_discarded); - - return 0; } -static noinline_for_stack int +static noinline_for_stack void ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { @@ -5352,13 +5373,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", e4b->bd_group, group, pa->pa_pstart); - return 0; + return; } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); - - return 0; } /* @@ -5479,7 +5498,7 @@ out_dbg: * * FIXME!! 
Make sure it is valid at all the call sites */ -void ext4_discard_preallocations(struct inode *inode, unsigned int needed) +void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; @@ -5491,9 +5510,8 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) struct rb_node *iter; int err; - if (!S_ISREG(inode->i_mode)) { + if (!S_ISREG(inode->i_mode)) return; - } if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -5501,15 +5519,12 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, - atomic_read(&ei->i_prealloc_active), needed); - - if (needed == 0) - needed = UINT_MAX; + atomic_read(&ei->i_prealloc_active)); repeat: /* first, collect all pa's in the inode */ write_lock(&ei->i_prealloc_lock); - for (iter = rb_first(&ei->i_prealloc_node); iter && needed; + for (iter = rb_first(&ei->i_prealloc_node); iter; iter = rb_next(iter)) { pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); @@ -5533,7 +5548,6 @@ repeat: spin_unlock(&pa->pa_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); - needed--; continue; } @@ -5943,7 +5957,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* * release all resource we used in allocation */ -static int ext4_mb_release_context(struct ext4_allocation_context *ac) +static void ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; @@ -5980,7 +5994,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); - return 0; } static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) 
@@ -6761,6 +6774,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) bool set_trimmed = false; void *bitmap; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return 0; + last = ext4_last_grp_cluster(sb, e4b->bd_group); bitmap = e4b->bd_bitmap; if (start == 0 && max >= last) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index d7aeb5da7d86..56938532b4ce 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -192,7 +192,6 @@ struct ext4_allocation_context { */ ext4_grpblk_t ac_orig_goal_len; - __u32 ac_groups_considered; __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; __u16 ac_groups_linear_remaining; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 3aa57376d9c2..7cd4afa4de1d 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -618,6 +618,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, goto out; o_end = o_start + len; + *moved_len = 0; while (o_start < o_end) { struct ext4_extent *ex; ext4_lblk_t cur_blk, next_blk; @@ -672,7 +673,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, */ ext4_double_up_write_data_sem(orig_inode, donor_inode); /* Swap original branches with new branches */ - move_extent_per_page(o_filp, donor_inode, + *moved_len += move_extent_per_page(o_filp, donor_inode, orig_page_index, donor_page_index, offset_in_page, cur_len, unwritten, &ret); @@ -682,14 +683,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, o_start += cur_len; d_start += cur_len; } - *moved_len = o_start - orig_blk; - if (*moved_len > len) - *moved_len = len; out: if (*moved_len) { - ext4_discard_preallocations(orig_inode, 0); - ext4_discard_preallocations(donor_inode, 0); + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); } ext4_free_ext_path(path); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 05b647e6bc19..5e4f65c14dfb 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ 
-1762,7 +1762,6 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); - generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return NULL; if (err) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dcba0f85dfe2..a8ba84eabab2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1359,14 +1359,14 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); - if (sbi->s_journal_bdev_handle) { + if (sbi->s_journal_bdev_file) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. */ - sync_blockdev(sbi->s_journal_bdev_handle->bdev); - invalidate_bdev(sbi->s_journal_bdev_handle->bdev); + sync_blockdev(file_bdev(sbi->s_journal_bdev_file)); + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); @@ -1525,7 +1525,7 @@ void ext4_clear_inode(struct inode *inode) ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { @@ -4233,7 +4233,7 @@ int ext4_calculate_overhead(struct super_block *sb) * Add the internal journal blocks whether the journal has been * loaded or not */ - if (sbi->s_journal && !sbi->s_journal_bdev_handle) + if (sbi->s_journal && !sbi->s_journal_bdev_file) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ @@ -5346,7 +5346,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif - 
memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); @@ -5484,6 +5484,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount4; } + generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); @@ -5670,9 +5671,9 @@ failed_mount: #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); - if (sbi->s_journal_bdev_handle) { - invalidate_bdev(sbi->s_journal_bdev_handle->bdev); - bdev_release(sbi->s_journal_bdev_handle); + if (sbi->s_journal_bdev_file) { + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); + fput(sbi->s_journal_bdev_file); } out_fail: invalidate_bdev(sb->s_bdev); @@ -5842,30 +5843,30 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb, return journal; } -static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, +static struct file *ext4_get_journal_blkdev(struct super_block *sb, dev_t j_dev, ext4_fsblk_t *j_start, ext4_fsblk_t *j_len) { struct buffer_head *bh; struct block_device *bdev; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; int errno; - bdev_handle = bdev_open_by_dev(j_dev, + bdev_file = bdev_file_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, sb, &fs_holder_ops); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", - MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle)); - return bdev_handle; + MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file)); + return bdev_file; } - bdev = bdev_handle->bdev; + bdev = file_bdev(bdev_file); blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { 
@@ -5912,12 +5913,12 @@ static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, *j_start = sb_block + 1; *j_len = ext4_blocks_count(es); brelse(bh); - return bdev_handle; + return bdev_file; out_bh: brelse(bh); out_bdev: - bdev_release(bdev_handle); + fput(bdev_file); return ERR_PTR(errno); } @@ -5927,14 +5928,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int errno = 0; - bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); - if (IS_ERR(bdev_handle)) - return ERR_CAST(bdev_handle); + bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file); - journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start, + journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); @@ -5949,14 +5950,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, goto out_journal; } journal->j_private = sb; - EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle; + EXT4_SB(sb)->s_journal_bdev_file = bdev_file; ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: - bdev_release(bdev_handle); + fput(bdev_file); return ERR_PTR(errno); } @@ -7314,12 +7315,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb) static void ext4_kill_sb(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct bdev_handle *handle = sbi ? sbi->s_journal_bdev_handle : NULL; + struct file *bdev_file = sbi ? 
sbi->s_journal_bdev_file : NULL; kill_block_super(sb); - if (handle) - bdev_release(handle); + if (bdev_file) + fput(bdev_file); } static struct file_system_type ext4_fs_type = { diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 75bf1f88843c..645240cc0229 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -92,10 +92,12 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, if (!dentry) { bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); - if (IS_ERR(bh)) - return ERR_CAST(bh); - if (!bh || !ext4_buffer_uptodate(bh)) + if (IS_ERR(bh) || !bh) return ERR_PTR(-ECHILD); + if (!ext4_buffer_uptodate(bh)) { + brelse(bh); + return ERR_PTR(-ECHILD); + } } else { bh = ext4_bread(NULL, inode, 0, 0); if (IS_ERR(bh)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 65294e3b0bef..4c77e8ce5c75 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -24,6 +24,7 @@ #include <linux/blkdev.h> #include <linux/quotaops.h> #include <linux/part_stat.h> +#include <linux/rw_hint.h> #include <crypto/hash.h> #include <linux/fscrypt.h> @@ -1239,7 +1240,7 @@ struct f2fs_bio_info { #define FDEV(i) (sbi->devs[i]) #define RDEV(i) (raw_super->devs[i]) struct f2fs_dev_info { - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; char path[MAX_PATH_LEN]; unsigned int total_segments; @@ -3364,17 +3365,6 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); } -static inline bool is_dot_dotdot(const u8 *name, size_t len) -{ - if (len == 1 && name[0] == '.') - return true; - - if (len == 2 && name[0] == '.' 
&& name[1] == '.') - return true; - - return false; -} - static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b3bb815fc6aa..f7f63a567d86 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -531,7 +531,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } err = f2fs_prepare_lookup(dir, dentry, &fname); - generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) goto out_splice; if (err) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4c8836ded90f..e1065ba70207 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1971,9 +1971,15 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) { + unsigned int nofs_flags; + int ret; + trace_f2fs_issue_reset_zone(bdev, blkstart); - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - sector, nr_sects, GFP_NOFS); + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + sector, nr_sects); + memalloc_nofs_restore(nofs_flags); + return ret; } __queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen); @@ -4865,6 +4871,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, block_t zone_block, valid_block_cnt; unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; int ret; + unsigned int nofs_flags; if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; @@ -4912,8 +4919,10 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, "pointer: valid block[0x%x,0x%x] cond[0x%x]", zone_segno, valid_block_cnt, zone->cond); + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, - zone->start, zone->len, GFP_NOFS); + zone->start, zone->len); + memalloc_nofs_restore(nofs_flags); if (ret == -EOPNOTSUPP) { ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, zone->len - (zone->wp - zone->start), diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 
d00d21a8b53a..b880b746f226 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1605,7 +1605,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) for (i = 0; i < sbi->s_ndevs; i++) { if (i > 0) - bdev_release(FDEV(i).bdev_handle); + fput(FDEV(i).bdev_file); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); #endif @@ -4247,7 +4247,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) for (i = 0; i < max_devices; i++) { if (i == 0) - FDEV(0).bdev_handle = sbi->sb->s_bdev_handle; + FDEV(0).bdev_file = sbi->sb->s_bdev_file; else if (!RDEV(i).path[0]) break; @@ -4267,14 +4267,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).end_blk = FDEV(i).start_blk + (FDEV(i).total_segments << sbi->log_blocks_per_seg) - 1; - FDEV(i).bdev_handle = bdev_open_by_path( + FDEV(i).bdev_file = bdev_file_open_by_path( FDEV(i).path, mode, sbi->sb, NULL); } } - if (IS_ERR(FDEV(i).bdev_handle)) - return PTR_ERR(FDEV(i).bdev_handle); + if (IS_ERR(FDEV(i).bdev_file)) + return PTR_ERR(FDEV(i).bdev_file); - FDEV(i).bdev = FDEV(i).bdev_handle->bdev; + FDEV(i).bdev = file_bdev(FDEV(i).bdev_file); /* to release errored devices */ sbi->s_ndevs = i + 1; @@ -4496,7 +4496,7 @@ try_onemore: sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? 
SB_POSIXACL : 0); - memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + super_set_uuid(sb, (void *) raw_super->uuid, sizeof(raw_super->uuid)); sb->s_iflags |= SB_I_CGROUPWB; /* init f2fs-specific super block info */ @@ -4660,6 +4660,7 @@ try_onemore: goto free_node_inode; } + generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { err = -ENOMEM; @@ -4880,6 +4881,7 @@ free_sbi: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi); + sb->s_fs_info = NULL; /* give only one another chance */ if (retry_cnt > 0 && skip_recovery) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 1fac3dabf130..5c813696d1ff 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1762,6 +1762,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, else /* fat 16 or 12 */ sbi->vol_id = bpb.fat16_vol_id; + __le32 vol_id_le = cpu_to_le32(sbi->vol_id); + super_set_uuid(sb, (void *) &vol_id_le, sizeof(vol_id_le)); + sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; diff --git a/fs/fcntl.c b/fs/fcntl.c index c80a6acad742..54cc85d3338e 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -27,6 +27,7 @@ #include <linux/memfd.h> #include <linux/compat.h> #include <linux/mount.h> +#include <linux/rw_hint.h> #include <linux/poll.h> #include <asm/siginfo.h> @@ -268,8 +269,15 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) } #endif -static bool rw_hint_valid(enum rw_hint hint) +static bool rw_hint_valid(u64 hint) { + BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET); + BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE); + BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT); + BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM); + BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG); + BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME); + switch (hint) { case 
RWH_WRITE_LIFE_NOT_SET: case RWH_WRITE_LIFE_NONE: @@ -283,34 +291,40 @@ static bool rw_hint_valid(enum rw_hint hint) } } -static long fcntl_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, + unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; - enum rw_hint hint; - u64 h; + u64 hint = READ_ONCE(inode->i_write_hint); - switch (cmd) { - case F_GET_RW_HINT: - h = inode->i_write_hint; - if (copy_to_user(argp, &h, sizeof(*argp))) - return -EFAULT; - return 0; - case F_SET_RW_HINT: - if (copy_from_user(&h, argp, sizeof(h))) - return -EFAULT; - hint = (enum rw_hint) h; - if (!rw_hint_valid(hint)) - return -EINVAL; + if (copy_to_user(argp, &hint, sizeof(*argp))) + return -EFAULT; + return 0; +} - inode_lock(inode); - inode->i_write_hint = hint; - inode_unlock(inode); - return 0; - default: +static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + u64 __user *argp = (u64 __user *)arg; + u64 hint; + + if (copy_from_user(&hint, argp, sizeof(hint))) + return -EFAULT; + if (!rw_hint_valid(hint)) return -EINVAL; - } + + WRITE_ONCE(inode->i_write_hint, hint); + + /* + * file->f_mapping->host may differ from inode. As an example, + * blkdev_open() modifies file->f_mapping. 
+ */ + if (file->f_mapping->host != inode) + WRITE_ONCE(file->f_mapping->host->i_write_hint, hint); + + return 0; } static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, @@ -416,8 +430,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: + err = fcntl_get_rw_hint(filp, cmd, arg); + break; case F_SET_RW_HINT: - err = fcntl_rw_hint(filp, cmd, arg); + err = fcntl_set_rw_hint(filp, cmd, arg); break; default: break; @@ -846,12 +862,6 @@ int send_sigurg(struct fown_struct *fown) static DEFINE_SPINLOCK(fasync_lock); static struct kmem_cache *fasync_cache __ro_after_init; -static void fasync_free_rcu(struct rcu_head *head) -{ - kmem_cache_free(fasync_cache, - container_of(head, struct fasync_struct, fa_rcu)); -} - /* * Remove a fasync entry. If successfully removed, return * positive and clear the FASYNC flag. If no entry exists, @@ -877,7 +887,7 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) write_unlock_irq(&fa->fa_lock); *fp = fa->fa_next; - call_rcu(&fa->fa_rcu, fasync_free_rcu); + kfree_rcu(fa, fa_rcu); filp->f_flags &= ~FASYNC; result = 1; break; diff --git a/fs/fhandle.c b/fs/fhandle.c index 18b3ba8dc8ea..57a12614addf 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path, if (f_handle.handle_bytes > MAX_HANDLE_SZ) return -EINVAL; - handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes, GFP_KERNEL); if (!handle) return -ENOMEM; diff --git a/fs/file_table.c b/fs/file_table.c index b991f90571b4..6925522faa0a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -276,21 +276,15 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) } /** - * alloc_file - allocate and initialize a 'struct file' + * file_init_path - initialize a 'struct file' based on path * + * @file: the file to 
set up * @path: the (dentry, vfsmount) pair for the new file - * @flags: O_... flags with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ -static struct file *alloc_file(const struct path *path, int flags, - const struct file_operations *fop) +static void file_init_path(struct file *file, const struct path *path, + const struct file_operations *fop) { - struct file *file; - - file = alloc_empty_file(flags, current_cred()); - if (IS_ERR(file)) - return file; - file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; @@ -309,22 +303,51 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); +} + +/** + * alloc_file - allocate and initialize a 'struct file' + * + * @path: the (dentry, vfsmount) pair for the new file + * @flags: O_... flags with which the new file will be opened + * @fop: the 'struct file_operations' for the new file + */ +static struct file *alloc_file(const struct path *path, int flags, + const struct file_operations *fop) +{ + struct file *file; + + file = alloc_empty_file(flags, current_cred()); + if (!IS_ERR(file)) + file_init_path(file, path, fop); return file; } -struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, - const char *name, int flags, - const struct file_operations *fops) +static inline int alloc_path_pseudo(const char *name, struct inode *inode, + struct vfsmount *mnt, struct path *path) { struct qstr this = QSTR_INIT(name, strlen(name)); + + path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this); + if (!path->dentry) + return -ENOMEM; + path->mnt = mntget(mnt); + d_instantiate(path->dentry, inode); + return 0; +} + +struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, + const char *name, int flags, + const struct file_operations *fops) +{ + int ret; struct path path; 
struct file *file; - path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); - if (!path.dentry) - return ERR_PTR(-ENOMEM); - path.mnt = mntget(mnt); - d_instantiate(path.dentry, inode); + ret = alloc_path_pseudo(name, inode, mnt, &path); + if (ret) + return ERR_PTR(ret); + file = alloc_file(&path, flags, fops); if (IS_ERR(file)) { ihold(inode); @@ -334,6 +357,30 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, } EXPORT_SYMBOL(alloc_file_pseudo); +struct file *alloc_file_pseudo_noaccount(struct inode *inode, + struct vfsmount *mnt, const char *name, + int flags, + const struct file_operations *fops) +{ + int ret; + struct path path; + struct file *file; + + ret = alloc_path_pseudo(name, inode, mnt, &path); + if (ret) + return ERR_PTR(ret); + + file = alloc_empty_file_noaccount(flags, current_cred()); + if (IS_ERR(file)) { + ihold(inode); + path_put(&path); + return file; + } + file_init_path(file, &path, fops); + return file; +} +EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount); + struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1767493dffda..e4f17c53ddfc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -141,6 +141,31 @@ static void wb_wakeup(struct bdi_writeback *wb) spin_unlock_irq(&wb->work_lock); } +/* + * This function is used when the first inode for this wb is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. Since the write-out would + * starts only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later. + * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save few context switches + * by delaying the wake-up. 
+ * + * We have to be careful not to postpone flush work if it is scheduled for + * earlier. Thus we use queue_delayed_work(). + */ +static void wb_wakeup_delayed(struct bdi_writeback *wb) +{ + unsigned long timeout; + + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + spin_lock_irq(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->dwork, timeout); + spin_unlock_irq(&wb->work_lock); +} + static void finish_writeback_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { @@ -1675,11 +1700,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) inode->i_state |= I_DIRTY_PAGES; - else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) { + else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) { if (!(inode->i_state & I_DIRTY_PAGES)) { - inode->i_state &= ~I_PINNING_FSCACHE_WB; - wbc->unpinned_fscache_wb = true; - dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */ + inode->i_state &= ~I_PINNING_NETFS_WB; + wbc->unpinned_netfs_wb = true; + dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */ } } @@ -1691,7 +1716,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (ret == 0) ret = err; } - wbc->unpinned_fscache_wb = false; + wbc->unpinned_netfs_wb = false; trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } diff --git a/fs/fs_parser.c b/fs/fs_parser.c index edb3712dcfa5..a4d6ca0b8971 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -83,8 +83,8 @@ static const struct fs_parameter_spec *fs_lookup_key( } /* - * fs_parse - Parse a filesystem configuration parameter - * @fc: The filesystem context to log errors through. + * __fs_parse - Parse a filesystem configuration parameter + * @log: The filesystem context to log errors through. * @desc: The parameter description to use. * @param: The parameter. 
* @result: Where to place the result of the parse diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig deleted file mode 100644 index b313a978ae0a..000000000000 --- a/fs/fscache/Kconfig +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config FSCACHE - tristate "General filesystem local caching manager" - select NETFS_SUPPORT - help - This option enables a generic filesystem caching manager that can be - used by various network and other filesystems to cache data locally. - Different sorts of caches can be plugged in, depending on the - resources available. - - See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_STATS - bool "Gather statistical information on local caching" - depends on FSCACHE && PROC_FS - select NETFS_STATS - help - This option causes statistical information to be gathered on local - caching and exported through file: - - /proc/fs/fscache/stats - - The gathering of statistics adds a certain amount of overhead to - execution as there are a quite a few stats gathered, and on a - multi-CPU system these may be on cachelines that keep bouncing - between CPUs. On the other hand, the stats are very useful for - debugging purposes. Saying 'Y' here is recommended. - - See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_DEBUG - bool "Debug FS-Cache" - depends on FSCACHE - help - This permits debugging to be dynamically enabled in the local caching - management module. If this is set, the debugging output may be - enabled by setting bits in /sys/modules/fscache/parameter/debug. - - See Documentation/filesystems/caching/fscache.rst for more information. 
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile deleted file mode 100644 index afb090ea16c4..000000000000 --- a/fs/fscache/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for general filesystem caching code -# - -fscache-y := \ - cache.o \ - cookie.o \ - io.o \ - main.o \ - volume.o - -fscache-$(CONFIG_PROC_FS) += proc.o -fscache-$(CONFIG_FSCACHE_STATS) += stats.o - -obj-$(CONFIG_FSCACHE) := fscache.o diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h deleted file mode 100644 index 1336f517e9b1..000000000000 --- a/fs/fscache/internal.h +++ /dev/null @@ -1,277 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* Internal definitions for FS-Cache - * - * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) "FS-Cache: " fmt - -#include <linux/slab.h> -#include <linux/fscache-cache.h> -#include <trace/events/fscache.h> -#include <linux/sched.h> -#include <linux/seq_file.h> - -/* - * cache.c - */ -#ifdef CONFIG_PROC_FS -extern const struct seq_operations fscache_caches_seq_ops; -#endif -bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); -void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); -struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache); -void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where); - -static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache) -{ - return smp_load_acquire(&cache->state); -} - -static inline bool fscache_cache_is_live(const struct fscache_cache *cache) -{ - return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE; -} - -static inline void fscache_set_cache_state(struct fscache_cache *cache, - enum fscache_cache_state new_state) -{ - smp_store_release(&cache->state, new_state); - -} - 
-static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache, - enum fscache_cache_state old_state, - enum fscache_cache_state new_state) -{ - return try_cmpxchg_release(&cache->state, &old_state, new_state); -} - -/* - * cookie.c - */ -extern struct kmem_cache *fscache_cookie_jar; -#ifdef CONFIG_PROC_FS -extern const struct seq_operations fscache_cookies_seq_ops; -#endif -extern struct timer_list fscache_cookie_lru_timer; - -extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix); -extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie, - enum fscache_access_trace why); - -static inline void fscache_see_cookie(struct fscache_cookie *cookie, - enum fscache_cookie_trace where) -{ - trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref), - where); -} - -/* - * main.c - */ -extern unsigned fscache_debug; - -extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len); - -/* - * proc.c - */ -#ifdef CONFIG_PROC_FS -extern int __init fscache_proc_init(void); -extern void fscache_proc_cleanup(void); -#else -#define fscache_proc_init() (0) -#define fscache_proc_cleanup() do {} while (0) -#endif - -/* - * stats.c - */ -#ifdef CONFIG_FSCACHE_STATS -extern atomic_t fscache_n_volumes; -extern atomic_t fscache_n_volumes_collision; -extern atomic_t fscache_n_volumes_nomem; -extern atomic_t fscache_n_cookies; -extern atomic_t fscache_n_cookies_lru; -extern atomic_t fscache_n_cookies_lru_expired; -extern atomic_t fscache_n_cookies_lru_removed; -extern atomic_t fscache_n_cookies_lru_dropped; - -extern atomic_t fscache_n_acquires; -extern atomic_t fscache_n_acquires_ok; -extern atomic_t fscache_n_acquires_oom; - -extern atomic_t fscache_n_invalidates; - -extern atomic_t fscache_n_relinquishes; -extern atomic_t fscache_n_relinquishes_retire; -extern atomic_t fscache_n_relinquishes_dropped; - -extern atomic_t fscache_n_resizes; -extern atomic_t fscache_n_resizes_null; - -static inline void 
fscache_stat(atomic_t *stat) -{ - atomic_inc(stat); -} - -static inline void fscache_stat_d(atomic_t *stat) -{ - atomic_dec(stat); -} - -#define __fscache_stat(stat) (stat) - -int fscache_stats_show(struct seq_file *m, void *v); -#else - -#define __fscache_stat(stat) (NULL) -#define fscache_stat(stat) do {} while (0) -#define fscache_stat_d(stat) do {} while (0) -#endif - -/* - * volume.c - */ -#ifdef CONFIG_PROC_FS -extern const struct seq_operations fscache_volumes_seq_ops; -#endif - -struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, - enum fscache_volume_trace where); -void fscache_put_volume(struct fscache_volume *volume, - enum fscache_volume_trace where); -bool fscache_begin_volume_access(struct fscache_volume *volume, - struct fscache_cookie *cookie, - enum fscache_access_trace why); -void fscache_create_volume(struct fscache_volume *volume, bool wait); - - -/*****************************************************************************/ -/* - * debug tracing - */ -#define dbgprintk(FMT, ...) \ - printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) - -#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) -#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) -#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) - -#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__) - -#ifdef __KDEBUG -#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) -#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) -#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) - -#elif defined(CONFIG_FSCACHE_DEBUG) -#define _enter(FMT, ...) \ -do { \ - if (__do_kdebug(ENTER)) \ - kenter(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _leave(FMT, ...) \ -do { \ - if (__do_kdebug(LEAVE)) \ - kleave(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _debug(FMT, ...) \ -do { \ - if (__do_kdebug(DEBUG)) \ - kdebug(FMT, ##__VA_ARGS__); \ -} while (0) - -#else -#define _enter(FMT, ...) 
no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) -#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) -#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) -#endif - -/* - * determine whether a particular optional debugging point should be logged - * - we need to go through three steps to persuade cpp to correctly join the - * shorthand in FSCACHE_DEBUG_LEVEL with its prefix - */ -#define ____do_kdebug(LEVEL, POINT) \ - unlikely((fscache_debug & \ - (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3)))) -#define ___do_kdebug(LEVEL, POINT) \ - ____do_kdebug(LEVEL, POINT) -#define __do_kdebug(POINT) \ - ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT) - -#define FSCACHE_DEBUG_CACHE 0 -#define FSCACHE_DEBUG_COOKIE 1 -#define FSCACHE_DEBUG_OBJECT 2 -#define FSCACHE_DEBUG_OPERATION 3 - -#define FSCACHE_POINT_ENTER 1 -#define FSCACHE_POINT_LEAVE 2 -#define FSCACHE_POINT_DEBUG 4 - -#ifndef FSCACHE_DEBUG_LEVEL -#define FSCACHE_DEBUG_LEVEL CACHE -#endif - -/* - * assertions - */ -#if 1 /* defined(__KDEBUGALL) */ - -#define ASSERT(X) \ -do { \ - if (unlikely(!(X))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - BUG(); \ - } \ -} while (0) - -#define ASSERTCMP(X, OP, Y) \ -do { \ - if (unlikely(!((X) OP (Y)))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - pr_err("%lx " #OP " %lx is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ - BUG(); \ - } \ -} while (0) - -#define ASSERTIF(C, X) \ -do { \ - if (unlikely((C) && !(X))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - BUG(); \ - } \ -} while (0) - -#define ASSERTIFCMP(C, X, OP, Y) \ -do { \ - if (unlikely((C) && !((X) OP (Y)))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - pr_err("%lx " #OP " %lx is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ - BUG(); \ - } \ -} while (0) - -#else - -#define ASSERT(X) do {} while (0) -#define ASSERTCMP(X, OP, Y) do {} while (0) -#define ASSERTIF(C, X) do {} while (0) -#define ASSERTIFCMP(C, X, OP, Y) 
do {} while (0) - -#endif /* assert or not */ diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 91e89e68177e..b6cad106c37e 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -474,8 +474,7 @@ err: static void cuse_fc_release(struct fuse_conn *fc) { - struct cuse_conn *cc = fc_to_cc(fc); - kfree_rcu(cc, fc.rcu); + kfree(fc_to_cc(fc)); } /** diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 148a71b8b4d0..c007b0f0c3a7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2509,14 +2509,14 @@ static int convert_fuse_file_lock(struct fuse_conn *fc, * translate it into the caller's pid namespace. */ rcu_read_lock(); - fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); + fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); rcu_read_unlock(); break; default: return -EIO; } - fl->fl_type = ffl->type; + fl->c.flc_type = ffl->type; return 0; } @@ -2530,10 +2530,10 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file, memset(inarg, 0, sizeof(*inarg)); inarg->fh = ff->fh; - inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner); + inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner); inarg->lk.start = fl->fl_start; inarg->lk.end = fl->fl_end; - inarg->lk.type = fl->fl_type; + inarg->lk.type = fl->c.flc_type; inarg->lk.pid = pid; if (flock) inarg->lk_flags |= FUSE_LK_FLOCK; @@ -2570,8 +2570,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; - int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; - struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL; + int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; + struct pid *pid = fl->c.flc_type != F_UNLCK ? 
task_tgid(current) : NULL; pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); int err; @@ -2581,7 +2581,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) } /* Unlock on close is handled by the flush method */ - if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) + if ((fl->c.flc_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) return 0; fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1df83eebda92..bcbe34488862 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -888,6 +888,7 @@ struct fuse_mount { /* Entry on fc->mounts */ struct list_head fc_entry; + struct rcu_head rcu; }; static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2a6d44f91729..516ea2979a90 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -930,6 +930,14 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, } EXPORT_SYMBOL_GPL(fuse_conn_init); +static void delayed_release(struct rcu_head *p) +{ + struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu); + + put_user_ns(fc->user_ns); + fc->release(fc); +} + void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { @@ -941,13 +949,12 @@ void fuse_conn_put(struct fuse_conn *fc) if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); - put_user_ns(fc->user_ns); bucket = rcu_dereference_protected(fc->curr_bucket, 1); if (bucket) { WARN_ON(atomic_read(&bucket->count) != 1); kfree(bucket); } - fc->release(fc); + call_rcu(&fc->rcu, delayed_release); } } EXPORT_SYMBOL_GPL(fuse_conn_put); @@ -1366,7 +1373,7 @@ EXPORT_SYMBOL_GPL(fuse_send_init); void fuse_free_conn(struct fuse_conn *fc) { WARN_ON(!list_empty(&fc->devices)); - kfree_rcu(fc, rcu); + kfree(fc); } EXPORT_SYMBOL_GPL(fuse_free_conn); @@ -1902,7 +1909,7 @@ static void fuse_sb_destroy(struct super_block *sb) void fuse_mount_destroy(struct fuse_mount *fm) { 
fuse_conn_put(fm->fc); - kfree(fm); + kfree_rcu(fm, rcu); } EXPORT_SYMBOL(fuse_mount_destroy); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index d9ccfd27e4f1..789af5c8fade 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2465,7 +2465,7 @@ out: } static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset) + loff_t offset, unsigned int len) { int ret; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 177f1f41f225..2e215e8c3c88 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -32,25 +32,21 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) { - struct dentry *parent = NULL; + struct dentry *parent; struct gfs2_sbd *sdp; struct gfs2_inode *dip; - struct inode *dinode, *inode; + struct inode *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; int error, valid = 0; int had_lock = 0; - if (flags & LOOKUP_RCU) { - dinode = d_inode_rcu(READ_ONCE(dentry->d_parent)); - if (!dinode) - return -ECHILD; - } else { - parent = dget_parent(dentry); - dinode = d_inode(parent); - } - sdp = GFS2_SB(dinode); - dip = GFS2_I(dinode); + if (flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + sdp = GFS2_SB(d_inode(parent)); + dip = GFS2_I(d_inode(parent)); inode = d_inode(dentry); if (inode) { @@ -66,8 +62,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); if (!had_lock) { - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, - flags & LOOKUP_RCU ? 
GL_NOBLOCK : 0, &d_gh); + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) goto out; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 992ca4effb50..4c42ada60ae7 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1440,10 +1440,10 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); struct lm_lockstruct *ls = &sdp->sd_lockstruct; - if (!(fl->fl_flags & FL_POSIX)) + if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; if (gfs2_withdrawing_or_withdrawn(sdp)) { - if (fl->fl_type == F_UNLCK) + if (lock_is_unlock(fl)) locks_lock_file_wait(file, fl); return -EIO; } @@ -1451,7 +1451,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); else if (IS_GETLK(cmd)) return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); - else if (fl->fl_type == F_UNLCK) + else if (lock_is_unlock(fl)) return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); else return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); @@ -1483,7 +1483,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) int error = 0; int sleeptime; - state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; + state = lock_is_write(fl) ? 
LM_ST_EXCLUSIVE : LM_ST_SHARED; flags = GL_EXACT | GL_NOPID; if (!IS_SETLKW(cmd)) flags |= LM_FLAG_TRY_1CB; @@ -1495,8 +1495,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) if (fl_gh->gh_state == state) goto out; locks_init_lock(&request); - request.fl_type = F_UNLCK; - request.fl_flags = FL_FLOCK; + request.c.flc_type = F_UNLCK; + request.c.flc_flags = FL_FLOCK; locks_lock_file_wait(file, &request); gfs2_glock_dq(fl_gh); gfs2_holder_reinit(state, flags, fl_gh); @@ -1557,10 +1557,10 @@ static void do_unflock(struct file *file, struct file_lock *fl) static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) { - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; - if (fl->fl_type == F_UNLCK) { + if (lock_is_unlock(fl)) { do_unflock(file, fl); return 0; } else { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6bfc9383b7b8..1b95db2c3aac 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1882,10 +1882,10 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, WARN_ON_ONCE(!may_not_block); return -ECHILD; } - if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - int noblock = may_not_block ? 
GL_NOBLOCK : 0; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, - LM_FLAG_ANY | noblock, &i_gh); + if (gfs2_glock_is_locked_by_me(gl) == NULL) { + if (may_not_block) + return -ECHILD; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1281e60be639..572d58e86296 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -214,7 +214,7 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf) memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); - memcpy(&s->s_uuid, str->sb_uuid, 16); + super_set_uuid(s, str->sb_uuid, 16); } /** diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 7ededcb720c1..012a3d003fbe 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -190,6 +190,7 @@ struct hfsplus_sb_info { int work_queued; /* non-zero delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ spinlock_t work_lock; /* protects sync_work and work_queued */ + struct rcu_head rcu; }; #define HFSPLUS_SB_WRITEBACKUP 0 diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 1986b4f18a90..97920202790f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -277,6 +277,14 @@ void hfsplus_mark_mdb_dirty(struct super_block *sb) spin_unlock(&sbi->work_lock); } +static void delayed_free(struct rcu_head *p) +{ + struct hfsplus_sb_info *sbi = container_of(p, struct hfsplus_sb_info, rcu); + + unload_nls(sbi->nls); + kfree(sbi); +} + static void hfsplus_put_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); @@ -302,9 +310,7 @@ static void hfsplus_put_super(struct super_block *sb) hfs_btree_close(sbi->ext_tree); kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); - unload_nls(sbi->nls); - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; + call_rcu(&sbi->rcu, delayed_free); } static int hfsplus_statfs(struct 
dentry *dentry, struct kstatfs *buf) diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index b0cb70400996..ce9346099c72 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -30,7 +30,7 @@ struct hfsplus_wd { * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes * @buf: buffer for I/O * @data: output pointer for location of requested data - * @opf: request op flags + * @opf: I/O operation type and flags * * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ea5b8e57d904..6502c7e776d1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -100,6 +100,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); + vm_flags_t vm_flags; /* * vma address alignment (but not the pgoff alignment) has @@ -141,10 +142,20 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); ret = -ENOMEM; + + vm_flags = vma->vm_flags; + /* + * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip + * reserving here. Note: only for SHM hugetlbfs file, the inode + * flag S_PRIVATE is set. 
+ */ + if (inode->i_flags & S_PRIVATE) + vm_flags |= VM_NORESERVE; + if (!hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, - vma->vm_flags)) + vm_flags)) goto out; ret = 0; @@ -340,7 +351,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } else { folio_unlock(folio); - if (!folio_test_has_hwpoisoned(folio)) + if (!folio_test_hwpoison(folio)) want = nr; else { /* @@ -922,7 +933,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap, unsigned int ia_valid = attr->ia_valid; struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); - error = setattr_prepare(&nop_mnt_idmap, dentry, attr); + error = setattr_prepare(idmap, dentry, attr); if (error) return error; @@ -939,7 +950,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap, hugetlb_vmtruncate(inode, newsize); } - setattr_copy(&nop_mnt_idmap, inode, attr); + setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -974,6 +985,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; static struct inode *hugetlbfs_get_inode(struct super_block *sb, + struct mnt_idmap *idmap, struct inode *dir, umode_t mode, dev_t dev) { @@ -995,7 +1007,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode->i_ino = get_next_ino(); - inode_init_owner(&nop_mnt_idmap, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; @@ -1039,7 +1051,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, { struct inode *inode; - inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); + inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); @@ -1051,7 +1063,7 @@ 
static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, + int retval = hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); @@ -1062,7 +1074,7 @@ static int hugetlbfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); + return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, @@ -1071,7 +1083,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, { struct inode *inode; - inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); + inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); @@ -1083,10 +1095,11 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { + const umode_t mode = S_IFLNK|S_IRWXUGO; struct inode *inode; int error = -ENOSPC; - inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); + inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0); if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); @@ -1354,6 +1367,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par { struct hugetlbfs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; + struct hstate *h; char *rest; unsigned long ps; int opt; @@ -1398,11 +1412,12 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_pagesize: ps = memparse(param->string, &rest); - ctx->hstate = size_to_hstate(ps); - if (!ctx->hstate) { + h = size_to_hstate(ps); + if (!h) { pr_err("Unsupported page size %lu MB\n", ps / 
SZ_1M); return -EINVAL; } + ctx->hstate = h; return 0; case Opt_min_size: @@ -1553,6 +1568,7 @@ static struct file_system_type hugetlbfs_fs_type = { .init_fs_context = hugetlbfs_init_fs_context, .parameters = hugetlb_fs_parameters, .kill_sb = kill_litter_super, + .fs_flags = FS_ALLOW_IDMAP, }; static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -1606,7 +1622,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size, } file = ERR_PTR(-ENOSPC); - inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0); + /* hugetlbfs_vfsmount[] mounts do not use idmapped mounts. */ + inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL, + S_IFREG | S_IRWXUGO, 0); if (!inode) goto out; if (creat_flags == HUGETLB_SHMFS_INODE) diff --git a/fs/inode.c b/fs/inode.c index 91048c4c9c9e..d290f007b3d1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -20,6 +20,7 @@ #include <linux/ratelimit.h> #include <linux/list_lru.h> #include <linux/iversion.h> +#include <linux/rw_hint.h> #include <trace/events/writeback.h> #include "internal.h" @@ -588,7 +589,8 @@ void dump_mapping(const struct address_space *mapping) } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); - if (get_kernel_nofault(dentry, dentry_ptr)) { + if (get_kernel_nofault(dentry, dentry_ptr) || + !dentry.d_parent || !dentry.d_name.name) { pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", a_ops, ino, dentry_ptr); return; @@ -2285,7 +2287,7 @@ void __init inode_init(void) sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ @@ -2509,7 +2511,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode) { struct timespec64 now = current_time(inode); - inode_set_ctime(inode, now.tv_sec, now.tv_nsec); + inode_set_ctime_to_ts(inode, now); return now; } EXPORT_SYMBOL(inode_set_ctime_current); diff --git a/fs/internal.h b/fs/internal.h index 
b67406435fc0..49c1fcfee4b3 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -183,6 +183,7 @@ extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); +long do_ftruncate(struct file *file, loff_t length, int small); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, @@ -310,3 +311,10 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); +struct stashed_operations { + void (*put_data)(void *data); + void (*init_inode)(struct inode *inode, void *data); +}; +int path_from_stashed(struct dentry **stashed, unsigned long ino, + struct vfsmount *mnt, void *data, struct path *path); +void stashed_dentry_prune(struct dentry *dentry); diff --git a/fs/ioctl.c b/fs/ioctl.c index 76cf22ac97d7..1d5abfdf0f22 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -763,6 +763,33 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) return err; } +static int ioctl_getfsuuid(struct file *file, void __user *argp) +{ + struct super_block *sb = file_inode(file)->i_sb; + struct fsuuid2 u = { .len = sb->s_uuid_len, }; + + if (!sb->s_uuid_len) + return -ENOIOCTLCMD; + + memcpy(&u.uuid[0], &sb->s_uuid, sb->s_uuid_len); + + return copy_to_user(argp, &u, sizeof(u)) ? 
-EFAULT : 0; +} + +static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp) +{ + struct super_block *sb = file_inode(file)->i_sb; + + if (!strlen(sb->s_sysfs_name)) + return -ENOIOCTLCMD; + + struct fs_sysfs_path u = {}; + + u.len = scnprintf(u.name, sizeof(u.name), "%s/%s", sb->s_type->name, sb->s_sysfs_name); + + return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; +} + /* * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. * It's just a simple helper for sys_ioctl and compat_sys_ioctl. @@ -845,6 +872,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, case FS_IOC_FSSETXATTR: return ioctl_fssetxattr(filp, argp); + case FS_IOC_GETFSUUID: + return ioctl_getfsuuid(filp, argp); + + case FS_IOC_GETFSSYSFSPATH: + return ioctl_get_fs_sysfs_path(filp, argp); + default: if (S_ISREG(inode->i_mode)) return file_ioctl(filp, cmd, argp); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 093c4515b22a..4e8e41c8b3c0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (C) 2016-2019 Christoph Hellwig. + * Copyright (C) 2016-2023 Christoph Hellwig. 
*/ #include <linux/module.h> #include <linux/compiler.h> @@ -95,6 +95,44 @@ static inline bool ifs_block_is_dirty(struct folio *folio, return test_bit(block + blks_per_folio, ifs->state); } +static unsigned ifs_find_dirty_range(struct folio *folio, + struct iomap_folio_state *ifs, u64 *range_start, u64 range_end) +{ + struct inode *inode = folio->mapping->host; + unsigned start_blk = + offset_in_folio(folio, *range_start) >> inode->i_blkbits; + unsigned end_blk = min_not_zero( + offset_in_folio(folio, range_end) >> inode->i_blkbits, + i_blocks_per_folio(inode, folio)); + unsigned nblks = 1; + + while (!ifs_block_is_dirty(folio, ifs, start_blk)) + if (++start_blk == end_blk) + return 0; + + while (start_blk + nblks < end_blk) { + if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks)) + break; + nblks++; + } + + *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); + return nblks << inode->i_blkbits; +} + +static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start, + u64 range_end) +{ + struct iomap_folio_state *ifs = folio->private; + + if (*range_start >= range_end) + return 0; + + if (ifs) + return ifs_find_dirty_range(folio, ifs, range_start, range_end); + return range_end - *range_start; +} + static void ifs_clear_range_dirty(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { @@ -1454,15 +1492,10 @@ out_unlock: EXPORT_SYMBOL_GPL(iomap_page_mkwrite); static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, - size_t len, int error) + size_t len) { struct iomap_folio_state *ifs = folio->private; - if (error) { - folio_set_error(folio); - mapping_set_error(inode->i_mapping, error); - } - WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); @@ -1479,40 +1512,29 @@ static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) { struct inode *inode = ioend->io_inode; - struct bio *bio = &ioend->io_inline_bio; - 
struct bio *last = ioend->io_bio, *next; - u64 start = bio->bi_iter.bi_sector; - loff_t offset = ioend->io_offset; - bool quiet = bio_flagged(bio, BIO_QUIET); + struct bio *bio = &ioend->io_bio; + struct folio_iter fi; u32 folio_count = 0; - for (bio = &ioend->io_inline_bio; bio; bio = next) { - struct folio_iter fi; - - /* - * For the last bio, bi_private points to the ioend, so we - * need to explicitly end the iteration here. - */ - if (bio == last) - next = NULL; - else - next = bio->bi_private; - - /* walk all folios in bio, ending page IO on them */ - bio_for_each_folio_all(fi, bio) { - iomap_finish_folio_write(inode, fi.folio, fi.length, - error); - folio_count++; + if (error) { + mapping_set_error(inode->i_mapping, error); + if (!bio_flagged(bio, BIO_QUIET)) { + pr_err_ratelimited( +"%s: writeback error on inode %lu, offset %lld, sector %llu", + inode->i_sb->s_id, inode->i_ino, + ioend->io_offset, ioend->io_sector); } - bio_put(bio); } - /* The ioend has been freed by bio_put() */ - if (unlikely(error && !quiet)) { - printk_ratelimited(KERN_ERR -"%s: writeback error on inode %lu, offset %lld, sector %llu", - inode->i_sb->s_id, inode->i_ino, offset, start); + /* walk all folios in bio, ending page IO on them */ + bio_for_each_folio_all(fi, bio) { + if (error) + folio_set_error(fi.folio); + iomap_finish_folio_write(inode, fi.folio, fi.length); + folio_count++; } + + bio_put(bio); /* frees the ioend */ return folio_count; } @@ -1553,7 +1575,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends); static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) { - if (ioend->io_bio->bi_status != next->io_bio->bi_status) + if (ioend->io_bio.bi_status != next->io_bio.bi_status) return false; if ((ioend->io_flags & IOMAP_F_SHARED) ^ (next->io_flags & IOMAP_F_SHARED)) @@ -1618,47 +1640,46 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends); static void iomap_writepage_end_bio(struct bio *bio) { - struct iomap_ioend *ioend = bio->bi_private; - - 
iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status)); + iomap_finish_ioend(iomap_ioend_from_bio(bio), + blk_status_to_errno(bio->bi_status)); } /* * Submit the final bio for an ioend. * * If @error is non-zero, it means that we have a situation where some part of - * the submission process has failed after we've marked pages for writeback - * and unlocked them. In this situation, we need to fail the bio instead of - * submitting it. This typically only happens on a filesystem shutdown. + * the submission process has failed after we've marked pages for writeback. + * We cannot cancel ioend directly in that case, so call the bio end I/O handler + * with the error status here to run the normal I/O completion handler to clear + * the writeback bit and let the file system process the errors. */ -static int -iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, - int error) +static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) { - ioend->io_bio->bi_private = ioend; - ioend->io_bio->bi_end_io = iomap_writepage_end_bio; + if (!wpc->ioend) + return error; + /* + * Let the file systems prepare the I/O submission and hook in an I/O + * completion handler. This also needs to happen in case after a + * failure happened so that the file system end I/O handler gets called + * to clean up. + */ if (wpc->ops->prepare_ioend) - error = wpc->ops->prepare_ioend(ioend, error); + error = wpc->ops->prepare_ioend(wpc->ioend, error); + if (error) { - /* - * If we're failing the IO now, just mark the ioend with an - * error and finish it. This will run IO completion immediately - * as there is only one reference to the ioend at this point in - * time. 
- */ - ioend->io_bio->bi_status = errno_to_blk_status(error); - bio_endio(ioend->io_bio); - return error; + wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); + bio_endio(&wpc->ioend->io_bio); + } else { + submit_bio(&wpc->ioend->io_bio); } - submit_bio(ioend->io_bio); - return 0; + wpc->ioend = NULL; + return error; } -static struct iomap_ioend * -iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, - loff_t offset, sector_t sector, struct writeback_control *wbc) +static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct inode *inode, loff_t pos) { struct iomap_ioend *ioend; struct bio *bio; @@ -1666,63 +1687,42 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS, &iomap_ioend_bioset); - bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); + bio->bi_end_io = iomap_writepage_end_bio; wbc_init_bio(wbc, bio); + bio->bi_write_hint = inode->i_write_hint; - ioend = container_of(bio, struct iomap_ioend, io_inline_bio); + ioend = iomap_ioend_from_bio(bio); INIT_LIST_HEAD(&ioend->io_list); ioend->io_type = wpc->iomap.type; ioend->io_flags = wpc->iomap.flags; ioend->io_inode = inode; ioend->io_size = 0; - ioend->io_folios = 0; - ioend->io_offset = offset; - ioend->io_bio = bio; - ioend->io_sector = sector; - return ioend; -} - -/* - * Allocate a new bio, and chain the old bio to the new one. - * - * Note that we have to perform the chaining in this unintuitive order - * so that the bi_private linkage is set up in the right direction for the - * traversal in iomap_finish_ioend(). 
- */ -static struct bio * -iomap_chain_bio(struct bio *prev) -{ - struct bio *new; - - new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); - bio_clone_blkg_association(new, prev); - new->bi_iter.bi_sector = bio_end_sector(prev); + ioend->io_offset = pos; + ioend->io_sector = bio->bi_iter.bi_sector; - bio_chain(prev, new); - bio_get(prev); /* for iomap_finish_ioend */ - submit_bio(prev); - return new; + wpc->nr_folios = 0; + return ioend; } -static bool -iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, - sector_t sector) +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) { if ((wpc->iomap.flags & IOMAP_F_SHARED) != (wpc->ioend->io_flags & IOMAP_F_SHARED)) return false; if (wpc->iomap.type != wpc->ioend->io_type) return false; - if (offset != wpc->ioend->io_offset + wpc->ioend->io_size) + if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; - if (sector != bio_end_sector(wpc->ioend->io_bio)) + if (iomap_sector(&wpc->iomap, pos) != + bio_end_sector(&wpc->ioend->io_bio)) return false; /* * Limit ioend bio chain lengths to minimise IO completion latency. This * also prevents long tight loops ending page writeback on all the * folios in the ioend. */ - if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE) + if (wpc->nr_folios >= IOEND_BATCH_SIZE) return false; return true; } @@ -1730,255 +1730,238 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, /* * Test to see if we have an existing ioend structure that we could append to * first; otherwise finish off the current ioend and start another. + * + * If a new ioend is created and cached, the old ioend is submitted to the block + * layer instantly. Batching optimisations are provided by higher level block + * plugging. + * + * At the end of a writeback pass, there will be a cached ioend remaining on the + * writepage context that the caller will need to submit. 
*/ -static void -iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, - struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct list_head *iolist) +static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio, + struct inode *inode, loff_t pos, unsigned len) { - sector_t sector = iomap_sector(&wpc->iomap, pos); - unsigned len = i_blocksize(inode); + struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); + int error; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) { - if (wpc->ioend) - list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc); + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { +new_ioend: + error = iomap_submit_ioend(wpc, 0); + if (error) + return error; + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); } - if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { - wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); - bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); - } + if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) + goto new_ioend; if (ifs) atomic_add(len, &ifs->write_bytes_pending); wpc->ioend->io_size += len; wbc_account_cgroup_owner(wbc, &folio->page, len); + return 0; } -/* - * We implement an immediate ioend submission policy here to avoid needing to - * chain multiple ioends and hence nest mempool allocations which can violate - * the forward progress guarantees we need to provide. The current ioend we're - * adding blocks to is cached in the writepage context, and if the new block - * doesn't append to the cached ioend, it will create a new ioend and cache that - * instead. - * - * If a new ioend is created and cached, the old ioend is returned and queued - * locally for submission once the entire page is processed or an error has been - * detected. 
While ioends are submitted immediately after they are completed, - * batching optimisations are provided by higher level block plugging. - * - * At the end of a writeback pass, there will be a cached ioend remaining on the - * writepage context that the caller will need to submit. - */ -static int -iomap_writepage_map(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, - struct folio *folio, u64 end_pos) +static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio, + struct inode *inode, u64 pos, unsigned dirty_len, + unsigned *count) { - struct iomap_folio_state *ifs = folio->private; - struct iomap_ioend *ioend, *next; - unsigned len = i_blocksize(inode); - unsigned nblocks = i_blocks_per_folio(inode, folio); - u64 pos = folio_pos(folio); - int error = 0, count = 0, i; - LIST_HEAD(submit_list); - - WARN_ON_ONCE(end_pos <= pos); - - if (!ifs && nblocks > 1) { - ifs = ifs_alloc(inode, folio, 0); - iomap_set_range_dirty(folio, 0, end_pos - pos); - } + int error; - WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0); - - /* - * Walk through the folio to find areas to write back. If we - * run off the end of the current map or find the current map - * invalid, grab a new one. 
- */ - for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { - if (ifs && !ifs_block_is_dirty(folio, ifs, i)) - continue; + do { + unsigned map_len; - error = wpc->ops->map_blocks(wpc, inode, pos); + error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len); if (error) break; - trace_iomap_writepage_map(inode, &wpc->iomap); - if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) - continue; - if (wpc->iomap.type == IOMAP_HOLE) - continue; - iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc, - &submit_list); - count++; - } - if (count) - wpc->ioend->io_folios++; + trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap); - WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); - WARN_ON_ONCE(!folio_test_locked(folio)); - WARN_ON_ONCE(folio_test_writeback(folio)); - WARN_ON_ONCE(folio_test_dirty(folio)); + map_len = min_t(u64, dirty_len, + wpc->iomap.offset + wpc->iomap.length - pos); + WARN_ON_ONCE(!folio->private && map_len < dirty_len); + + switch (wpc->iomap.type) { + case IOMAP_INLINE: + WARN_ON_ONCE(1); + error = -EIO; + break; + case IOMAP_HOLE: + break; + default: + error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, + map_len); + if (!error) + (*count)++; + break; + } + dirty_len -= map_len; + pos += map_len; + } while (dirty_len && !error); /* * We cannot cancel the ioend directly here on error. We may have * already set other pages under writeback and hence we have to run I/O * completion to mark the error state of the pages under writeback * appropriately. + * + * Just let the file system know what portion of the folio failed to + * map. */ - if (unlikely(error)) { - /* - * Let the filesystem know what portion of the current page - * failed to map. If the page hasn't been added to ioend, it - * won't be affected by I/O completion and we must unlock it - * now. 
- */ - if (wpc->ops->discard_folio) - wpc->ops->discard_folio(folio, pos); - if (!count) { - folio_unlock(folio); - goto done; - } - } - - /* - * We can have dirty bits set past end of file in page_mkwrite path - * while mapping the last partial folio. Hence it's better to clear - * all the dirty bits in the folio here. - */ - iomap_clear_range_dirty(folio, 0, folio_size(folio)); - folio_start_writeback(folio); - folio_unlock(folio); - - /* - * Preserve the original error if there was one; catch - * submission errors here and propagate into subsequent ioend - * submissions. - */ - list_for_each_entry_safe(ioend, next, &submit_list, io_list) { - int error2; - - list_del_init(&ioend->io_list); - error2 = iomap_submit_ioend(wpc, ioend, error); - if (error2 && !error) - error = error2; - } - - /* - * We can end up here with no error and nothing to write only if we race - * with a partial page truncate on a sub-page block sized filesystem. - */ - if (!count) - folio_end_writeback(folio); -done: - mapping_set_error(inode->i_mapping, error); + if (error && wpc->ops->discard_folio) + wpc->ops->discard_folio(folio, pos); return error; } /* - * Write out a dirty page. + * Check interaction of the folio with the file end. * - * For delalloc space on the page, we need to allocate space and flush it. - * For unwritten space on the page, we need to start the conversion to - * regular allocated space. + * If the folio is entirely beyond i_size, return false. If it straddles + * i_size, adjust end_pos and zero all data beyond i_size. 
*/ -static int iomap_do_writepage(struct folio *folio, - struct writeback_control *wbc, void *data) +static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, + u64 *end_pos) { - struct iomap_writepage_ctx *wpc = data; - struct inode *inode = folio->mapping->host; - u64 end_pos, isize; - - trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio)); + u64 isize = i_size_read(inode); - /* - * Refuse to write the folio out if we're called from reclaim context. - * - * This avoids stack overflows when called from deeply used stacks in - * random callers for direct reclaim or memcg reclaim. We explicitly - * allow reclaim from kswapd as the stack usage there is relatively low. - * - * This should never happen except in the case of a VM regression so - * warn about it. - */ - if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == - PF_MEMALLOC)) - goto redirty; - - /* - * Is this folio beyond the end of the file? - * - * The folio index is less than the end_index, adjust the end_pos - * to the highest offset that this folio should represent. - * ----------------------------------------------------- - * | file mapping | <EOF> | - * ----------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | | - * ^--------------------------------^----------|-------- - * | desired writeback range | see else | - * ---------------------------------^------------------| - */ - isize = i_size_read(inode); - end_pos = folio_pos(folio) + folio_size(folio); - if (end_pos > isize) { - /* - * Check whether the page to write out is beyond or straddles - * i_size or not. - * ------------------------------------------------------- - * | file mapping | <EOF> | - * ------------------------------------------------------- - * | Page ... 
| Page N-2 | Page N-1 | Page N | Beyond | - * ^--------------------------------^-----------|--------- - * | | Straddles | - * ---------------------------------^-----------|--------| - */ + if (*end_pos > isize) { size_t poff = offset_in_folio(folio, isize); pgoff_t end_index = isize >> PAGE_SHIFT; /* - * Skip the page if it's fully outside i_size, e.g. - * due to a truncate operation that's in progress. We've - * cleaned this page and truncate will finish things off for - * us. + * If the folio is entirely ouside of i_size, skip it. + * + * This can happen due to a truncate operation that is in + * progress and in that case truncate will finish it off once + * we've dropped the folio lock. * - * Note that the end_index is unsigned long. If the given - * offset is greater than 16TB on a 32-bit system then if we - * checked if the page is fully outside i_size with - * "if (page->index >= end_index + 1)", "end_index + 1" would - * overflow and evaluate to 0. Hence this page would be + * Note that the pgoff_t used for end_index is an unsigned long. + * If the given offset is greater than 16TB on a 32-bit system, + * then if we checked if the folio is fully outside i_size with + * "if (folio->index >= end_index + 1)", "end_index + 1" would + * overflow and evaluate to 0. Hence this folio would be * redirtied and written out repeatedly, which would result in * an infinite loop; the user program performing this operation * would hang. Instead, we can detect this situation by - * checking if the page is totally beyond i_size or if its + * checking if the folio is totally beyond i_size or if its * offset is just equal to the EOF. */ if (folio->index > end_index || (folio->index == end_index && poff == 0)) - goto unlock; + return false; /* - * The page straddles i_size. It must be zeroed out on each - * and every writepage invocation because it may be mmapped. - * "A file is mapped in multiples of the page size. 
For a file - * that is not a multiple of the page size, the remaining - * memory is zeroed when mapped, and writes to that region are - * not written out to the file." + * The folio straddles i_size. + * + * It must be zeroed out on each and every writepage invocation + * because it may be mmapped: + * + * A file is mapped in multiples of the page size. For a + * file that is not a multiple of the page size, the + * remaining memory is zeroed when mapped, and writes to that + * region are not written out to the file. + * + * Also adjust the writeback range to skip all blocks entirely + * beyond i_size. */ folio_zero_segment(folio, poff, folio_size(folio)); - end_pos = isize; + *end_pos = round_up(isize, i_blocksize(inode)); + } + + return true; +} + +static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio) +{ + struct iomap_folio_state *ifs = folio->private; + struct inode *inode = folio->mapping->host; + u64 pos = folio_pos(folio); + u64 end_pos = pos + folio_size(folio); + unsigned count = 0; + int error = 0; + u32 rlen; + + WARN_ON_ONCE(!folio_test_locked(folio)); + WARN_ON_ONCE(folio_test_dirty(folio)); + WARN_ON_ONCE(folio_test_writeback(folio)); + + trace_iomap_writepage(inode, pos, folio_size(folio)); + + if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) { + folio_unlock(folio); + return 0; + } + WARN_ON_ONCE(end_pos <= pos); + + if (i_blocks_per_folio(inode, folio) > 1) { + if (!ifs) { + ifs = ifs_alloc(inode, folio, 0); + iomap_set_range_dirty(folio, 0, end_pos - pos); + } + + /* + * Keep the I/O completion handler from clearing the writeback + * bit until we have submitted all blocks by adding a bias to + * ifs->write_bytes_pending, which is dropped after submitting + * all blocks. 
+ */ + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); + atomic_inc(&ifs->write_bytes_pending); } - return iomap_writepage_map(wpc, wbc, inode, folio, end_pos); + /* + * Set the writeback bit ASAP, as the I/O completion for the single + * block per folio case happen hit as soon as we're submitting the bio. + */ + folio_start_writeback(folio); -redirty: - folio_redirty_for_writepage(wbc, folio); -unlock: + /* + * Walk through the folio to find dirty areas to write back. + */ + while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) { + error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, + pos, rlen, &count); + if (error) + break; + pos += rlen; + } + + if (count) + wpc->nr_folios++; + + /* + * We can have dirty bits set past end of file in page_mkwrite path + * while mapping the last partial folio. Hence it's better to clear + * all the dirty bits in the folio here. + */ + iomap_clear_range_dirty(folio, 0, folio_size(folio)); + + /* + * Usually the writeback bit is cleared by the I/O completion handler. + * But we may end up either not actually writing any blocks, or (when + * there are multiple blocks in a folio) all I/O might have finished + * already at this point. In that case we need to clear the writeback + * bit ourselves right after unlocking the page. + */ folio_unlock(folio); - return 0; + if (ifs) { + if (atomic_dec_and_test(&ifs->write_bytes_pending)) + folio_end_writeback(folio); + } else { + if (!count) + folio_end_writeback(folio); + } + mapping_set_error(inode->i_mapping, error); + return error; +} + +static int iomap_do_writepage(struct folio *folio, + struct writeback_control *wbc, void *data) +{ + return iomap_writepage_map(data, wbc, folio); } int @@ -1988,18 +1971,24 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, { int ret; + /* + * Writeback from reclaim context should never happen except in the case + * of a VM regression so warn about it and refuse to write the data. 
+ */ + if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) == + PF_MEMALLOC)) + return -EIO; + wpc->ops = ops; ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc); - if (!wpc->ioend) - return ret; - return iomap_submit_ioend(wpc, wpc->ioend, ret); + return iomap_submit_ioend(wpc, ret); } EXPORT_SYMBOL_GPL(iomap_writepages); static int __init iomap_init(void) { return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct iomap_ioend, io_inline_bio), + offsetof(struct iomap_ioend, io_bio), BIOSET_NEED_BVECS); } fs_initcall(iomap_init); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index bcd3f8cf5ea4..f3b43d223a46 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -380,6 +380,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); + bio->bi_write_hint = inode->i_write_hint; bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index c16fd55f5595..0a991c4ce87d 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -154,7 +154,48 @@ DEFINE_EVENT(iomap_class, name, \ TP_ARGS(inode, iomap)) DEFINE_IOMAP_EVENT(iomap_iter_dstmap); DEFINE_IOMAP_EVENT(iomap_iter_srcmap); -DEFINE_IOMAP_EVENT(iomap_writepage_map); + +TRACE_EVENT(iomap_writepage_map, + TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len, + struct iomap *iomap), + TP_ARGS(inode, pos, dirty_len, iomap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, ino) + __field(u64, pos) + __field(u64, dirty_len) + __field(u64, addr) + __field(loff_t, offset) + __field(u64, length) + __field(u16, type) + __field(u16, flags) + __field(dev_t, bdev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->dirty_len = dirty_len; + 
__entry->addr = iomap->addr; + __entry->offset = iomap->offset; + __entry->length = iomap->length; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; + ), + TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx " + "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + MAJOR(__entry->bdev), MINOR(__entry->bdev), + __entry->pos, + __entry->dirty_len, + __entry->addr, + __entry->offset, + __entry->length, + __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), + __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) +); TRACE_EVENT(iomap_iter, TP_PROTO(struct iomap_iter *iter, const void *ops, @@ -165,6 +206,7 @@ TRACE_EVENT(iomap_iter, __field(u64, ino) __field(loff_t, pos) __field(u64, length) + __field(s64, processed) __field(unsigned int, flags) __field(const void *, ops) __field(unsigned long, caller) @@ -174,15 +216,17 @@ TRACE_EVENT(iomap_iter, __entry->ino = iter->inode->i_ino; __entry->pos = iter->pos; __entry->length = iomap_length(iter); + __entry->processed = iter->processed; __entry->flags = iter->flags; __entry->ops = ops; __entry->caller = caller; ), - TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS", + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, + __entry->processed, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 8eec84c651bf..cb3cda1390ad 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -2763,9 +2763,7 @@ static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl) * leafno - the number of the leaf to be updated. * newval - the new value for the leaf. 
* - * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * RETURN VALUES: none */ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) { @@ -2792,10 +2790,6 @@ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) * get the buddy size (number of words covered) of * the new value. */ - - if ((newval - tp->dmt_budmin) > BUDMIN) - return -EIO; - budsz = BUDSIZE(newval, tp->dmt_budmin); /* try to join. diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index cb6d1fda66a7..73389c68e251 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1058,7 +1058,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync) int lmLogOpen(struct super_block *sb) { int rc; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct jfs_log *log; struct jfs_sb_info *sbi = JFS_SBI(sb); @@ -1070,7 +1070,7 @@ int lmLogOpen(struct super_block *sb) mutex_lock(&jfs_log_mutex); list_for_each_entry(log, &jfs_external_logs, journal_list) { - if (log->bdev_handle->bdev->bd_dev == sbi->logdev) { + if (file_bdev(log->bdev_file)->bd_dev == sbi->logdev) { if (!uuid_equal(&log->uuid, &sbi->loguuid)) { jfs_warn("wrong uuid on JFS journal"); mutex_unlock(&jfs_log_mutex); @@ -1100,14 +1100,14 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev_handle = bdev_open_by_dev(sbi->logdev, + bdev_file = bdev_file_open_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL); - if (IS_ERR(bdev_handle)) { - rc = PTR_ERR(bdev_handle); + if (IS_ERR(bdev_file)) { + rc = PTR_ERR(bdev_file); goto free; } - log->bdev_handle = bdev_handle; + log->bdev_file = bdev_file; uuid_copy(&log->uuid, &sbi->loguuid); /* @@ -1141,7 +1141,7 @@ journal_found: lbmLogShutdown(log); close: /* close external log device */ - bdev_release(bdev_handle); + fput(bdev_file); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb) 
init_waitqueue_head(&log->syncwait); set_bit(log_INLINELOG, &log->flag); - log->bdev_handle = sb->s_bdev_handle; + log->bdev_file = sb->s_bdev_file; log->base = addressPXD(&JFS_SBI(sb)->logpxd); log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> (L2LOGPSIZE - sb->s_blocksize_bits); @@ -1436,7 +1436,7 @@ int lmLogClose(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int rc = 0; jfs_info("lmLogClose: log:0x%p", log); @@ -1482,10 +1482,10 @@ int lmLogClose(struct super_block *sb) * external log as separate logical volume */ list_del(&log->journal_list); - bdev_handle = log->bdev_handle; + bdev_file = log->bdev_file; rc = lmLogShutdown(log); - bdev_release(bdev_handle); + fput(bdev_file); kfree(log); @@ -1972,7 +1972,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bp->l_flag |= lbmREAD; - bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_READ, GFP_NOFS); + bio = bio_alloc(file_bdev(log->bdev_file), 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); @@ -2115,7 +2115,7 @@ static void lbmStartIO(struct lbuf * bp) jfs_info("lbmStartIO"); if (!log->no_integrity) - bdev = log->bdev_handle->bdev; + bdev = file_bdev(log->bdev_file); bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h index 84aa2d253907..8b8994e48cd0 100644 --- a/fs/jfs/jfs_logmgr.h +++ b/fs/jfs/jfs_logmgr.h @@ -356,7 +356,7 @@ struct jfs_log { * before writing syncpt. 
*/ struct list_head journal_list; /* Global list */ - struct bdev_handle *bdev_handle; /* 4: log lv pointer */ + struct file *bdev_file; /* 4: log lv pointer */ int serial; /* 4: log mount serial number */ s64 base; /* @8: log extent address (inline log ) */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 9b5c6a20b30c..98f9a432c336 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -431,7 +431,7 @@ int updateSuper(struct super_block *sb, uint state) if (state == FM_MOUNT) { /* record log's dev_t and mount serial number */ j_sb->s_logdev = cpu_to_le32( - new_encode_dev(sbi->log->bdev_handle->bdev->bd_dev)); + new_encode_dev(file_bdev(sbi->log->bdev_file)->bd_dev)); j_sb->s_logserial = cpu_to_le32(sbi->log->serial); } else if (state == FM_CLEAN) { /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 8d8e556bd610..73f09a762b79 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -932,7 +932,7 @@ static int __init init_jfs_fs(void) jfs_inode_cachep = kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, + 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, offsetof(struct jfs_inode_info, i_inline_all), sizeof_field(struct jfs_inode_info, i_inline_all), init_once); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 8b2bd65d70e7..bce1d7ac95ca 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -54,9 +54,9 @@ static bool kernfs_lockdep(struct kernfs_node *kn) static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) { if (!kn) - return strlcpy(buf, "(null)", buflen); + return strscpy(buf, "(null)", buflen); - return strlcpy(buf, kn->parent ? kn->name : "/", buflen); + return strscpy(buf, kn->parent ? kn->name : "/", buflen); } /* kernfs_node_depth - compute depth from @from to @to */ @@ -127,7 +127,7 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, * * [3] when @kn_to is %NULL result will be "(null)" * - * Return: the length of the full path. 
If the full length is equal to or + * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ @@ -138,16 +138,17 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn, *common; const char parent_str[] = "/.."; size_t depth_from, depth_to, len = 0; + ssize_t copied; int i, j; if (!kn_to) - return strlcpy(buf, "(null)", buflen); + return strscpy(buf, "(null)", buflen); if (!kn_from) kn_from = kernfs_root(kn_to)->kn; if (kn_from == kn_to) - return strlcpy(buf, "/", buflen); + return strscpy(buf, "/", buflen); common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) @@ -158,18 +159,19 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, buf[0] = '\0'; - for (i = 0; i < depth_from; i++) - len += strlcpy(buf + len, parent_str, - len < buflen ? buflen - len : 0); + for (i = 0; i < depth_from; i++) { + copied = strscpy(buf + len, parent_str, buflen - len); + if (copied < 0) + return copied; + len += copied; + } /* Calculate how many bytes we need for the rest */ for (i = depth_to - 1; i >= 0; i--) { for (kn = kn_to, j = 0; j < i; j++) kn = kn->parent; - len += strlcpy(buf + len, "/", - len < buflen ? buflen - len : 0); - len += strlcpy(buf + len, kn->name, - len < buflen ? buflen - len : 0); + + len += scnprintf(buf + len, buflen - len, "/%s", kn->name); } return len; @@ -182,12 +184,12 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, * @buflen: size of @buf * * Copies the name of @kn into @buf of @buflen bytes. The behavior is - * similar to strlcpy(). + * similar to strscpy(). * * Fills buffer with "(null)" if @kn is %NULL. * - * Return: the length of @kn's name and if @buf isn't long enough, - * it's filled up to @buflen-1 and nul terminated. + * Return: the resulting length of @buf. 
If @buf isn't long enough, + * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG. * * This function can be called from any context. */ @@ -214,7 +216,7 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) * path (which includes '..'s) as needed to reach from @from to @to is * returned. * - * Return: the length of the full path. If the full length is equal to or + * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ @@ -265,12 +267,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn) sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); if (sz < 0) { - pr_cont("(error)"); - goto out; - } - - if (sz >= sizeof(kernfs_pr_cont_buf)) { - pr_cont("(name too long)"); + if (sz == -E2BIG) + pr_cont("(name too long)"); + else + pr_cont("(error)"); goto out; } @@ -676,6 +676,18 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, { struct kernfs_node *kn; + if (parent->mode & S_ISGID) { + /* this code block imitates inode_init_owner() for + * kernfs + */ + + if (parent->iattr) + gid = parent->iattr->ia_gid; + + if (flags & KERNFS_DIR) + mode |= S_ISGID; + } + kn = __kernfs_new_node(kernfs_root(parent), parent, name, mode, uid, gid, flags); if (kn) { @@ -850,16 +862,16 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, const unsigned char *path, const void *ns) { - size_t len; + ssize_t len; char *p, *name; lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); spin_lock_irq(&kernfs_pr_cont_lock); - len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); + len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); - if (len >= sizeof(kernfs_pr_cont_buf)) { + if (len < 0) { spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index f0cb729e9a97..ffa4565c275a 100644 
--- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -447,7 +447,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) * warnings and we don't want to add spurious locking dependency * between the two. Check whether mmap is actually implemented * without grabbing @of->mutex by testing HAS_MMAP flag. See the - * comment in kernfs_file_open() for more details. + * comment in kernfs_fop_open() for more details. */ if (!(of->kn->flags & KERNFS_HAS_MMAP)) return -ENODEV; diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 4628edde2e7e..e29f4edf9572 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -125,9 +125,6 @@ static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb, inode = kernfs_get_inode(sb, kn); kernfs_put(kn); - if (!inode) - return ERR_PTR(-ESTALE); - return d_obtain_alias(inode); } @@ -361,7 +358,9 @@ int kernfs_get_tree(struct fs_context *fc) } sb->s_flags |= SB_ACTIVE; - uuid_gen(&sb->s_uuid); + uuid_t uuid; + uuid_gen(&uuid); + super_set_uuid(sb, uuid.b, sizeof(uuid)); down_write(&root->kernfs_supers_rwsem); list_add(&info->node, &info->root->supers); diff --git a/fs/libfs.c b/fs/libfs.c index eec6031b0155..0d14ae808fcf 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -23,6 +23,7 @@ #include <linux/fsnotify.h> #include <linux/unicode.h> #include <linux/fscrypt.h> +#include <linux/pidfs.h> #include <linux/uaccess.h> @@ -240,17 +241,22 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); -static void offset_set(struct dentry *dentry, u32 offset) +/* 0 is '.', 1 is '..', so always start with offset 2 or more */ +enum { + DIR_OFFSET_MIN = 2, +}; + +static void offset_set(struct dentry *dentry, long offset) { - dentry->d_fsdata = (void *)((uintptr_t)(offset)); + dentry->d_fsdata = (void *)offset; } -static u32 dentry2offset(struct dentry *dentry) +static long dentry2offset(struct dentry *dentry) { - return (u32)((uintptr_t)(dentry->d_fsdata)); + return 
(long)dentry->d_fsdata; } -static struct lock_class_key simple_offset_xa_lock; +static struct lock_class_key simple_offset_lock_class; /** * simple_offset_init - initialize an offset_ctx @@ -259,11 +265,9 @@ static struct lock_class_key simple_offset_xa_lock; */ void simple_offset_init(struct offset_ctx *octx) { - xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); - lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock); - - /* 0 is '.', 1 is '..', so always start with offset 2 */ - octx->next_offset = 2; + mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE); + lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class); + octx->next_offset = DIR_OFFSET_MIN; } /** @@ -271,20 +275,19 @@ void simple_offset_init(struct offset_ctx *octx) * @octx: directory offset ctx to be updated * @dentry: new dentry being added * - * Returns zero on success. @so_ctx and the dentry offset are updated. + * Returns zero on success. @octx and the dentry's offset are updated. * Otherwise, a negative errno value is returned. */ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) { - static const struct xa_limit limit = XA_LIMIT(2, U32_MAX); - u32 offset; + unsigned long offset; int ret; if (dentry2offset(dentry) != 0) return -EBUSY; - ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit, - &octx->next_offset, GFP_KERNEL); + ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, + LONG_MAX, &octx->next_offset, GFP_KERNEL); if (ret < 0) return ret; @@ -300,17 +303,49 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) */ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) { - u32 offset; + long offset; offset = dentry2offset(dentry); if (offset == 0) return; - xa_erase(&octx->xa, offset); + mtree_erase(&octx->mt, offset); offset_set(dentry, 0); } /** + * simple_offset_empty - Check if a dentry can be unlinked + * @dentry: dentry to be tested + * + * Returns 0 if @dentry is a non-empty directory; otherwise returns 1. 
+ */ +int simple_offset_empty(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct offset_ctx *octx; + struct dentry *child; + unsigned long index; + int ret = 1; + + if (!inode || !S_ISDIR(inode->i_mode)) + return ret; + + index = DIR_OFFSET_MIN; + octx = inode->i_op->get_offset_ctx(inode); + mt_for_each(&octx->mt, child, index, LONG_MAX) { + spin_lock(&child->d_lock); + if (simple_positive(child)) { + spin_unlock(&child->d_lock); + ret = 0; + break; + } + spin_unlock(&child->d_lock); + } + + return ret; +} + +/** * simple_offset_rename_exchange - exchange rename with directory offsets * @old_dir: parent of dentry being moved * @old_dentry: dentry being moved @@ -327,8 +362,8 @@ int simple_offset_rename_exchange(struct inode *old_dir, { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); - u32 old_index = dentry2offset(old_dentry); - u32 new_index = dentry2offset(new_dentry); + long old_index = dentry2offset(old_dentry); + long new_index = dentry2offset(new_dentry); int ret; simple_offset_remove(old_ctx, old_dentry); @@ -354,9 +389,9 @@ int simple_offset_rename_exchange(struct inode *old_dir, out_restore: offset_set(old_dentry, old_index); - xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL); + mtree_store(&old_ctx->mt, old_index, old_dentry, GFP_KERNEL); offset_set(new_dentry, new_index); - xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL); + mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL); return ret; } @@ -369,7 +404,7 @@ out_restore: */ void simple_offset_destroy(struct offset_ctx *octx) { - xa_destroy(&octx->xa); + mtree_destroy(&octx->mt); } /** @@ -399,15 +434,16 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) /* In this case, ->private_data is protected by f_pos_lock */ file->private_data = NULL; - return vfs_setpos(file, offset, U32_MAX); + return vfs_setpos(file, offset, LONG_MAX); } 
-static struct dentry *offset_find_next(struct xa_state *xas) +static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) { + MA_STATE(mas, &octx->mt, offset, offset); struct dentry *child, *found = NULL; rcu_read_lock(); - child = xas_next_entry(xas, U32_MAX); + child = mas_find(&mas, LONG_MAX); if (!child) goto out; spin_lock(&child->d_lock); @@ -421,8 +457,8 @@ out: static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) { - u32 offset = dentry2offset(dentry); struct inode *inode = d_inode(dentry); + long offset = dentry2offset(dentry); return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, inode->i_ino, fs_umode_to_dtype(inode->i_mode)); @@ -430,12 +466,11 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) { - struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); - XA_STATE(xas, &so_ctx->xa, ctx->pos); + struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *dentry; while (true) { - dentry = offset_find_next(&xas); + dentry = offset_find_next(octx, ctx->pos); if (!dentry) return ERR_PTR(-ENOENT); @@ -444,8 +479,8 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) break; } + ctx->pos = dentry2offset(dentry) + 1; dput(dentry); - ctx->pos = xas.xa_index + 1; } return NULL; } @@ -481,7 +516,7 @@ static int offset_readdir(struct file *file, struct dir_context *ctx) return 0; /* In this case, ->private_data is protected by f_pos_lock */ - if (ctx->pos == 2) + if (ctx->pos == DIR_OFFSET_MIN) file->private_data = NULL; else if (file->private_data == ERR_PTR(-ENOENT)) return 0; @@ -1580,7 +1615,7 @@ EXPORT_SYMBOL(alloc_anon_inode); * All arguments are ignored and it just returns -EINVAL. 
*/ int -simple_nosetlease(struct file *filp, int arg, struct file_lock **flp, +simple_nosetlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { return -EINVAL; @@ -1704,16 +1739,28 @@ bool is_empty_dir_inode(struct inode *inode) static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - const struct dentry *parent = READ_ONCE(dentry->d_parent); - const struct inode *dir = READ_ONCE(parent->d_inode); - const struct super_block *sb = dentry->d_sb; - const struct unicode_map *um = sb->s_encoding; - struct qstr qstr = QSTR_INIT(str, len); + const struct dentry *parent; + const struct inode *dir; char strbuf[DNAME_INLINE_LEN]; - int ret; + struct qstr qstr; + + /* + * Attempt a case-sensitive match first. It is cheaper and + * should cover most lookups, including all the sane + * applications that expect a case-sensitive filesystem. + * + * This comparison is safe under RCU because the caller + * guarantees the consistency between str and len. See + * __d_lookup_rcu_op_compare() for details. + */ + if (len == name->len && !memcmp(str, name->name, len)) + return 0; + parent = READ_ONCE(dentry->d_parent); + dir = READ_ONCE(parent->d_inode); if (!dir || !IS_CASEFOLDED(dir)) - goto fallback; + return 1; + /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. 
If this happens, the VFS will eventually retry @@ -1724,20 +1771,14 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, if (len <= DNAME_INLINE_LEN - 1) { memcpy(strbuf, str, len); strbuf[len] = 0; - qstr.name = strbuf; + str = strbuf; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } - ret = utf8_strncasecmp(um, name, &qstr); - if (ret >= 0) - return ret; + qstr.len = len; + qstr.name = str; - if (sb_has_strict_encoding(sb)) - return -EINVAL; -fallback: - if (len != name->len) - return 1; - return !!memcmp(str, name->name, len); + return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } /** @@ -1752,7 +1793,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; const struct unicode_map *um = sb->s_encoding; - int ret = 0; + int ret; if (!dir || !IS_CASEFOLDED(dir)) return 0; @@ -1766,73 +1807,45 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, -}; -#endif - #ifdef CONFIG_FS_ENCRYPTION -static const struct dentry_operations generic_encrypted_dentry_ops = { .d_revalidate = fscrypt_d_revalidate, +#endif }; #endif -#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) -static const struct dentry_operations generic_encrypted_ci_dentry_ops = { - .d_hash = generic_ci_d_hash, - .d_compare = generic_ci_d_compare, +#ifdef CONFIG_FS_ENCRYPTION +static const struct dentry_operations generic_encrypted_dentry_ops = { .d_revalidate = fscrypt_d_revalidate, }; #endif /** - * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry - * @dentry: dentry to set ops on - * - * Casefolded directories need d_hash and d_compare set, so that the dentries - * contained in them are handled case-insensitively. 
Note that these operations - * are needed on the parent directory rather than on the dentries in it, and - * while the casefolding flag can be toggled on and off on an empty directory, - * dentry_operations can't be changed later. As a result, if the filesystem has - * casefolding support enabled at all, we have to give all dentries the - * casefolding operations even if their inode doesn't have the casefolding flag - * currently (and thus the casefolding ops would be no-ops for now). - * - * Encryption works differently in that the only dentry operation it needs is - * d_revalidate, which it only needs on dentries that have the no-key name flag. - * The no-key flag can't be set "later", so we don't have to worry about that. + * generic_set_sb_d_ops - helper for choosing the set of + * filesystem-wide dentry operations for the enabled features + * @sb: superblock to be configured * - * Finally, to maximize compatibility with overlayfs (which isn't compatible - * with certain dentry operations) and to avoid taking an unnecessary - * performance hit, we use custom dentry_operations for each possible - * combination rather than always installing all operations. + * Filesystems supporting casefolding and/or fscrypt can call this + * helper at mount-time to configure sb->s_d_op to best set of dentry + * operations required for the enabled features. The helper must be + * called after these have been configured, but before the root dentry + * is created. 
*/ -void generic_set_encrypted_ci_d_ops(struct dentry *dentry) +void generic_set_sb_d_ops(struct super_block *sb) { -#ifdef CONFIG_FS_ENCRYPTION - bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME; -#endif #if IS_ENABLED(CONFIG_UNICODE) - bool needs_ci_ops = dentry->d_sb->s_encoding; -#endif -#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) - if (needs_encrypt_ops && needs_ci_ops) { - d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops); + if (sb->s_encoding) { + sb->s_d_op = &generic_ci_dentry_ops; return; } #endif #ifdef CONFIG_FS_ENCRYPTION - if (needs_encrypt_ops) { - d_set_d_op(dentry, &generic_encrypted_dentry_ops); - return; - } -#endif -#if IS_ENABLED(CONFIG_UNICODE) - if (needs_ci_ops) { - d_set_d_op(dentry, &generic_ci_dentry_ops); + if (sb->s_cop) { + sb->s_d_op = &generic_encrypted_dentry_ops; return; } #endif } -EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); +EXPORT_SYMBOL(generic_set_sb_d_ops); /** * inode_maybe_inc_iversion - increments i_version @@ -1973,3 +1986,144 @@ struct timespec64 simple_inode_init_ts(struct inode *inode) return ts; } EXPORT_SYMBOL(simple_inode_init_ts); + +static inline struct dentry *get_stashed_dentry(struct dentry *stashed) +{ + struct dentry *dentry; + + guard(rcu)(); + dentry = READ_ONCE(stashed); + if (!dentry) + return NULL; + if (!lockref_get_not_dead(&dentry->d_lockref)) + return NULL; + return dentry; +} + +static struct dentry *prepare_anon_dentry(struct dentry **stashed, + unsigned long ino, + struct super_block *sb, + void *data) +{ + struct dentry *dentry; + struct inode *inode; + const struct stashed_operations *sops = sb->s_fs_info; + + dentry = d_alloc_anon(sb); + if (!dentry) + return ERR_PTR(-ENOMEM); + + inode = new_inode_pseudo(sb); + if (!inode) { + dput(dentry); + return ERR_PTR(-ENOMEM); + } + + inode->i_ino = ino; + inode->i_flags |= S_IMMUTABLE; + inode->i_mode = S_IFREG; + simple_inode_init_ts(inode); + sops->init_inode(inode, data); + + /* Notice when this is changed. 
*/ + WARN_ON_ONCE(!S_ISREG(inode->i_mode)); + WARN_ON_ONCE(!IS_IMMUTABLE(inode)); + + /* Store address of location where dentry's supposed to be stashed. */ + dentry->d_fsdata = stashed; + + /* @data is now owned by the fs */ + d_instantiate(dentry, inode); + return dentry; +} + +static struct dentry *stash_dentry(struct dentry **stashed, + struct dentry *dentry) +{ + guard(rcu)(); + for (;;) { + struct dentry *old; + + /* Assume any old dentry was cleared out. */ + old = cmpxchg(stashed, NULL, dentry); + if (likely(!old)) + return dentry; + + /* Check if somebody else installed a reusable dentry. */ + if (lockref_get_not_dead(&old->d_lockref)) + return old; + + /* There's an old dead dentry there, try to take it over. */ + if (likely(try_cmpxchg(stashed, &old, dentry))) + return dentry; + } +} + +/** + * path_from_stashed - create path from stashed or new dentry + * @stashed: where to retrieve or stash dentry + * @ino: inode number to use + * @mnt: mnt of the filesystems to use + * @data: data to store in inode->i_private + * @path: path to create + * + * The function tries to retrieve a stashed dentry from @stashed. If the dentry + * is still valid then it will be reused. If the dentry isn't able the function + * will allocate a new dentry and inode. It will then check again whether it + * can reuse an existing dentry in case one has been added in the meantime or + * update @stashed with the newly added dentry. + * + * Special-purpose helper for nsfs and pidfs. + * + * Return: On success zero and on failure a negative error is returned. + */ +int path_from_stashed(struct dentry **stashed, unsigned long ino, + struct vfsmount *mnt, void *data, struct path *path) +{ + struct dentry *dentry; + const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; + + /* See if dentry can be reused. */ + path->dentry = get_stashed_dentry(*stashed); + if (path->dentry) { + sops->put_data(data); + goto out_path; + } + + /* Allocate a new dentry. 
*/ + dentry = prepare_anon_dentry(stashed, ino, mnt->mnt_sb, data); + if (IS_ERR(dentry)) { + sops->put_data(data); + return PTR_ERR(dentry); + } + + /* Added a new dentry. @data is now owned by the filesystem. */ + path->dentry = stash_dentry(stashed, dentry); + if (path->dentry != dentry) + dput(dentry); + +out_path: + WARN_ON_ONCE(path->dentry->d_fsdata != stashed); + WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); + path->mnt = mntget(mnt); + return 0; +} + +void stashed_dentry_prune(struct dentry *dentry) +{ + struct dentry **stashed = dentry->d_fsdata; + struct inode *inode = d_inode(dentry); + + if (WARN_ON_ONCE(!stashed)) + return; + + if (!inode) + return; + + /* + * Only replace our own @dentry as someone else might've + * already cleared out @dentry and stashed their own + * dentry in there. + */ + cmpxchg(stashed, dentry, NULL); +} diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 8161667c976f..527458db4525 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -243,7 +243,7 @@ static void encode_nlm4_holder(struct xdr_stream *xdr, u64 l_offset, l_len; __be32 *p; - encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_RDLCK); encode_int32(xdr, lock->svid); encode_netobj(xdr, lock->oh.data, lock->oh.len); @@ -270,7 +270,7 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) goto out_overflow; exclusive = be32_to_cpup(p++); lock->svid = be32_to_cpup(p); - fl->fl_pid = (pid_t)lock->svid; + fl->c.flc_pid = (pid_t)lock->svid; error = decode_netobj(xdr, &lock->oh); if (unlikely(error)) @@ -280,8 +280,8 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) if (unlikely(p == NULL)) goto out_overflow; - fl->fl_flags = FL_POSIX; - fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK; + fl->c.flc_flags = FL_POSIX; + fl->c.flc_type = exclusive != 0 ? 
F_WRLCK : F_RDLCK; p = xdr_decode_hyper(p, &l_offset); xdr_decode_hyper(p, &l_len); nlm4svc_set_file_lock_range(fl, l_offset, l_len); @@ -357,7 +357,7 @@ static void nlm4_xdr_enc_testargs(struct rpc_rqst *req, const struct nlm_lock *lock = &args->lock; encode_cookie(xdr, &args->cookie); - encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK); encode_nlm4_lock(xdr, lock); } @@ -380,7 +380,7 @@ static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req, encode_cookie(xdr, &args->cookie); encode_bool(xdr, args->block); - encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK); encode_nlm4_lock(xdr, lock); encode_bool(xdr, args->reclaim); encode_int32(xdr, args->state); @@ -403,7 +403,7 @@ static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req, encode_cookie(xdr, &args->cookie); encode_bool(xdr, args->block); - encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK); encode_nlm4_lock(xdr, lock); } diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 5d85715be763..a7e0519ec024 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -185,7 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) continue; if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) continue; - if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)), fh) != 0) + if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->c.flc_file)), fh) != 0) continue; /* Alright, we found a lock. 
Set the return status * and wake up the caller diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index fba6c7fa7474..cebcc283b7ce 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -133,7 +133,8 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) char *nodename = req->a_host->h_rpcclnt->cl_nodename; nlmclnt_next_cookie(&argp->cookie); - memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh)); + memcpy(&lock->fh, NFS_FH(file_inode(fl->c.flc_file)), + sizeof(struct nfs_fh)); lock->caller = nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", @@ -142,7 +143,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) lock->svid = fl->fl_u.nfs_fl.owner->pid; lock->fl.fl_start = fl->fl_start; lock->fl.fl_end = fl->fl_end; - lock->fl.fl_type = fl->fl_type; + lock->fl.c.flc_type = fl->c.flc_type; } static void nlmclnt_release_lockargs(struct nlm_rqst *req) @@ -182,7 +183,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *dat call->a_callback_data = data; if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { - if (fl->fl_type != F_UNLCK) { + if (fl->c.flc_type != F_UNLCK) { call->a_args.block = IS_SETLKW(cmd) ? 
1 : 0; status = nlmclnt_lock(call, fl); } else @@ -432,13 +433,14 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) { int status; - status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_TEST); + status = nlmclnt_call(nfs_file_cred(fl->c.flc_file), req, + NLMPROC_TEST); if (status < 0) goto out; switch (req->a_res.status) { case nlm_granted: - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; break; case nlm_lck_denied: /* @@ -446,8 +448,8 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) */ fl->fl_start = req->a_res.lock.fl.fl_start; fl->fl_end = req->a_res.lock.fl.fl_end; - fl->fl_type = req->a_res.lock.fl.fl_type; - fl->fl_pid = -req->a_res.lock.fl.fl_pid; + fl->c.flc_type = req->a_res.lock.fl.c.flc_type; + fl->c.flc_pid = -req->a_res.lock.fl.c.flc_pid; break; default: status = nlm_stat_to_errno(req->a_res.status); @@ -485,14 +487,15 @@ static const struct file_lock_operations nlmclnt_lock_ops = { static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host) { fl->fl_u.nfs_fl.state = 0; - fl->fl_u.nfs_fl.owner = nlmclnt_find_lockowner(host, fl->fl_owner); + fl->fl_u.nfs_fl.owner = nlmclnt_find_lockowner(host, + fl->c.flc_owner); INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list); fl->fl_ops = &nlmclnt_lock_ops; } static int do_vfs_lock(struct file_lock *fl) { - return locks_lock_file_wait(fl->fl_file, fl); + return locks_lock_file_wait(fl->c.flc_file, fl); } /* @@ -518,12 +521,12 @@ static int do_vfs_lock(struct file_lock *fl) static int nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) { - const struct cred *cred = nfs_file_cred(fl->fl_file); + const struct cred *cred = nfs_file_cred(fl->c.flc_file); struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; struct nlm_wait block; - unsigned char fl_flags = fl->fl_flags; - unsigned char fl_type; + unsigned char flags = fl->c.flc_flags; + unsigned char type; __be32 b_status; int status = -ENOLCK; @@ -531,9 +534,9 @@ nlmclnt_lock(struct nlm_rqst *req, 
struct file_lock *fl) goto out; req->a_args.state = nsm_local_state; - fl->fl_flags |= FL_ACCESS; + fl->c.flc_flags |= FL_ACCESS; status = do_vfs_lock(fl); - fl->fl_flags = fl_flags; + fl->c.flc_flags = flags; if (status < 0) goto out; @@ -591,11 +594,11 @@ again: goto again; } /* Ensure the resulting lock will get added to granted list */ - fl->fl_flags |= FL_SLEEP; + fl->c.flc_flags |= FL_SLEEP; if (do_vfs_lock(fl) < 0) printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); up_read(&host->h_rwsem); - fl->fl_flags = fl_flags; + fl->c.flc_flags = flags; status = 0; } if (status < 0) @@ -605,7 +608,7 @@ again: * cases NLM_LCK_DENIED is returned for a permanent error. So * turn it into an ENOLCK. */ - if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP)) + if (resp->status == nlm_lck_denied && (flags & FL_SLEEP)) status = -ENOLCK; else status = nlm_stat_to_errno(resp->status); @@ -622,13 +625,13 @@ out_unlock: req->a_host->h_addrlen, req->a_res.status); dprintk("lockd: lock attempt ended in fatal error.\n" " Attempting to unlock.\n"); - fl_type = fl->fl_type; - fl->fl_type = F_UNLCK; + type = fl->c.flc_type; + fl->c.flc_type = F_UNLCK; down_read(&host->h_rwsem); do_vfs_lock(fl); up_read(&host->h_rwsem); - fl->fl_type = fl_type; - fl->fl_flags = fl_flags; + fl->c.flc_type = type; + fl->c.flc_flags = flags; nlmclnt_async_call(cred, req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); return status; } @@ -651,12 +654,14 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl, nlmclnt_setlockargs(req, fl); req->a_args.reclaim = 1; - status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_LOCK); + status = nlmclnt_call(nfs_file_cred(fl->c.flc_file), req, + NLMPROC_LOCK); if (status >= 0 && req->a_res.status == nlm_granted) return 0; printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d " - "(errno %d, status %d)\n", fl->fl_pid, + "(errno %d, status %d)\n", + fl->c.flc_pid, status, ntohl(req->a_res.status)); /* @@ -683,26 +688,26 
@@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; int status; - unsigned char fl_flags = fl->fl_flags; + unsigned char flags = fl->c.flc_flags; /* * Note: the server is supposed to either grant us the unlock * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either * case, we want to unlock. */ - fl->fl_flags |= FL_EXISTS; + fl->c.flc_flags |= FL_EXISTS; down_read(&host->h_rwsem); status = do_vfs_lock(fl); up_read(&host->h_rwsem); - fl->fl_flags = fl_flags; + fl->c.flc_flags = flags; if (status == -ENOENT) { status = 0; goto out; } refcount_inc(&req->a_count); - status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, - NLMPROC_UNLOCK, &nlmclnt_unlock_ops); + status = nlmclnt_async_call(nfs_file_cred(fl->c.flc_file), req, + NLMPROC_UNLOCK, &nlmclnt_unlock_ops); if (status < 0) goto out; @@ -795,8 +800,8 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl req->a_args.block = block; refcount_inc(&req->a_count); - status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, - NLMPROC_CANCEL, &nlmclnt_cancel_ops); + status = nlmclnt_async_call(nfs_file_cred(fl->c.flc_file), req, + NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status == 0 && req->a_res.status == nlm_lck_denied) status = -ENOLCK; nlmclnt_release_call(req); diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 4df62f635529..a3e97278b997 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -238,7 +238,7 @@ static void encode_nlm_holder(struct xdr_stream *xdr, u32 l_offset, l_len; __be32 *p; - encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_RDLCK); encode_int32(xdr, lock->svid); encode_netobj(xdr, lock->oh.data, lock->oh.len); @@ -265,7 +265,7 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result) goto out_overflow; exclusive = be32_to_cpup(p++); lock->svid = be32_to_cpup(p); - fl->fl_pid = 
(pid_t)lock->svid; + fl->c.flc_pid = (pid_t)lock->svid; error = decode_netobj(xdr, &lock->oh); if (unlikely(error)) @@ -275,8 +275,8 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result) if (unlikely(p == NULL)) goto out_overflow; - fl->fl_flags = FL_POSIX; - fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK; + fl->c.flc_flags = FL_POSIX; + fl->c.flc_type = exclusive != 0 ? F_WRLCK : F_RDLCK; l_offset = be32_to_cpup(p++); l_len = be32_to_cpup(p); end = l_offset + l_len - 1; @@ -357,7 +357,7 @@ static void nlm_xdr_enc_testargs(struct rpc_rqst *req, const struct nlm_lock *lock = &args->lock; encode_cookie(xdr, &args->cookie); - encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK); encode_nlm_lock(xdr, lock); } @@ -380,7 +380,7 @@ static void nlm_xdr_enc_lockargs(struct rpc_rqst *req, encode_cookie(xdr, &args->cookie); encode_bool(xdr, args->block); - encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK); encode_nlm_lock(xdr, lock); encode_bool(xdr, args->reclaim); encode_int32(xdr, args->state); @@ -403,7 +403,7 @@ static void nlm_xdr_enc_cancargs(struct rpc_rqst *req, encode_cookie(xdr, &args->cookie); encode_bool(xdr, args->block); - encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK); encode_nlm_lock(xdr, lock); } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index b72023a6b4c1..8a72c418cdcc 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -52,16 +52,16 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, *filp = file; /* Set up the missing parts of the file_lock structure */ - lock->fl.fl_flags = FL_POSIX; - lock->fl.fl_file = file->f_file[mode]; - lock->fl.fl_pid = current->tgid; + lock->fl.c.flc_flags = FL_POSIX; + lock->fl.c.flc_file = file->f_file[mode]; + lock->fl.c.flc_pid = current->tgid; lock->fl.fl_start = (loff_t)lock->lock_start; lock->fl.fl_end = 
lock->lock_len ? (loff_t)(lock->lock_start + lock->lock_len - 1) : OFFSET_MAX; lock->fl.fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); - if (!lock->fl.fl_owner) { + if (!lock->fl.c.flc_owner) { /* lockowner allocation has failed */ nlmsvc_release_host(host); return nlm_lck_denied_nolocks; @@ -106,7 +106,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; - test_owner = argp->lock.fl.fl_owner; + test_owner = argp->lock.fl.c.flc_owner; /* Now check for conflicting locks */ resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie); if (resp->status == nlm_drop_reply) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 2dc10900ad1c..1f2149db10f2 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -150,16 +150,17 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) struct file_lock *fl; dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n", - file, lock->fl.fl_pid, + file, lock->fl.c.flc_pid, (long long)lock->fl.fl_start, - (long long)lock->fl.fl_end, lock->fl.fl_type); + (long long)lock->fl.fl_end, + lock->fl.c.flc_type); spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { fl = &block->b_call->a_args.lock.fl; dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n", - block->b_file, fl->fl_pid, + block->b_file, fl->c.flc_pid, (long long)fl->fl_start, - (long long)fl->fl_end, fl->fl_type, + (long long)fl->fl_end, fl->c.flc_type, nlmdbg_cookie2a(&block->b_call->a_args.cookie)); if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) { kref_get(&block->b_count); @@ -244,7 +245,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, goto failed_free; /* Set notifier function for VFS, and init args */ - call->a_args.lock.fl.fl_flags |= 
FL_SLEEP; + call->a_args.lock.fl.c.flc_flags |= FL_SLEEP; call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations; nlmclnt_next_cookie(&call->a_args.cookie); @@ -402,14 +403,14 @@ static struct nlm_lockowner *nlmsvc_find_lockowner(struct nlm_host *host, pid_t void nlmsvc_release_lockowner(struct nlm_lock *lock) { - if (lock->fl.fl_owner) - nlmsvc_put_lockowner(lock->fl.fl_owner); + if (lock->fl.c.flc_owner) + nlmsvc_put_lockowner(lock->fl.c.flc_owner); } void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host, pid_t pid) { - fl->fl_owner = nlmsvc_find_lockowner(host, pid); + fl->c.flc_owner = nlmsvc_find_lockowner(host, pid); } /* @@ -425,7 +426,7 @@ static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock) /* set default data area */ call->a_args.lock.oh.data = call->a_owner; - call->a_args.lock.svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid; + call->a_args.lock.svid = ((struct nlm_lockowner *) lock->fl.c.flc_owner)->pid; if (lock->oh.len > NLMCLNT_OHSIZE) { void *data = kmalloc(lock->oh.len, GFP_KERNEL); @@ -489,7 +490,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", inode->i_sb->s_id, inode->i_ino, - lock->fl.fl_type, lock->fl.fl_pid, + lock->fl.c.flc_type, + lock->fl.c.flc_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, wait); @@ -512,7 +514,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; lock = &block->b_call->a_args.lock; } else - lock->fl.fl_flags &= ~FL_SLEEP; + lock->fl.c.flc_flags &= ~FL_SLEEP; if (block->b_flags & B_QUEUED) { dprintk("lockd: nlmsvc_lock deferred block %p flags %d\n", @@ -560,10 +562,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, spin_unlock(&nlm_blocked_lock); if (!wait) - lock->fl.fl_flags &= ~FL_SLEEP; + lock->fl.c.flc_flags &= ~FL_SLEEP; mode = lock_to_openmode(&lock->fl); error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); - 
lock->fl.fl_flags &= ~FL_SLEEP; + lock->fl.c.flc_flags &= ~FL_SLEEP; dprintk("lockd: vfs_lock_file returned %d\n", error); switch (error) { @@ -616,7 +618,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, nlmsvc_file_inode(file)->i_ino, - lock->fl.fl_type, + lock->fl.c.flc_type, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -636,19 +638,19 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - if (lock->fl.fl_type == F_UNLCK) { + if (lock->fl.c.flc_type == F_UNLCK) { ret = nlm_granted; goto out; } dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n", - lock->fl.fl_type, (long long)lock->fl.fl_start, + lock->fl.c.flc_type, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); conflock->caller = "somehost"; /* FIXME */ conflock->len = strlen(conflock->caller); conflock->oh.len = 0; /* don't return OH info */ - conflock->svid = lock->fl.fl_pid; - conflock->fl.fl_type = lock->fl.fl_type; + conflock->svid = lock->fl.c.flc_pid; + conflock->fl.c.flc_type = lock->fl.c.flc_type; conflock->fl.fl_start = lock->fl.fl_start; conflock->fl.fl_end = lock->fl.fl_end; locks_release_private(&lock->fl); @@ -673,21 +675,21 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, nlmsvc_file_inode(file)->i_ino, - lock->fl.fl_pid, + lock->fl.c.flc_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); /* First, cancel any lock that might be there */ nlmsvc_cancel_blocked(net, file, lock); - lock->fl.fl_type = F_UNLCK; - lock->fl.fl_file = file->f_file[O_RDONLY]; - if (lock->fl.fl_file) - error = vfs_lock_file(lock->fl.fl_file, F_SETLK, + lock->fl.c.flc_type = F_UNLCK; + lock->fl.c.flc_file = file->f_file[O_RDONLY]; + if (lock->fl.c.flc_file) + error = vfs_lock_file(lock->fl.c.flc_file, F_SETLK, &lock->fl, 
NULL); - lock->fl.fl_file = file->f_file[O_WRONLY]; - if (lock->fl.fl_file) - error |= vfs_lock_file(lock->fl.fl_file, F_SETLK, + lock->fl.c.flc_file = file->f_file[O_WRONLY]; + if (lock->fl.c.flc_file) + error |= vfs_lock_file(lock->fl.c.flc_file, F_SETLK, &lock->fl, NULL); return (error < 0)? nlm_lck_denied_nolocks : nlm_granted; @@ -710,7 +712,7 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, nlmsvc_file_inode(file)->i_ino, - lock->fl.fl_pid, + lock->fl.c.flc_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -863,12 +865,12 @@ nlmsvc_grant_blocked(struct nlm_block *block) /* vfs_lock_file() can mangle fl_start and fl_end, but we need * them unchanged for the GRANT_MSG */ - lock->fl.fl_flags |= FL_SLEEP; + lock->fl.c.flc_flags |= FL_SLEEP; fl_start = lock->fl.fl_start; fl_end = lock->fl.fl_end; mode = lock_to_openmode(&lock->fl); error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); - lock->fl.fl_flags &= ~FL_SLEEP; + lock->fl.c.flc_flags &= ~FL_SLEEP; lock->fl.fl_start = fl_start; lock->fl.fl_end = fl_end; @@ -993,8 +995,8 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status) /* Client doesn't want it, just unlock it */ nlmsvc_unlink_block(block); fl = &block->b_call->a_args.lock.fl; - fl->fl_type = F_UNLCK; - error = vfs_lock_file(fl->fl_file, F_SETLK, fl, NULL); + fl->c.flc_type = F_UNLCK; + error = vfs_lock_file(fl->c.flc_file, F_SETLK, fl, NULL); if (error) pr_warn("lockd: unable to unlock lock rejected by client!\n"); break; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 32784f508c81..a03220e66ce0 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -77,12 +77,12 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Set up the missing parts of the file_lock structure */ mode = lock_to_openmode(&lock->fl); - lock->fl.fl_flags = FL_POSIX; - 
lock->fl.fl_file = file->f_file[mode]; - lock->fl.fl_pid = current->tgid; + lock->fl.c.flc_flags = FL_POSIX; + lock->fl.c.flc_file = file->f_file[mode]; + lock->fl.c.flc_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); - if (!lock->fl.fl_owner) { + if (!lock->fl.c.flc_owner) { /* lockowner allocation has failed */ nlmsvc_release_host(host); return nlm_lck_denied_nolocks; @@ -127,7 +127,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; - test_owner = argp->lock.fl.fl_owner; + test_owner = argp->lock.fl.c.flc_owner; /* Now check for conflicting locks */ resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie)); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index e3b6229e7ae5..9103896164f6 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -73,7 +73,7 @@ static inline unsigned int file_hash(struct nfs_fh *f) int lock_to_openmode(struct file_lock *lock) { - return (lock->fl_type == F_WRLCK) ? O_WRONLY : O_RDONLY; + return lock_is_write(lock) ? 
O_WRONLY : O_RDONLY; } /* @@ -181,18 +181,18 @@ static int nlm_unlock_files(struct nlm_file *file, const struct file_lock *fl) struct file_lock lock; locks_init_lock(&lock); - lock.fl_type = F_UNLCK; + lock.c.flc_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; - lock.fl_owner = fl->fl_owner; - lock.fl_pid = fl->fl_pid; - lock.fl_flags = FL_POSIX; + lock.c.flc_owner = fl->c.flc_owner; + lock.c.flc_pid = fl->c.flc_pid; + lock.c.flc_flags = FL_POSIX; - lock.fl_file = file->f_file[O_RDONLY]; - if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL)) + lock.c.flc_file = file->f_file[O_RDONLY]; + if (lock.c.flc_file && vfs_lock_file(lock.c.flc_file, F_SETLK, &lock, NULL)) goto out_err; - lock.fl_file = file->f_file[O_WRONLY]; - if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL)) + lock.c.flc_file = file->f_file[O_WRONLY]; + if (lock.c.flc_file && vfs_lock_file(lock.c.flc_file, F_SETLK, &lock, NULL)) goto out_err; return 0; out_err: @@ -218,14 +218,14 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, again: file->f_locks = 0; spin_lock(&flctx->flc_lock); - list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + for_each_file_lock(fl, &flctx->flc_posix) { if (fl->fl_lmops != &nlmsvc_lock_operations) continue; /* update current lock count */ file->f_locks++; - lockhost = ((struct nlm_lockowner *)fl->fl_owner)->host; + lockhost = ((struct nlm_lockowner *) fl->c.flc_owner)->host; if (match(lockhost, host)) { spin_unlock(&flctx->flc_lock); @@ -272,7 +272,7 @@ nlm_file_inuse(struct nlm_file *file) if (flctx && !list_empty_careful(&flctx->flc_posix)) { spin_lock(&flctx->flc_lock); - list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + for_each_file_lock(fl, &flctx->flc_posix) { if (fl->fl_lmops == &nlmsvc_lock_operations) { spin_unlock(&flctx->flc_lock); return 1; diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 2fb5748dae0c..adfcce2bf11b 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -88,8 +88,8 @@ 
svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock) return false; locks_init_lock(fl); - fl->fl_flags = FL_POSIX; - fl->fl_type = F_RDLCK; + fl->c.flc_flags = FL_POSIX; + fl->c.flc_type = F_RDLCK; end = start + len - 1; fl->fl_start = s32_to_loff_t(start); if (len == 0 || end < 0) @@ -107,7 +107,7 @@ svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock) s32 start, len; /* exclusive */ - if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0) + if (xdr_stream_encode_bool(xdr, fl->c.flc_type != F_RDLCK) < 0) return false; if (xdr_stream_encode_u32(xdr, lock->svid) < 0) return false; @@ -164,7 +164,7 @@ nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; if (exclusive) - argp->lock.fl.fl_type = F_WRLCK; + argp->lock.fl.c.flc_type = F_WRLCK; return true; } @@ -184,7 +184,7 @@ nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; if (exclusive) - argp->lock.fl.fl_type = F_WRLCK; + argp->lock.fl.c.flc_type = F_WRLCK; if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0) return false; if (xdr_stream_decode_u32(xdr, &argp->state) < 0) @@ -209,7 +209,7 @@ nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; if (exclusive) - argp->lock.fl.fl_type = F_WRLCK; + argp->lock.fl.c.flc_type = F_WRLCK; return true; } @@ -223,7 +223,7 @@ nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; - argp->lock.fl.fl_type = F_UNLCK; + argp->lock.fl.c.flc_type = F_UNLCK; return true; } diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 5fcbf30cd275..3d28b9c3ed15 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -89,8 +89,8 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock) return false; locks_init_lock(fl); - fl->fl_flags 
= FL_POSIX; - fl->fl_type = F_RDLCK; + fl->c.flc_flags = FL_POSIX; + fl->c.flc_type = F_RDLCK; nlm4svc_set_file_lock_range(fl, lock->lock_start, lock->lock_len); return true; } @@ -102,7 +102,7 @@ svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock) s64 start, len; /* exclusive */ - if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0) + if (xdr_stream_encode_bool(xdr, fl->c.flc_type != F_RDLCK) < 0) return false; if (xdr_stream_encode_u32(xdr, lock->svid) < 0) return false; @@ -159,7 +159,7 @@ nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; if (exclusive) - argp->lock.fl.fl_type = F_WRLCK; + argp->lock.fl.c.flc_type = F_WRLCK; return true; } @@ -179,7 +179,7 @@ nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; if (exclusive) - argp->lock.fl.fl_type = F_WRLCK; + argp->lock.fl.c.flc_type = F_WRLCK; if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0) return false; if (xdr_stream_decode_u32(xdr, &argp->state) < 0) @@ -204,7 +204,7 @@ nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; if (exclusive) - argp->lock.fl.fl_type = F_WRLCK; + argp->lock.fl.c.flc_type = F_WRLCK; return true; } @@ -218,7 +218,7 @@ nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (!svcxdr_decode_lock(xdr, &argp->lock)) return false; - argp->lock.fl.fl_type = F_UNLCK; + argp->lock.fl.c.flc_type = F_UNLCK; return true; } diff --git a/fs/locks.c b/fs/locks.c index cc7c117ee192..90c8746874de 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -48,7 +48,6 @@ * children. 
* */ - #include <linux/capability.h> #include <linux/file.h> #include <linux/fdtable.h> @@ -70,24 +69,28 @@ #include <linux/uaccess.h> -#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) -#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) -#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) -#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) -#define IS_REMOTELCK(fl) (fl->fl_pid <= 0) +static struct file_lock *file_lock(struct file_lock_core *flc) +{ + return container_of(flc, struct file_lock, c); +} + +static struct file_lease *file_lease(struct file_lock_core *flc) +{ + return container_of(flc, struct file_lease, c); +} -static bool lease_breaking(struct file_lock *fl) +static bool lease_breaking(struct file_lease *fl) { - return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); + return fl->c.flc_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); } -static int target_leasetype(struct file_lock *fl) +static int target_leasetype(struct file_lease *fl) { - if (fl->fl_flags & FL_UNLOCK_PENDING) + if (fl->c.flc_flags & FL_UNLOCK_PENDING) return F_UNLCK; - if (fl->fl_flags & FL_DOWNGRADE_PENDING) + if (fl->c.flc_flags & FL_DOWNGRADE_PENDING) return F_RDLCK; - return fl->fl_type; + return fl->c.flc_type; } static int leases_enable = 1; @@ -168,6 +171,7 @@ static DEFINE_SPINLOCK(blocked_lock_lock); static struct kmem_cache *flctx_cache __ro_after_init; static struct kmem_cache *filelock_cache __ro_after_init; +static struct kmem_cache *filelease_cache __ro_after_init; static struct file_lock_context * locks_get_lock_context(struct inode *inode, int type) @@ -204,11 +208,12 @@ out: static void locks_dump_ctx_list(struct list_head *list, char *list_type) { - struct file_lock *fl; + struct file_lock_core *flc; - list_for_each_entry(fl, list, fl_list) { - pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid); - } + list_for_each_entry(flc, list, flc_list) + pr_warn("%s: fl_owner=%p 
fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", + list_type, flc->flc_owner, flc->flc_flags, + flc->flc_type, flc->flc_pid); } static void @@ -229,19 +234,19 @@ locks_check_ctx_lists(struct inode *inode) } static void -locks_check_ctx_file_list(struct file *filp, struct list_head *list, - char *list_type) +locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type) { - struct file_lock *fl; + struct file_lock_core *flc; struct inode *inode = file_inode(filp); - list_for_each_entry(fl, list, fl_list) - if (fl->fl_file == filp) + list_for_each_entry(flc, list, flc_list) + if (flc->flc_file == filp) pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx " " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino, - fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid); + flc->flc_owner, flc->flc_flags, + flc->flc_type, flc->flc_pid); } void @@ -255,13 +260,13 @@ locks_free_lock_context(struct inode *inode) } } -static void locks_init_lock_heads(struct file_lock *fl) +static void locks_init_lock_heads(struct file_lock_core *flc) { - INIT_HLIST_NODE(&fl->fl_link); - INIT_LIST_HEAD(&fl->fl_list); - INIT_LIST_HEAD(&fl->fl_blocked_requests); - INIT_LIST_HEAD(&fl->fl_blocked_member); - init_waitqueue_head(&fl->fl_wait); + INIT_HLIST_NODE(&flc->flc_link); + INIT_LIST_HEAD(&flc->flc_list); + INIT_LIST_HEAD(&flc->flc_blocked_requests); + INIT_LIST_HEAD(&flc->flc_blocked_member); + init_waitqueue_head(&flc->flc_wait); } /* Allocate an empty lock structure. */ @@ -270,19 +275,33 @@ struct file_lock *locks_alloc_lock(void) struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL); if (fl) - locks_init_lock_heads(fl); + locks_init_lock_heads(&fl->c); return fl; } EXPORT_SYMBOL_GPL(locks_alloc_lock); +/* Allocate an empty lock structure. 
*/ +struct file_lease *locks_alloc_lease(void) +{ + struct file_lease *fl = kmem_cache_zalloc(filelease_cache, GFP_KERNEL); + + if (fl) + locks_init_lock_heads(&fl->c); + + return fl; +} +EXPORT_SYMBOL_GPL(locks_alloc_lease); + void locks_release_private(struct file_lock *fl) { - BUG_ON(waitqueue_active(&fl->fl_wait)); - BUG_ON(!list_empty(&fl->fl_list)); - BUG_ON(!list_empty(&fl->fl_blocked_requests)); - BUG_ON(!list_empty(&fl->fl_blocked_member)); - BUG_ON(!hlist_unhashed(&fl->fl_link)); + struct file_lock_core *flc = &fl->c; + + BUG_ON(waitqueue_active(&flc->flc_wait)); + BUG_ON(!list_empty(&flc->flc_list)); + BUG_ON(!list_empty(&flc->flc_blocked_requests)); + BUG_ON(!list_empty(&flc->flc_blocked_member)); + BUG_ON(!hlist_unhashed(&flc->flc_link)); if (fl->fl_ops) { if (fl->fl_ops->fl_release_private) @@ -292,8 +311,8 @@ void locks_release_private(struct file_lock *fl) if (fl->fl_lmops) { if (fl->fl_lmops->lm_put_owner) { - fl->fl_lmops->lm_put_owner(fl->fl_owner); - fl->fl_owner = NULL; + fl->fl_lmops->lm_put_owner(flc->flc_owner); + flc->flc_owner = NULL; } fl->fl_lmops = NULL; } @@ -309,16 +328,15 @@ EXPORT_SYMBOL_GPL(locks_release_private); * %true: @owner has at least one blocker * %false: @owner has no blockers */ -bool locks_owner_has_blockers(struct file_lock_context *flctx, - fl_owner_t owner) +bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner) { - struct file_lock *fl; + struct file_lock_core *flc; spin_lock(&flctx->flc_lock); - list_for_each_entry(fl, &flctx->flc_posix, fl_list) { - if (fl->fl_owner != owner) + list_for_each_entry(flc, &flctx->flc_posix, flc_list) { + if (flc->flc_owner != owner) continue; - if (!list_empty(&fl->fl_blocked_requests)) { + if (!list_empty(&flc->flc_blocked_requests)) { spin_unlock(&flctx->flc_lock); return true; } @@ -336,35 +354,52 @@ void locks_free_lock(struct file_lock *fl) } EXPORT_SYMBOL(locks_free_lock); +/* Free a lease which is not in use. 
*/ +void locks_free_lease(struct file_lease *fl) +{ + kmem_cache_free(filelease_cache, fl); +} +EXPORT_SYMBOL(locks_free_lease); + static void locks_dispose_list(struct list_head *dispose) { - struct file_lock *fl; + struct file_lock_core *flc; while (!list_empty(dispose)) { - fl = list_first_entry(dispose, struct file_lock, fl_list); - list_del_init(&fl->fl_list); - locks_free_lock(fl); + flc = list_first_entry(dispose, struct file_lock_core, flc_list); + list_del_init(&flc->flc_list); + if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) + locks_free_lease(file_lease(flc)); + else + locks_free_lock(file_lock(flc)); } } void locks_init_lock(struct file_lock *fl) { memset(fl, 0, sizeof(struct file_lock)); - locks_init_lock_heads(fl); + locks_init_lock_heads(&fl->c); } EXPORT_SYMBOL(locks_init_lock); +void locks_init_lease(struct file_lease *fl) +{ + memset(fl, 0, sizeof(*fl)); + locks_init_lock_heads(&fl->c); +} +EXPORT_SYMBOL(locks_init_lease); + /* * Initialize a new lock from an existing file_lock structure. 
*/ void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { - new->fl_owner = fl->fl_owner; - new->fl_pid = fl->fl_pid; - new->fl_file = NULL; - new->fl_flags = fl->fl_flags; - new->fl_type = fl->fl_type; + new->c.flc_owner = fl->c.flc_owner; + new->c.flc_pid = fl->c.flc_pid; + new->c.flc_file = NULL; + new->c.flc_flags = fl->c.flc_flags; + new->c.flc_type = fl->c.flc_type; new->fl_start = fl->fl_start; new->fl_end = fl->fl_end; new->fl_lmops = fl->fl_lmops; @@ -372,7 +407,7 @@ void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) if (fl->fl_lmops) { if (fl->fl_lmops->lm_get_owner) - fl->fl_lmops->lm_get_owner(fl->fl_owner); + fl->fl_lmops->lm_get_owner(fl->c.flc_owner); } } EXPORT_SYMBOL(locks_copy_conflock); @@ -384,7 +419,7 @@ void locks_copy_lock(struct file_lock *new, struct file_lock *fl) locks_copy_conflock(new, fl); - new->fl_file = fl->fl_file; + new->c.flc_file = fl->c.flc_file; new->fl_ops = fl->fl_ops; if (fl->fl_ops) { @@ -400,15 +435,17 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl) /* * As ctx->flc_lock is held, new requests cannot be added to - * ->fl_blocked_requests, so we don't need a lock to check if it + * ->flc_blocked_requests, so we don't need a lock to check if it * is empty. 
*/ - if (list_empty(&fl->fl_blocked_requests)) + if (list_empty(&fl->c.flc_blocked_requests)) return; spin_lock(&blocked_lock_lock); - list_splice_init(&fl->fl_blocked_requests, &new->fl_blocked_requests); - list_for_each_entry(f, &new->fl_blocked_requests, fl_blocked_member) - f->fl_blocker = new; + list_splice_init(&fl->c.flc_blocked_requests, + &new->c.flc_blocked_requests); + list_for_each_entry(f, &new->c.flc_blocked_requests, + c.flc_blocked_member) + f->c.flc_blocker = &new->c; spin_unlock(&blocked_lock_lock); } @@ -429,21 +466,21 @@ static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) { locks_init_lock(fl); - fl->fl_file = filp; - fl->fl_owner = filp; - fl->fl_pid = current->tgid; - fl->fl_flags = FL_FLOCK; - fl->fl_type = type; + fl->c.flc_file = filp; + fl->c.flc_owner = filp; + fl->c.flc_pid = current->tgid; + fl->c.flc_flags = FL_FLOCK; + fl->c.flc_type = type; fl->fl_end = OFFSET_MAX; } -static int assign_type(struct file_lock *fl, int type) +static int assign_type(struct file_lock_core *flc, int type) { switch (type) { case F_RDLCK: case F_WRLCK: case F_UNLCK: - fl->fl_type = type; + flc->flc_type = type; break; default: return -EINVAL; @@ -488,14 +525,14 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, } else fl->fl_end = OFFSET_MAX; - fl->fl_owner = current->files; - fl->fl_pid = current->tgid; - fl->fl_file = filp; - fl->fl_flags = FL_POSIX; + fl->c.flc_owner = current->files; + fl->c.flc_pid = current->tgid; + fl->c.flc_file = filp; + fl->c.flc_flags = FL_POSIX; fl->fl_ops = NULL; fl->fl_lmops = NULL; - return assign_type(fl, l->l_type); + return assign_type(&fl->c, l->l_type); } /* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX @@ -516,16 +553,16 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, /* default lease lock manager operations */ static bool -lease_break_callback(struct file_lock *fl) +lease_break_callback(struct file_lease *fl) { 
kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG); return false; } static void -lease_setup(struct file_lock *fl, void **priv) +lease_setup(struct file_lease *fl, void **priv) { - struct file *filp = fl->fl_file; + struct file *filp = fl->c.flc_file; struct fasync_struct *fa = *priv; /* @@ -539,7 +576,7 @@ lease_setup(struct file_lock *fl, void **priv) __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0); } -static const struct lock_manager_operations lease_manager_ops = { +static const struct lease_manager_operations lease_manager_ops = { .lm_break = lease_break_callback, .lm_change = lease_modify, .lm_setup = lease_setup, @@ -548,27 +585,24 @@ static const struct lock_manager_operations lease_manager_ops = { /* * Initialize a lease, use the default lock manager operations */ -static int lease_init(struct file *filp, int type, struct file_lock *fl) +static int lease_init(struct file *filp, int type, struct file_lease *fl) { - if (assign_type(fl, type) != 0) + if (assign_type(&fl->c, type) != 0) return -EINVAL; - fl->fl_owner = filp; - fl->fl_pid = current->tgid; + fl->c.flc_owner = filp; + fl->c.flc_pid = current->tgid; - fl->fl_file = filp; - fl->fl_flags = FL_LEASE; - fl->fl_start = 0; - fl->fl_end = OFFSET_MAX; - fl->fl_ops = NULL; + fl->c.flc_file = filp; + fl->c.flc_flags = FL_LEASE; fl->fl_lmops = &lease_manager_ops; return 0; } /* Allocate a file_lock initialised to this type of lease */ -static struct file_lock *lease_alloc(struct file *filp, int type) +static struct file_lease *lease_alloc(struct file *filp, int type) { - struct file_lock *fl = locks_alloc_lock(); + struct file_lease *fl = locks_alloc_lease(); int error = -ENOMEM; if (fl == NULL) @@ -576,7 +610,7 @@ static struct file_lock *lease_alloc(struct file *filp, int type) error = lease_init(filp, type, fl); if (error) { - locks_free_lock(fl); + locks_free_lease(fl); return ERR_PTR(error); } return fl; @@ -593,26 +627,26 @@ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2) 
/* * Check whether two locks have the same owner. */ -static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) +static int posix_same_owner(struct file_lock_core *fl1, struct file_lock_core *fl2) { - return fl1->fl_owner == fl2->fl_owner; + return fl1->flc_owner == fl2->flc_owner; } /* Must be called with the flc_lock held! */ -static void locks_insert_global_locks(struct file_lock *fl) +static void locks_insert_global_locks(struct file_lock_core *flc) { struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list); percpu_rwsem_assert_held(&file_rwsem); spin_lock(&fll->lock); - fl->fl_link_cpu = smp_processor_id(); - hlist_add_head(&fl->fl_link, &fll->hlist); + flc->flc_link_cpu = smp_processor_id(); + hlist_add_head(&flc->flc_link, &fll->hlist); spin_unlock(&fll->lock); } /* Must be called with the flc_lock held! */ -static void locks_delete_global_locks(struct file_lock *fl) +static void locks_delete_global_locks(struct file_lock_core *flc) { struct file_lock_list_struct *fll; @@ -623,33 +657,33 @@ static void locks_delete_global_locks(struct file_lock *fl) * is done while holding the flc_lock, and new insertions into the list * also require that it be held. 
*/ - if (hlist_unhashed(&fl->fl_link)) + if (hlist_unhashed(&flc->flc_link)) return; - fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu); + fll = per_cpu_ptr(&file_lock_list, flc->flc_link_cpu); spin_lock(&fll->lock); - hlist_del_init(&fl->fl_link); + hlist_del_init(&flc->flc_link); spin_unlock(&fll->lock); } static unsigned long -posix_owner_key(struct file_lock *fl) +posix_owner_key(struct file_lock_core *flc) { - return (unsigned long)fl->fl_owner; + return (unsigned long) flc->flc_owner; } -static void locks_insert_global_blocked(struct file_lock *waiter) +static void locks_insert_global_blocked(struct file_lock_core *waiter) { lockdep_assert_held(&blocked_lock_lock); - hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter)); + hash_add(blocked_hash, &waiter->flc_link, posix_owner_key(waiter)); } -static void locks_delete_global_blocked(struct file_lock *waiter) +static void locks_delete_global_blocked(struct file_lock_core *waiter) { lockdep_assert_held(&blocked_lock_lock); - hash_del(&waiter->fl_link); + hash_del(&waiter->flc_link); } /* Remove waiter from blocker's block list. @@ -657,41 +691,39 @@ static void locks_delete_global_blocked(struct file_lock *waiter) * * Must be called with blocked_lock_lock held. 
*/ -static void __locks_delete_block(struct file_lock *waiter) +static void __locks_unlink_block(struct file_lock_core *waiter) { locks_delete_global_blocked(waiter); - list_del_init(&waiter->fl_blocked_member); + list_del_init(&waiter->flc_blocked_member); } -static void __locks_wake_up_blocks(struct file_lock *blocker) +static void __locks_wake_up_blocks(struct file_lock_core *blocker) { - while (!list_empty(&blocker->fl_blocked_requests)) { - struct file_lock *waiter; + while (!list_empty(&blocker->flc_blocked_requests)) { + struct file_lock_core *waiter; + struct file_lock *fl; - waiter = list_first_entry(&blocker->fl_blocked_requests, - struct file_lock, fl_blocked_member); - __locks_delete_block(waiter); - if (waiter->fl_lmops && waiter->fl_lmops->lm_notify) - waiter->fl_lmops->lm_notify(waiter); + waiter = list_first_entry(&blocker->flc_blocked_requests, + struct file_lock_core, flc_blocked_member); + + fl = file_lock(waiter); + __locks_unlink_block(waiter); + if ((waiter->flc_flags & (FL_POSIX | FL_FLOCK)) && + fl->fl_lmops && fl->fl_lmops->lm_notify) + fl->fl_lmops->lm_notify(fl); else - wake_up(&waiter->fl_wait); + locks_wake_up(fl); /* - * The setting of fl_blocker to NULL marks the "done" + * The setting of flc_blocker to NULL marks the "done" * point in deleting a block. Paired with acquire at the top * of locks_delete_block(). */ - smp_store_release(&waiter->fl_blocker, NULL); + smp_store_release(&waiter->flc_blocker, NULL); } } -/** - * locks_delete_block - stop waiting for a file lock - * @waiter: the lock which was waiting - * - * lockd/nfsd need to disconnect the lock while working on it. - */ -int locks_delete_block(struct file_lock *waiter) +static int __locks_delete_block(struct file_lock_core *waiter) { int status = -ENOENT; @@ -716,24 +748,35 @@ int locks_delete_block(struct file_lock *waiter) * no new locks can be inserted into its fl_blocked_requests list, and * can avoid doing anything further if the list is empty. 
*/ - if (!smp_load_acquire(&waiter->fl_blocker) && - list_empty(&waiter->fl_blocked_requests)) + if (!smp_load_acquire(&waiter->flc_blocker) && + list_empty(&waiter->flc_blocked_requests)) return status; spin_lock(&blocked_lock_lock); - if (waiter->fl_blocker) + if (waiter->flc_blocker) status = 0; __locks_wake_up_blocks(waiter); - __locks_delete_block(waiter); + __locks_unlink_block(waiter); /* * The setting of fl_blocker to NULL marks the "done" point in deleting * a block. Paired with acquire at the top of this function. */ - smp_store_release(&waiter->fl_blocker, NULL); + smp_store_release(&waiter->flc_blocker, NULL); spin_unlock(&blocked_lock_lock); return status; } + +/** + * locks_delete_block - stop waiting for a file lock + * @waiter: the lock which was waiting + * + * lockd/nfsd need to disconnect the lock while working on it. + */ +int locks_delete_block(struct file_lock *waiter) +{ + return __locks_delete_block(&waiter->c); +} EXPORT_SYMBOL(locks_delete_block); /* Insert waiter into blocker's block list. @@ -751,26 +794,28 @@ EXPORT_SYMBOL(locks_delete_block); * waiters, and add beneath any waiter that blocks the new waiter. * Thus wakeups don't happen until needed. 
*/ -static void __locks_insert_block(struct file_lock *blocker, - struct file_lock *waiter, - bool conflict(struct file_lock *, - struct file_lock *)) +static void __locks_insert_block(struct file_lock_core *blocker, + struct file_lock_core *waiter, + bool conflict(struct file_lock_core *, + struct file_lock_core *)) { - struct file_lock *fl; - BUG_ON(!list_empty(&waiter->fl_blocked_member)); + struct file_lock_core *flc; + BUG_ON(!list_empty(&waiter->flc_blocked_member)); new_blocker: - list_for_each_entry(fl, &blocker->fl_blocked_requests, fl_blocked_member) - if (conflict(fl, waiter)) { - blocker = fl; + list_for_each_entry(flc, &blocker->flc_blocked_requests, flc_blocked_member) + if (conflict(flc, waiter)) { + blocker = flc; goto new_blocker; } - waiter->fl_blocker = blocker; - list_add_tail(&waiter->fl_blocked_member, &blocker->fl_blocked_requests); - if (IS_POSIX(blocker) && !IS_OFDLCK(blocker)) + waiter->flc_blocker = blocker; + list_add_tail(&waiter->flc_blocked_member, + &blocker->flc_blocked_requests); + + if ((blocker->flc_flags & (FL_POSIX|FL_OFDLCK)) == FL_POSIX) locks_insert_global_blocked(waiter); - /* The requests in waiter->fl_blocked are known to conflict with + /* The requests in waiter->flc_blocked are known to conflict with * waiter, but might not conflict with blocker, or the requests * and lock which block it. So they all need to be woken. */ @@ -778,10 +823,10 @@ new_blocker: } /* Must be called with flc_lock held. */ -static void locks_insert_block(struct file_lock *blocker, - struct file_lock *waiter, - bool conflict(struct file_lock *, - struct file_lock *)) +static void locks_insert_block(struct file_lock_core *blocker, + struct file_lock_core *waiter, + bool conflict(struct file_lock_core *, + struct file_lock_core *)) { spin_lock(&blocked_lock_lock); __locks_insert_block(blocker, waiter, conflict); @@ -793,7 +838,7 @@ static void locks_insert_block(struct file_lock *blocker, * * Must be called with the inode->flc_lock held! 
*/ -static void locks_wake_up_blocks(struct file_lock *blocker) +static void locks_wake_up_blocks(struct file_lock_core *blocker) { /* * Avoid taking global lock if list is empty. This is safe since new @@ -802,7 +847,7 @@ static void locks_wake_up_blocks(struct file_lock *blocker) * fl_blocked_requests list does not require the flc_lock, so we must * recheck list_empty() after acquiring the blocked_lock_lock. */ - if (list_empty(&blocker->fl_blocked_requests)) + if (list_empty(&blocker->flc_blocked_requests)) return; spin_lock(&blocked_lock_lock); @@ -811,39 +856,39 @@ static void locks_wake_up_blocks(struct file_lock *blocker) } static void -locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) +locks_insert_lock_ctx(struct file_lock_core *fl, struct list_head *before) { - list_add_tail(&fl->fl_list, before); + list_add_tail(&fl->flc_list, before); locks_insert_global_locks(fl); } static void -locks_unlink_lock_ctx(struct file_lock *fl) +locks_unlink_lock_ctx(struct file_lock_core *fl) { locks_delete_global_locks(fl); - list_del_init(&fl->fl_list); + list_del_init(&fl->flc_list); locks_wake_up_blocks(fl); } static void -locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose) +locks_delete_lock_ctx(struct file_lock_core *fl, struct list_head *dispose) { locks_unlink_lock_ctx(fl); if (dispose) - list_add(&fl->fl_list, dispose); + list_add(&fl->flc_list, dispose); else - locks_free_lock(fl); + locks_free_lock(file_lock(fl)); } /* Determine if lock sys_fl blocks lock caller_fl. Common functionality * checks for shared/exclusive status of overlapping locks. 
*/ -static bool locks_conflict(struct file_lock *caller_fl, - struct file_lock *sys_fl) +static bool locks_conflict(struct file_lock_core *caller_flc, + struct file_lock_core *sys_flc) { - if (sys_fl->fl_type == F_WRLCK) + if (sys_flc->flc_type == F_WRLCK) return true; - if (caller_fl->fl_type == F_WRLCK) + if (caller_flc->flc_type == F_WRLCK) return true; return false; } @@ -851,20 +896,23 @@ static bool locks_conflict(struct file_lock *caller_fl, /* Determine if lock sys_fl blocks lock caller_fl. POSIX specific * checking before calling the locks_conflict(). */ -static bool posix_locks_conflict(struct file_lock *caller_fl, - struct file_lock *sys_fl) +static bool posix_locks_conflict(struct file_lock_core *caller_flc, + struct file_lock_core *sys_flc) { + struct file_lock *caller_fl = file_lock(caller_flc); + struct file_lock *sys_fl = file_lock(sys_flc); + /* POSIX locks owned by the same process do not conflict with * each other. */ - if (posix_same_owner(caller_fl, sys_fl)) + if (posix_same_owner(caller_flc, sys_flc)) return false; /* Check whether they overlap */ if (!locks_overlap(caller_fl, sys_fl)) return false; - return locks_conflict(caller_fl, sys_fl); + return locks_conflict(caller_flc, sys_flc); } /* Determine if lock sys_fl blocks lock caller_fl. Used on xx_GETLK @@ -873,28 +921,31 @@ static bool posix_locks_conflict(struct file_lock *caller_fl, static bool posix_test_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) { + struct file_lock_core *caller = &caller_fl->c; + struct file_lock_core *sys = &sys_fl->c; + /* F_UNLCK checks any locks on the same fd. */ - if (caller_fl->fl_type == F_UNLCK) { - if (!posix_same_owner(caller_fl, sys_fl)) + if (lock_is_unlock(caller_fl)) { + if (!posix_same_owner(caller, sys)) return false; return locks_overlap(caller_fl, sys_fl); } - return posix_locks_conflict(caller_fl, sys_fl); + return posix_locks_conflict(caller, sys); } /* Determine if lock sys_fl blocks lock caller_fl. 
FLOCK specific * checking before calling the locks_conflict(). */ -static bool flock_locks_conflict(struct file_lock *caller_fl, - struct file_lock *sys_fl) +static bool flock_locks_conflict(struct file_lock_core *caller_flc, + struct file_lock_core *sys_flc) { /* FLOCK locks referring to the same filp do not conflict with * each other. */ - if (caller_fl->fl_file == sys_fl->fl_file) + if (caller_flc->flc_file == sys_flc->flc_file) return false; - return locks_conflict(caller_fl, sys_fl); + return locks_conflict(caller_flc, sys_flc); } void @@ -908,13 +959,13 @@ posix_test_lock(struct file *filp, struct file_lock *fl) ctx = locks_inode_context(inode); if (!ctx || list_empty_careful(&ctx->flc_posix)) { - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; return; } retry: spin_lock(&ctx->flc_lock); - list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { + list_for_each_entry(cfl, &ctx->flc_posix, c.flc_list) { if (!posix_test_locks_conflict(fl, cfl)) continue; if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable @@ -930,7 +981,7 @@ retry: locks_copy_conflock(fl, cfl); goto out; } - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; out: spin_unlock(&ctx->flc_lock); return; @@ -972,25 +1023,27 @@ EXPORT_SYMBOL(posix_test_lock); #define MAX_DEADLK_ITERATIONS 10 -/* Find a lock that the owner of the given block_fl is blocking on. */ -static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl) +/* Find a lock that the owner of the given @blocker is blocking on. 
*/ +static struct file_lock_core *what_owner_is_waiting_for(struct file_lock_core *blocker) { - struct file_lock *fl; + struct file_lock_core *flc; - hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) { - if (posix_same_owner(fl, block_fl)) { - while (fl->fl_blocker) - fl = fl->fl_blocker; - return fl; + hash_for_each_possible(blocked_hash, flc, flc_link, posix_owner_key(blocker)) { + if (posix_same_owner(flc, blocker)) { + while (flc->flc_blocker) + flc = flc->flc_blocker; + return flc; } } return NULL; } /* Must be called with the blocked_lock_lock held! */ -static int posix_locks_deadlock(struct file_lock *caller_fl, - struct file_lock *block_fl) +static bool posix_locks_deadlock(struct file_lock *caller_fl, + struct file_lock *block_fl) { + struct file_lock_core *caller = &caller_fl->c; + struct file_lock_core *blocker = &block_fl->c; int i = 0; lockdep_assert_held(&blocked_lock_lock); @@ -999,16 +1052,16 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, * This deadlock detector can't reasonably detect deadlocks with * FL_OFDLCK locks, since they aren't owned by a process, per-se. */ - if (IS_OFDLCK(caller_fl)) - return 0; + if (caller->flc_flags & FL_OFDLCK) + return false; - while ((block_fl = what_owner_is_waiting_for(block_fl))) { + while ((blocker = what_owner_is_waiting_for(blocker))) { if (i++ > MAX_DEADLK_ITERATIONS) - return 0; - if (posix_same_owner(caller_fl, block_fl)) - return 1; + return false; + if (posix_same_owner(caller, blocker)) + return true; } - return 0; + return false; } /* Try to create a FLOCK lock on filp. 
We always insert new FLOCK locks @@ -1027,14 +1080,14 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) bool found = false; LIST_HEAD(dispose); - ctx = locks_get_lock_context(inode, request->fl_type); + ctx = locks_get_lock_context(inode, request->c.flc_type); if (!ctx) { - if (request->fl_type != F_UNLCK) + if (request->c.flc_type != F_UNLCK) return -ENOMEM; - return (request->fl_flags & FL_EXISTS) ? -ENOENT : 0; + return (request->c.flc_flags & FL_EXISTS) ? -ENOENT : 0; } - if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { + if (!(request->c.flc_flags & FL_ACCESS) && (request->c.flc_type != F_UNLCK)) { new_fl = locks_alloc_lock(); if (!new_fl) return -ENOMEM; @@ -1042,41 +1095,41 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); - if (request->fl_flags & FL_ACCESS) + if (request->c.flc_flags & FL_ACCESS) goto find_conflict; - list_for_each_entry(fl, &ctx->flc_flock, fl_list) { - if (request->fl_file != fl->fl_file) + list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) { + if (request->c.flc_file != fl->c.flc_file) continue; - if (request->fl_type == fl->fl_type) + if (request->c.flc_type == fl->c.flc_type) goto out; found = true; - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(&fl->c, &dispose); break; } - if (request->fl_type == F_UNLCK) { - if ((request->fl_flags & FL_EXISTS) && !found) + if (lock_is_unlock(request)) { + if ((request->c.flc_flags & FL_EXISTS) && !found) error = -ENOENT; goto out; } find_conflict: - list_for_each_entry(fl, &ctx->flc_flock, fl_list) { - if (!flock_locks_conflict(request, fl)) + list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) { + if (!flock_locks_conflict(&request->c, &fl->c)) continue; error = -EAGAIN; - if (!(request->fl_flags & FL_SLEEP)) + if (!(request->c.flc_flags & FL_SLEEP)) goto out; error = FILE_LOCK_DEFERRED; - locks_insert_block(fl, request, 
flock_locks_conflict); + locks_insert_block(&fl->c, &request->c, flock_locks_conflict); goto out; } - if (request->fl_flags & FL_ACCESS) + if (request->c.flc_flags & FL_ACCESS) goto out; locks_copy_lock(new_fl, request); locks_move_blocks(new_fl, request); - locks_insert_lock_ctx(new_fl, &ctx->flc_flock); + locks_insert_lock_ctx(&new_fl->c, &ctx->flc_flock); new_fl = NULL; error = 0; @@ -1105,9 +1158,9 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, void *owner; void (*func)(void); - ctx = locks_get_lock_context(inode, request->fl_type); + ctx = locks_get_lock_context(inode, request->c.flc_type); if (!ctx) - return (request->fl_type == F_UNLCK) ? 0 : -ENOMEM; + return lock_is_unlock(request) ? 0 : -ENOMEM; /* * We may need two file_lock structures for this operation, @@ -1115,8 +1168,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, * * In some cases we can be sure, that no new locks will be needed */ - if (!(request->fl_flags & FL_ACCESS) && - (request->fl_type != F_UNLCK || + if (!(request->c.flc_flags & FL_ACCESS) && + (request->c.flc_type != F_UNLCK || request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { new_fl = locks_alloc_lock(); new_fl2 = locks_alloc_lock(); @@ -1130,9 +1183,9 @@ retry: * there are any, either return error or put the request on the * blocker's list of waiters and the global blocked_hash. 
*/ - if (request->fl_type != F_UNLCK) { - list_for_each_entry(fl, &ctx->flc_posix, fl_list) { - if (!posix_locks_conflict(request, fl)) + if (request->c.flc_type != F_UNLCK) { + list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) { + if (!posix_locks_conflict(&request->c, &fl->c)) continue; if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable && (*fl->fl_lmops->lm_lock_expirable)(fl)) { @@ -1148,7 +1201,7 @@ retry: if (conflock) locks_copy_conflock(conflock, fl); error = -EAGAIN; - if (!(request->fl_flags & FL_SLEEP)) + if (!(request->c.flc_flags & FL_SLEEP)) goto out; /* * Deadlock detection and insertion into the blocked @@ -1160,10 +1213,10 @@ retry: * Ensure that we don't find any locks blocked on this * request during deadlock detection. */ - __locks_wake_up_blocks(request); + __locks_wake_up_blocks(&request->c); if (likely(!posix_locks_deadlock(request, fl))) { error = FILE_LOCK_DEFERRED; - __locks_insert_block(fl, request, + __locks_insert_block(&fl->c, &request->c, posix_locks_conflict); } spin_unlock(&blocked_lock_lock); @@ -1173,22 +1226,22 @@ retry: /* If we're just looking for a conflict, we're done. */ error = 0; - if (request->fl_flags & FL_ACCESS) + if (request->c.flc_flags & FL_ACCESS) goto out; /* Find the first old lock with the same owner as the new lock */ - list_for_each_entry(fl, &ctx->flc_posix, fl_list) { - if (posix_same_owner(request, fl)) + list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) { + if (posix_same_owner(&request->c, &fl->c)) break; } /* Process locks with this owner. 
*/ - list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) { - if (!posix_same_owner(request, fl)) + list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, c.flc_list) { + if (!posix_same_owner(&request->c, &fl->c)) break; /* Detect adjacent or overlapping regions (if same lock type) */ - if (request->fl_type == fl->fl_type) { + if (request->c.flc_type == fl->c.flc_type) { /* In all comparisons of start vs end, use * "start - 1" rather than "end + 1". If end * is OFFSET_MAX, end + 1 will become negative. @@ -1215,7 +1268,7 @@ retry: else request->fl_end = fl->fl_end; if (added) { - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(&fl->c, &dispose); continue; } request = fl; @@ -1228,7 +1281,7 @@ retry: continue; if (fl->fl_start > request->fl_end) break; - if (request->fl_type == F_UNLCK) + if (lock_is_unlock(request)) added = true; if (fl->fl_start < request->fl_start) left = fl; @@ -1244,7 +1297,7 @@ retry: * one (This may happen several times). */ if (added) { - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(&fl->c, &dispose); continue; } /* @@ -1261,8 +1314,9 @@ retry: locks_move_blocks(new_fl, request); request = new_fl; new_fl = NULL; - locks_insert_lock_ctx(request, &fl->fl_list); - locks_delete_lock_ctx(fl, &dispose); + locks_insert_lock_ctx(&request->c, + &fl->c.flc_list); + locks_delete_lock_ctx(&fl->c, &dispose); added = true; } } @@ -1279,8 +1333,8 @@ retry: error = 0; if (!added) { - if (request->fl_type == F_UNLCK) { - if (request->fl_flags & FL_EXISTS) + if (lock_is_unlock(request)) { + if (request->c.flc_flags & FL_EXISTS) error = -ENOENT; goto out; } @@ -1291,7 +1345,7 @@ retry: } locks_copy_lock(new_fl, request); locks_move_blocks(new_fl, request); - locks_insert_lock_ctx(new_fl, &fl->fl_list); + locks_insert_lock_ctx(&new_fl->c, &fl->c.flc_list); fl = new_fl; new_fl = NULL; } @@ -1303,14 +1357,14 @@ retry: left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); - locks_insert_lock_ctx(left, 
&fl->fl_list); + locks_insert_lock_ctx(&left->c, &fl->c.flc_list); } right->fl_start = request->fl_end + 1; - locks_wake_up_blocks(right); + locks_wake_up_blocks(&right->c); } if (left) { left->fl_end = request->fl_start - 1; - locks_wake_up_blocks(left); + locks_wake_up_blocks(&left->c); } out: spin_unlock(&ctx->flc_lock); @@ -1364,8 +1418,8 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) error = posix_lock_inode(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, - list_empty(&fl->fl_blocked_member)); + error = wait_event_interruptible(fl->c.flc_wait, + list_empty(&fl->c.flc_blocked_member)); if (error) break; } @@ -1373,37 +1427,37 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) return error; } -static void lease_clear_pending(struct file_lock *fl, int arg) +static void lease_clear_pending(struct file_lease *fl, int arg) { switch (arg) { case F_UNLCK: - fl->fl_flags &= ~FL_UNLOCK_PENDING; + fl->c.flc_flags &= ~FL_UNLOCK_PENDING; fallthrough; case F_RDLCK: - fl->fl_flags &= ~FL_DOWNGRADE_PENDING; + fl->c.flc_flags &= ~FL_DOWNGRADE_PENDING; } } /* We already had a lease on this file; just change its type */ -int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) +int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) { - int error = assign_type(fl, arg); + int error = assign_type(&fl->c, arg); if (error) return error; lease_clear_pending(fl, arg); - locks_wake_up_blocks(fl); + locks_wake_up_blocks(&fl->c); if (arg == F_UNLCK) { - struct file *filp = fl->fl_file; + struct file *filp = fl->c.flc_file; f_delown(filp); filp->f_owner.signum = 0; - fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync); + fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync); if (fl->fl_fasync != NULL) { printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); fl->fl_fasync = NULL; } - locks_delete_lock_ctx(fl, dispose); + 
locks_delete_lock_ctx(&fl->c, dispose); } return 0; } @@ -1420,11 +1474,11 @@ static bool past_time(unsigned long then) static void time_out_leases(struct inode *inode, struct list_head *dispose) { struct file_lock_context *ctx = inode->i_flctx; - struct file_lock *fl, *tmp; + struct file_lease *fl, *tmp; lockdep_assert_held(&ctx->flc_lock); - list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) { trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) lease_modify(fl, F_RDLCK, dispose); @@ -1433,38 +1487,40 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) } } -static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) +static bool leases_conflict(struct file_lock_core *lc, struct file_lock_core *bc) { bool rc; + struct file_lease *lease = file_lease(lc); + struct file_lease *breaker = file_lease(bc); if (lease->fl_lmops->lm_breaker_owns_lease && lease->fl_lmops->lm_breaker_owns_lease(lease)) return false; - if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) { + if ((bc->flc_flags & FL_LAYOUT) != (lc->flc_flags & FL_LAYOUT)) { rc = false; goto trace; } - if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) { + if ((bc->flc_flags & FL_DELEG) && (lc->flc_flags & FL_LEASE)) { rc = false; goto trace; } - rc = locks_conflict(breaker, lease); + rc = locks_conflict(bc, lc); trace: trace_leases_conflict(rc, lease, breaker); return rc; } static bool -any_leases_conflict(struct inode *inode, struct file_lock *breaker) +any_leases_conflict(struct inode *inode, struct file_lease *breaker) { struct file_lock_context *ctx = inode->i_flctx; - struct file_lock *fl; + struct file_lock_core *flc; lockdep_assert_held(&ctx->flc_lock); - list_for_each_entry(fl, &ctx->flc_lease, fl_list) { - if (leases_conflict(fl, breaker)) + list_for_each_entry(flc, &ctx->flc_lease, flc_list) { + if (leases_conflict(flc, 
&breaker->c)) return true; } return false; @@ -1487,7 +1543,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { int error = 0; struct file_lock_context *ctx; - struct file_lock *new_fl, *fl, *tmp; + struct file_lease *new_fl, *fl, *tmp; unsigned long break_time; int want_write = (mode & O_ACCMODE) != O_RDONLY; LIST_HEAD(dispose); @@ -1495,7 +1551,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); if (IS_ERR(new_fl)) return PTR_ERR(new_fl); - new_fl->fl_flags = type; + new_fl->c.flc_flags = type; /* typically we will check that ctx is non-NULL before calling */ ctx = locks_inode_context(inode); @@ -1519,22 +1575,22 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) break_time++; /* so that 0 means no break time */ } - list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { - if (!leases_conflict(fl, new_fl)) + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) { + if (!leases_conflict(&fl->c, &new_fl->c)) continue; if (want_write) { - if (fl->fl_flags & FL_UNLOCK_PENDING) + if (fl->c.flc_flags & FL_UNLOCK_PENDING) continue; - fl->fl_flags |= FL_UNLOCK_PENDING; + fl->c.flc_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; } else { if (lease_breaking(fl)) continue; - fl->fl_flags |= FL_DOWNGRADE_PENDING; + fl->c.flc_flags |= FL_DOWNGRADE_PENDING; fl->fl_downgrade_time = break_time; } if (fl->fl_lmops->lm_break(fl)) - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(&fl->c, &dispose); } if (list_empty(&ctx->flc_lease)) @@ -1547,26 +1603,26 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) } restart: - fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list); + fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list); break_time = fl->fl_break_time; if (break_time != 0) break_time -= jiffies; if (break_time == 0) break_time++; - 
locks_insert_block(fl, new_fl, leases_conflict); + locks_insert_block(&fl->c, &new_fl->c, leases_conflict); trace_break_lease_block(inode, new_fl); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); - error = wait_event_interruptible_timeout(new_fl->fl_wait, - list_empty(&new_fl->fl_blocked_member), - break_time); + error = wait_event_interruptible_timeout(new_fl->c.flc_wait, + list_empty(&new_fl->c.flc_blocked_member), + break_time); percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); - locks_delete_block(new_fl); + __locks_delete_block(&new_fl->c); if (error >= 0) { /* * Wait for the next conflicting lease that has not been @@ -1583,7 +1639,7 @@ out: percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); free_lock: - locks_free_lock(new_fl); + locks_free_lease(new_fl); return error; } EXPORT_SYMBOL(__break_lease); @@ -1601,14 +1657,14 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time) { bool has_lease = false; struct file_lock_context *ctx; - struct file_lock *fl; + struct file_lock_core *flc; ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); - fl = list_first_entry_or_null(&ctx->flc_lease, - struct file_lock, fl_list); - if (fl && (fl->fl_type == F_WRLCK)) + flc = list_first_entry_or_null(&ctx->flc_lease, + struct file_lock_core, flc_list); + if (flc && flc->flc_type == F_WRLCK) has_lease = true; spin_unlock(&ctx->flc_lock); } @@ -1643,7 +1699,7 @@ EXPORT_SYMBOL(lease_get_mtime); */ int fcntl_getlease(struct file *filp) { - struct file_lock *fl; + struct file_lease *fl; struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int type = F_UNLCK; @@ -1654,8 +1710,8 @@ int fcntl_getlease(struct file *filp) percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); - list_for_each_entry(fl, &ctx->flc_lease, fl_list) { - if (fl->fl_file != filp) + 
list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { + if (fl->c.flc_file != filp) continue; type = target_leasetype(fl); break; @@ -1715,12 +1771,12 @@ check_conflicting_open(struct file *filp, const int arg, int flags) } static int -generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **priv) +generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **priv) { - struct file_lock *fl, *my_fl = NULL, *lease; + struct file_lease *fl, *my_fl = NULL, *lease; struct inode *inode = file_inode(filp); struct file_lock_context *ctx; - bool is_deleg = (*flp)->fl_flags & FL_DELEG; + bool is_deleg = (*flp)->c.flc_flags & FL_DELEG; int error; LIST_HEAD(dispose); @@ -1746,7 +1802,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); - error = check_conflicting_open(filp, arg, lease->fl_flags); + error = check_conflicting_open(filp, arg, lease->c.flc_flags); if (error) goto out; @@ -1759,9 +1815,9 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri * except for this filp. 
*/ error = -EAGAIN; - list_for_each_entry(fl, &ctx->flc_lease, fl_list) { - if (fl->fl_file == filp && - fl->fl_owner == lease->fl_owner) { + list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { + if (fl->c.flc_file == filp && + fl->c.flc_owner == lease->c.flc_owner) { my_fl = fl; continue; } @@ -1776,7 +1832,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri * Modifying our existing lease is OK, but no getting a * new lease if someone else is opening for write: */ - if (fl->fl_flags & FL_UNLOCK_PENDING) + if (fl->c.flc_flags & FL_UNLOCK_PENDING) goto out; } @@ -1792,7 +1848,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri if (!leases_enable) goto out; - locks_insert_lock_ctx(lease, &ctx->flc_lease); + locks_insert_lock_ctx(&lease->c, &ctx->flc_lease); /* * The check in break_lease() is lockless. It's possible for another * open to race in after we did the earlier check for a conflicting @@ -1803,9 +1859,9 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri * precedes these checks. 
*/ smp_mb(); - error = check_conflicting_open(filp, arg, lease->fl_flags); + error = check_conflicting_open(filp, arg, lease->c.flc_flags); if (error) { - locks_unlink_lock_ctx(lease); + locks_unlink_lock_ctx(&lease->c); goto out; } @@ -1826,7 +1882,7 @@ out: static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; - struct file_lock *fl, *victim = NULL; + struct file_lease *fl, *victim = NULL; struct inode *inode = file_inode(filp); struct file_lock_context *ctx; LIST_HEAD(dispose); @@ -1839,9 +1895,9 @@ static int generic_delete_lease(struct file *filp, void *owner) percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); - list_for_each_entry(fl, &ctx->flc_lease, fl_list) { - if (fl->fl_file == filp && - fl->fl_owner == owner) { + list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { + if (fl->c.flc_file == filp && + fl->c.flc_owner == owner) { victim = fl; break; } @@ -1866,21 +1922,9 @@ static int generic_delete_lease(struct file *filp, void *owner) * The (input) flp->fl_lmops->lm_break function is required * by break_lease(). 
*/ -int generic_setlease(struct file *filp, int arg, struct file_lock **flp, +int generic_setlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { - struct inode *inode = file_inode(filp); - vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(filp), inode); - int error; - - if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE)) - return -EACCES; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - error = security_file_lock(filp, arg); - if (error) - return error; - switch (arg) { case F_UNLCK: return generic_delete_lease(filp, *priv); @@ -1913,7 +1957,7 @@ lease_notifier_chain_init(void) } static inline void -setlease_notifier(int arg, struct file_lock *lease) +setlease_notifier(int arg, struct file_lease *lease) { if (arg != F_UNLCK) srcu_notifier_call_chain(&lease_notifier_chain, arg, lease); @@ -1931,6 +1975,19 @@ void lease_unregister_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(lease_unregister_notifier); + +int +kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) +{ + if (lease) + setlease_notifier(arg, *lease); + if (filp->f_op->setlease) + return filp->f_op->setlease(filp, arg, lease, priv); + else + return generic_setlease(filp, arg, lease, priv); +} +EXPORT_SYMBOL_GPL(kernel_setlease); + /** * vfs_setlease - sets a lease on an open file * @filp: file pointer @@ -1949,20 +2006,26 @@ EXPORT_SYMBOL_GPL(lease_unregister_notifier); * may be NULL if the lm_setup operation doesn't require it. 
*/ int -vfs_setlease(struct file *filp, int arg, struct file_lock **lease, void **priv) +vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { - if (lease) - setlease_notifier(arg, *lease); - if (filp->f_op->setlease) - return filp->f_op->setlease(filp, arg, lease, priv); - else - return generic_setlease(filp, arg, lease, priv); + struct inode *inode = file_inode(filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(filp), inode); + int error; + + if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE)) + return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + error = security_file_lock(filp, arg); + if (error) + return error; + return kernel_setlease(filp, arg, lease, priv); } EXPORT_SYMBOL_GPL(vfs_setlease); static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) { - struct file_lock *fl; + struct file_lease *fl; struct fasync_struct *new; int error; @@ -1972,14 +2035,14 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) new = fasync_alloc(); if (!new) { - locks_free_lock(fl); + locks_free_lease(fl); return -ENOMEM; } new->fa_fd = fd; error = vfs_setlease(filp, arg, &fl, (void **)&new); if (fl) - locks_free_lock(fl); + locks_free_lease(fl); if (new) fasync_free(new); return error; @@ -2017,8 +2080,8 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) error = flock_lock_inode(inode, fl); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, - list_empty(&fl->fl_blocked_member)); + error = wait_event_interruptible(fl->c.flc_wait, + list_empty(&fl->c.flc_blocked_member)); if (error) break; } @@ -2036,7 +2099,7 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int res = 0; - switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + switch (fl->c.flc_flags & (FL_POSIX|FL_FLOCK)) { case FL_POSIX: res = 
posix_lock_inode_wait(inode, fl); break; @@ -2098,13 +2161,13 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) flock_make_lock(f.file, &fl, type); - error = security_file_lock(f.file, fl.fl_type); + error = security_file_lock(f.file, fl.c.flc_type); if (error) goto out_putf; can_sleep = !(cmd & LOCK_NB); if (can_sleep) - fl.fl_flags |= FL_SLEEP; + fl.c.flc_flags |= FL_SLEEP; if (f.file->f_op->flock) error = f.file->f_op->flock(f.file, @@ -2130,7 +2193,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { - WARN_ON_ONCE(filp != fl->fl_file); + WARN_ON_ONCE(filp != fl->c.flc_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_GETLK, fl); posix_test_lock(filp, fl); @@ -2145,25 +2208,28 @@ EXPORT_SYMBOL_GPL(vfs_test_lock); * * Used to translate a fl_pid into a namespace virtual pid number */ -static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns) +static pid_t locks_translate_pid(struct file_lock_core *fl, struct pid_namespace *ns) { pid_t vnr; struct pid *pid; - if (IS_OFDLCK(fl)) + if (fl->flc_flags & FL_OFDLCK) return -1; - if (IS_REMOTELCK(fl)) - return fl->fl_pid; + + /* Remote locks report a negative pid value */ + if (fl->flc_pid <= 0) + return fl->flc_pid; + /* * If the flock owner process is dead and its pid has been already * freed, the translation below won't work, but we still want to show * flock owner pid number in init pidns. 
*/ if (ns == &init_pid_ns) - return (pid_t)fl->fl_pid; + return (pid_t) fl->flc_pid; rcu_read_lock(); - pid = find_pid_ns(fl->fl_pid, &init_pid_ns); + pid = find_pid_ns(fl->flc_pid, &init_pid_ns); vnr = pid_nr_ns(pid, ns); rcu_read_unlock(); return vnr; @@ -2171,7 +2237,7 @@ static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns) static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { - flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current)); + flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current)); #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -2186,19 +2252,19 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; flock->l_whence = 0; - flock->l_type = fl->fl_type; + flock->l_type = fl->c.flc_type; return 0; } #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { - flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current)); + flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current)); flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 
0 : fl->fl_end - fl->fl_start + 1; flock->l_whence = 0; - flock->l_type = fl->fl_type; + flock->l_type = fl->c.flc_type; } #endif @@ -2227,16 +2293,16 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) if (flock->l_pid != 0) goto out; - fl->fl_flags |= FL_OFDLCK; - fl->fl_owner = filp; + fl->c.flc_flags |= FL_OFDLCK; + fl->c.flc_owner = filp; } error = vfs_test_lock(filp, fl); if (error) goto out; - flock->l_type = fl->fl_type; - if (fl->fl_type != F_UNLCK) { + flock->l_type = fl->c.flc_type; + if (fl->c.flc_type != F_UNLCK) { error = posix_lock_to_flock(flock, fl); if (error) goto out; @@ -2283,7 +2349,7 @@ out: */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { - WARN_ON_ONCE(filp != fl->fl_file); + WARN_ON_ONCE(filp != fl->c.flc_file); if (filp->f_op->lock) return filp->f_op->lock(filp, cmd, fl); else @@ -2296,7 +2362,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, { int error; - error = security_file_lock(filp, fl->fl_type); + error = security_file_lock(filp, fl->c.flc_type); if (error) return error; @@ -2304,8 +2370,8 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, error = vfs_lock_file(filp, cmd, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, - list_empty(&fl->fl_blocked_member)); + error = wait_event_interruptible(fl->c.flc_wait, + list_empty(&fl->c.flc_blocked_member)); if (error) break; } @@ -2318,13 +2384,13 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, static int check_fmode_for_setlk(struct file_lock *fl) { - switch (fl->fl_type) { + switch (fl->c.flc_type) { case F_RDLCK: - if (!(fl->fl_file->f_mode & FMODE_READ)) + if (!(fl->c.flc_file->f_mode & FMODE_READ)) return -EBADF; break; case F_WRLCK: - if (!(fl->fl_file->f_mode & FMODE_WRITE)) + if (!(fl->c.flc_file->f_mode & FMODE_WRITE)) return -EBADF; } return 0; @@ -2363,8 +2429,8 @@ int fcntl_setlk(unsigned 
int fd, struct file *filp, unsigned int cmd, goto out; cmd = F_SETLK; - file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = filp; + file_lock->c.flc_flags |= FL_OFDLCK; + file_lock->c.flc_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; @@ -2372,11 +2438,11 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, goto out; cmd = F_SETLKW; - file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = filp; + file_lock->c.flc_flags |= FL_OFDLCK; + file_lock->c.flc_owner = filp; fallthrough; case F_SETLKW: - file_lock->fl_flags |= FL_SLEEP; + file_lock->c.flc_flags |= FL_SLEEP; } error = do_lock_file_wait(filp, cmd, file_lock); @@ -2386,8 +2452,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, * lock that was just acquired. There is no need to do that when we're * unlocking though, or for OFD locks. */ - if (!error && file_lock->fl_type != F_UNLCK && - !(file_lock->fl_flags & FL_OFDLCK)) { + if (!error && file_lock->c.flc_type != F_UNLCK && + !(file_lock->c.flc_flags & FL_OFDLCK)) { struct files_struct *files = current->files; /* * We need that spin_lock here - it prevents reordering between @@ -2398,7 +2464,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, f = files_lookup_fd_locked(files, fd); spin_unlock(&files->file_lock); if (f != filp) { - file_lock->fl_type = F_UNLCK; + file_lock->c.flc_type = F_UNLCK; error = do_lock_file_wait(filp, cmd, file_lock); WARN_ON_ONCE(error); error = -EBADF; @@ -2437,16 +2503,16 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) if (flock->l_pid != 0) goto out; - fl->fl_flags |= FL_OFDLCK; - fl->fl_owner = filp; + fl->c.flc_flags |= FL_OFDLCK; + fl->c.flc_owner = filp; } error = vfs_test_lock(filp, fl); if (error) goto out; - flock->l_type = fl->fl_type; - if (fl->fl_type != F_UNLCK) + flock->l_type = fl->c.flc_type; + if (fl->c.flc_type != F_UNLCK) posix_lock_to_flock64(flock, fl); out: @@ -2486,8 +2552,8 @@ int 
fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, goto out; cmd = F_SETLK64; - file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = filp; + file_lock->c.flc_flags |= FL_OFDLCK; + file_lock->c.flc_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; @@ -2495,11 +2561,11 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, goto out; cmd = F_SETLKW64; - file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = filp; + file_lock->c.flc_flags |= FL_OFDLCK; + file_lock->c.flc_owner = filp; fallthrough; case F_SETLKW64: - file_lock->fl_flags |= FL_SLEEP; + file_lock->c.flc_flags |= FL_SLEEP; } error = do_lock_file_wait(filp, cmd, file_lock); @@ -2509,8 +2575,8 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, * lock that was just acquired. There is no need to do that when we're * unlocking though, or for OFD locks. */ - if (!error && file_lock->fl_type != F_UNLCK && - !(file_lock->fl_flags & FL_OFDLCK)) { + if (!error && file_lock->c.flc_type != F_UNLCK && + !(file_lock->c.flc_flags & FL_OFDLCK)) { struct files_struct *files = current->files; /* * We need that spin_lock here - it prevents reordering between @@ -2521,7 +2587,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, f = files_lookup_fd_locked(files, fd); spin_unlock(&files->file_lock); if (f != filp) { - file_lock->fl_type = F_UNLCK; + file_lock->c.flc_type = F_UNLCK; error = do_lock_file_wait(filp, cmd, file_lock); WARN_ON_ONCE(error); error = -EBADF; @@ -2555,13 +2621,13 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) return; locks_init_lock(&lock); - lock.fl_type = F_UNLCK; - lock.fl_flags = FL_POSIX | FL_CLOSE; + lock.c.flc_type = F_UNLCK; + lock.c.flc_flags = FL_POSIX | FL_CLOSE; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; - lock.fl_owner = owner; - lock.fl_pid = current->tgid; - lock.fl_file = filp; + lock.c.flc_owner = owner; + lock.c.flc_pid = current->tgid; + lock.c.flc_file = filp; 
lock.fl_ops = NULL; lock.fl_lmops = NULL; @@ -2584,7 +2650,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) return; flock_make_lock(filp, &fl, F_UNLCK); - fl.fl_flags |= FL_CLOSE; + fl.c.flc_flags |= FL_CLOSE; if (filp->f_op->flock) filp->f_op->flock(filp, F_SETLKW, &fl); @@ -2599,7 +2665,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) static void locks_remove_lease(struct file *filp, struct file_lock_context *ctx) { - struct file_lock *fl, *tmp; + struct file_lease *fl, *tmp; LIST_HEAD(dispose); if (list_empty(&ctx->flc_lease)) @@ -2607,8 +2673,8 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx) percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); - list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) - if (filp == fl->fl_file) + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) + if (filp == fl->c.flc_file) lease_modify(fl, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); @@ -2652,7 +2718,7 @@ void locks_remove_file(struct file *filp) */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { - WARN_ON_ONCE(filp != fl->fl_file); + WARN_ON_ONCE(filp != fl->c.flc_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; @@ -2691,69 +2757,73 @@ struct locks_iterator { loff_t li_pos; }; -static void lock_get_status(struct seq_file *f, struct file_lock *fl, +static void lock_get_status(struct seq_file *f, struct file_lock_core *flc, loff_t id, char *pfx, int repeat) { struct inode *inode = NULL; - unsigned int fl_pid; + unsigned int pid; struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); - int type; + int type = flc->flc_type; + struct file_lock *fl = file_lock(flc); + + pid = locks_translate_pid(flc, proc_pidns); - fl_pid = locks_translate_pid(fl, proc_pidns); /* * If lock owner is dead (and pid is freed) or not visible in current * pidns, zero is shown as a pid value. 
Check lock info from * init_pid_ns to get saved lock pid value. */ - - if (fl->fl_file != NULL) - inode = file_inode(fl->fl_file); + if (flc->flc_file != NULL) + inode = file_inode(flc->flc_file); seq_printf(f, "%lld: ", id); if (repeat) seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx); - if (IS_POSIX(fl)) { - if (fl->fl_flags & FL_ACCESS) + if (flc->flc_flags & FL_POSIX) { + if (flc->flc_flags & FL_ACCESS) seq_puts(f, "ACCESS"); - else if (IS_OFDLCK(fl)) + else if (flc->flc_flags & FL_OFDLCK) seq_puts(f, "OFDLCK"); else seq_puts(f, "POSIX "); seq_printf(f, " %s ", (inode == NULL) ? "*NOINODE*" : "ADVISORY "); - } else if (IS_FLOCK(fl)) { + } else if (flc->flc_flags & FL_FLOCK) { seq_puts(f, "FLOCK ADVISORY "); - } else if (IS_LEASE(fl)) { - if (fl->fl_flags & FL_DELEG) + } else if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) { + struct file_lease *lease = file_lease(flc); + + type = target_leasetype(lease); + + if (flc->flc_flags & FL_DELEG) seq_puts(f, "DELEG "); else seq_puts(f, "LEASE "); - if (lease_breaking(fl)) + if (lease_breaking(lease)) seq_puts(f, "BREAKING "); - else if (fl->fl_file) + else if (flc->flc_file) seq_puts(f, "ACTIVE "); else seq_puts(f, "BREAKER "); } else { seq_puts(f, "UNKNOWN UNKNOWN "); } - type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type; seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" : (type == F_RDLCK) ? 
"READ" : "UNLCK"); if (inode) { /* userspace relies on this representation of dev_t */ - seq_printf(f, "%d %02x:%02x:%lu ", fl_pid, + seq_printf(f, "%d %02x:%02x:%lu ", pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); } else { - seq_printf(f, "%d <none>:0 ", fl_pid); + seq_printf(f, "%d <none>:0 ", pid); } - if (IS_POSIX(fl)) { + if (flc->flc_flags & FL_POSIX) { if (fl->fl_end == OFFSET_MAX) seq_printf(f, "%Ld EOF\n", fl->fl_start); else @@ -2763,17 +2833,18 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } } -static struct file_lock *get_next_blocked_member(struct file_lock *node) +static struct file_lock_core *get_next_blocked_member(struct file_lock_core *node) { - struct file_lock *tmp; + struct file_lock_core *tmp; /* NULL node or root node */ - if (node == NULL || node->fl_blocker == NULL) + if (node == NULL || node->flc_blocker == NULL) return NULL; /* Next member in the linked list could be itself */ - tmp = list_next_entry(node, fl_blocked_member); - if (list_entry_is_head(tmp, &node->fl_blocker->fl_blocked_requests, fl_blocked_member) + tmp = list_next_entry(node, flc_blocked_member); + if (list_entry_is_head(tmp, &node->flc_blocker->flc_blocked_requests, + flc_blocked_member) || tmp == node) { return NULL; } @@ -2784,18 +2855,18 @@ static struct file_lock *get_next_blocked_member(struct file_lock *node) static int locks_show(struct seq_file *f, void *v) { struct locks_iterator *iter = f->private; - struct file_lock *cur, *tmp; + struct file_lock_core *cur, *tmp; struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); int level = 0; - cur = hlist_entry(v, struct file_lock, fl_link); + cur = hlist_entry(v, struct file_lock_core, flc_link); if (locks_translate_pid(cur, proc_pidns) == 0) return 0; - /* View this crossed linked list as a binary tree, the first member of fl_blocked_requests - * is the left child of current node, the next silibing in fl_blocked_member is the - * right child, 
we can alse get the parent of current node from fl_blocker, so this + /* View this crossed linked list as a binary tree, the first member of flc_blocked_requests + * is the left child of current node, the next silibing in flc_blocked_member is the + * right child, we can alse get the parent of current node from flc_blocker, so this * question becomes traversal of a binary tree */ while (cur != NULL) { @@ -2804,17 +2875,18 @@ static int locks_show(struct seq_file *f, void *v) else lock_get_status(f, cur, iter->li_pos, "", level); - if (!list_empty(&cur->fl_blocked_requests)) { + if (!list_empty(&cur->flc_blocked_requests)) { /* Turn left */ - cur = list_first_entry_or_null(&cur->fl_blocked_requests, - struct file_lock, fl_blocked_member); + cur = list_first_entry_or_null(&cur->flc_blocked_requests, + struct file_lock_core, + flc_blocked_member); level++; } else { /* Turn right */ tmp = get_next_blocked_member(cur); /* Fall back to parent node */ - while (tmp == NULL && cur->fl_blocker != NULL) { - cur = cur->fl_blocker; + while (tmp == NULL && cur->flc_blocker != NULL) { + cur = cur->flc_blocker; level--; tmp = get_next_blocked_member(cur); } @@ -2829,14 +2901,13 @@ static void __show_fd_locks(struct seq_file *f, struct list_head *head, int *id, struct file *filp, struct files_struct *files) { - struct file_lock *fl; + struct file_lock_core *fl; - list_for_each_entry(fl, head, fl_list) { + list_for_each_entry(fl, head, flc_list) { - if (filp != fl->fl_file) + if (filp != fl->flc_file) continue; - if (fl->fl_owner != files && - fl->fl_owner != filp) + if (fl->flc_owner != files && fl->flc_owner != filp) continue; (*id)++; @@ -2915,6 +2986,9 @@ static int __init filelock_init(void) filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); + filelease_cache = kmem_cache_create("file_lock_cache", + sizeof(struct file_lease), 0, SLAB_PANIC, NULL); + for_each_possible_cpu(i) { struct file_lock_list_struct *fll = 
per_cpu_ptr(&file_lock_list, i); diff --git a/fs/mbcache.c b/fs/mbcache.c index 82aa7a35db26..e60a840999aa 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -426,9 +426,7 @@ EXPORT_SYMBOL(mb_cache_destroy); static int __init mbcache_init(void) { - mb_entry_cache = kmem_cache_create("mbcache", - sizeof(struct mb_cache_entry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + mb_entry_cache = KMEM_CACHE(mb_cache_entry, SLAB_RECLAIM_ACCOUNT); if (!mb_entry_cache) return -ENOMEM; return 0; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 73f37f298087..7cbd2b9f4d11 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -87,7 +87,7 @@ static int __init init_inodecache(void) minix_inode_cachep = kmem_cache_create("minix_inode_cache", sizeof(struct minix_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (minix_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 64c5205e2b5e..3c60f1eaca61 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -214,7 +214,7 @@ static int copy_mnt_idmap(struct uid_gid_map *map_from, * anything at all. 
*/ if (nr_extents == 0) - return 0; + return -EINVAL; /* * Here we know that nr_extents is greater than zero which means diff --git a/fs/mpage.c b/fs/mpage.c index 738882e0766d..fa8b99a199fa 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -605,6 +605,7 @@ alloc_new: GFP_NOFS); bio->bi_iter.bi_sector = first_block << (blkbits - 9); wbc_init_bio(wbc, bio); + bio->bi_write_hint = inode->i_write_hint; } /* diff --git a/fs/namei.c b/fs/namei.c index 4e0de939fea1..d0c4a3e9278e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1717,7 +1717,11 @@ static inline int may_lookup(struct mnt_idmap *idmap, { if (nd->flags & LOOKUP_RCU) { int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); - if (err != -ECHILD || !try_to_unlazy(nd)) + if (!err) // success, keep going + return 0; + if (!try_to_unlazy(nd)) + return -ECHILD; // redo it all non-lazy + if (err != -ECHILD) // hard error return err; } return inode_permission(idmap, nd->inode, MAY_EXEC); @@ -2676,10 +2680,8 @@ static int lookup_one_common(struct mnt_idmap *idmap, if (!len) return -EACCES; - if (unlikely(name[0] == '.')) { - if (len < 2 || (len == 2 && name[1] == '.')) - return -EACCES; - } + if (is_dot_dotdot(name, len)) + return -EACCES; while (len--) { unsigned int c = *(const unsigned char *)name++; diff --git a/fs/namespace.c b/fs/namespace.c index ef1fd6829814..5a51315c6678 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4472,10 +4472,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) /* * If this is an attached mount make sure it's located in the callers * mount namespace. If it's not don't let the caller interact with it. - * If this is a detached mount make sure it has an anonymous mount - * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE. + * + * If this mount doesn't have a parent it's most often simply a + * detached mount with an anonymous mount namespace. IOW, something + * that's simply not attached yet. 
But there are apparently also users + * that do change mount properties on the rootfs itself. That obviously + * neither has a parent nor is it a detached mount so we cannot + * unconditionally check for detached mounts. */ - if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns))) + if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt)) goto out; /* @@ -5042,13 +5047,12 @@ static struct mount *listmnt_next(struct mount *curr) return node_to_mount(rb_next(&curr->mnt_node)); } -static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id, - u64 __user *buf, size_t bufsize, - const struct path *root) +static ssize_t do_listmount(struct mount *first, struct path *orig, + u64 mnt_parent_id, u64 __user *mnt_ids, + size_t nr_mnt_ids, const struct path *root) { struct mount *r; - ssize_t ctr; - int err; + ssize_t ret; /* * Don't trigger audit denials. We just want to determine what @@ -5058,50 +5062,57 @@ static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id, !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) return -EPERM; - err = security_sb_statfs(orig->dentry); - if (err) - return err; + ret = security_sb_statfs(orig->dentry); + if (ret) + return ret; - for (ctr = 0, r = first; r && ctr < bufsize; r = listmnt_next(r)) { - if (r->mnt_id_unique == mnt_id) + for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r)) { + if (r->mnt_id_unique == mnt_parent_id) continue; if (!is_path_reachable(r, r->mnt.mnt_root, orig)) continue; - ctr = array_index_nospec(ctr, bufsize); - if (put_user(r->mnt_id_unique, buf + ctr)) + if (put_user(r->mnt_id_unique, mnt_ids)) return -EFAULT; - if (check_add_overflow(ctr, 1, &ctr)) - return -ERANGE; + mnt_ids++; + nr_mnt_ids--; + ret++; } - return ctr; + return ret; } -SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, - u64 __user *, buf, size_t, bufsize, unsigned int, flags) +SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 
__user *, + mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mnt_id_req kreq; struct mount *first; struct path root, orig; - u64 mnt_id, last_mnt_id; + u64 mnt_parent_id, last_mnt_id; + const size_t maxcount = (size_t)-1 >> 3; ssize_t ret; if (flags) return -EINVAL; + if (unlikely(nr_mnt_ids > maxcount)) + return -EFAULT; + + if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids))) + return -EFAULT; + ret = copy_mnt_id_req(req, &kreq); if (ret) return ret; - mnt_id = kreq.mnt_id; + mnt_parent_id = kreq.mnt_id; last_mnt_id = kreq.param; down_read(&namespace_sem); get_fs_root(current->fs, &root); - if (mnt_id == LSMT_ROOT) { + if (mnt_parent_id == LSMT_ROOT) { orig = root; } else { ret = -ENOENT; - orig.mnt = lookup_mnt_in_ns(mnt_id, ns); + orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); if (!orig.mnt) goto err; orig.dentry = orig.mnt->mnt_root; @@ -5111,7 +5122,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, else first = mnt_find_id_at(ns, last_mnt_id + 1); - ret = do_listmount(first, &orig, mnt_id, buf, bufsize, &root); + ret = do_listmount(first, &orig, mnt_parent_id, mnt_ids, nr_mnt_ids, &root); err: path_put(&root); up_read(&namespace_sem); diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig index b4db21022cb4..bec805e0c44c 100644 --- a/fs/netfs/Kconfig +++ b/fs/netfs/Kconfig @@ -21,3 +21,42 @@ config NETFS_STATS multi-CPU system these may be on cachelines that keep bouncing between CPUs. On the other hand, the stats are very useful for debugging purposes. Saying 'Y' here is recommended. + +config FSCACHE + bool "General filesystem local caching manager" + depends on NETFS_SUPPORT + help + This option enables a generic filesystem caching manager that can be + used by various network and other filesystems to cache data locally. + Different sorts of caches can be plugged in, depending on the + resources available. 
+ + See Documentation/filesystems/caching/fscache.rst for more information. + +config FSCACHE_STATS + bool "Gather statistical information on local caching" + depends on FSCACHE && PROC_FS + select NETFS_STATS + help + This option causes statistical information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/stats + + The gathering of statistics adds a certain amount of overhead to + execution as there are a quite a few stats gathered, and on a + multi-CPU system these may be on cachelines that keep bouncing + between CPUs. On the other hand, the stats are very useful for + debugging purposes. Saying 'Y' here is recommended. + + See Documentation/filesystems/caching/fscache.rst for more information. + +config FSCACHE_DEBUG + bool "Debug FS-Cache" + depends on FSCACHE + help + This permits debugging to be dynamically enabled in the local caching + management module. If this is set, the debugging output may be + enabled by setting bits in /sys/modules/fscache/parameter/debug. + + See Documentation/filesystems/caching/fscache.rst for more information. 
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index 386d6fb92793..d4d1d799819e 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -2,11 +2,29 @@ netfs-y := \ buffered_read.o \ + buffered_write.o \ + direct_read.o \ + direct_write.o \ io.o \ iterator.o \ + locking.o \ main.o \ - objects.o + misc.o \ + objects.o \ + output.o netfs-$(CONFIG_NETFS_STATS) += stats.o -obj-$(CONFIG_NETFS_SUPPORT) := netfs.o +netfs-$(CONFIG_FSCACHE) += \ + fscache_cache.o \ + fscache_cookie.o \ + fscache_io.o \ + fscache_main.o \ + fscache_volume.o + +ifeq ($(CONFIG_PROC_FS),y) +netfs-$(CONFIG_FSCACHE) += fscache_proc.o +endif +netfs-$(CONFIG_FSCACHE_STATS) += fscache_stats.o + +obj-$(CONFIG_NETFS_SUPPORT) += netfs.o diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 2cd3ccf4c439..3298c29b5548 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -16,6 +16,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; + struct netfs_folio *finfo; struct folio *folio; pgoff_t start_page = rreq->start / PAGE_SIZE; pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; @@ -63,6 +64,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) break; } if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); folio_start_fscache(folio); folio_started = true; } @@ -86,11 +88,20 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) if (!pg_failed) { flush_dcache_folio(folio); + finfo = netfs_folio_info(folio); + if (finfo) { + trace_netfs_folio(folio, netfs_folio_trace_filled_gaps); + if (finfo->netfs_group) + folio_change_private(folio, finfo->netfs_group); + else + folio_detach_private(folio); + kfree(finfo); + } folio_mark_uptodate(folio); } if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio_index(folio) == rreq->no_unlock_folio && + if (folio->index == rreq->no_unlock_folio && 
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) _debug("no unlock"); else @@ -147,6 +158,15 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq, } } +/* + * Begin an operation, and fetch the stored zero point value from the cookie if + * available. + */ +static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx) +{ + return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx)); +} + /** * netfs_readahead - Helper to manage a read request * @ractl: The description of the readahead request @@ -180,11 +200,9 @@ void netfs_readahead(struct readahead_control *ractl) if (IS_ERR(rreq)) return; - if (ctx->ops->begin_cache_operation) { - ret = ctx->ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto cleanup_free; - } + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto cleanup_free; netfs_stat(&netfs_n_rh_readahead); trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), @@ -192,6 +210,10 @@ void netfs_readahead(struct readahead_control *ractl) netfs_rreq_expand(rreq, ractl); + /* Set up the output buffer */ + iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages, + rreq->start, rreq->len); + /* Drop the refs on the folios here rather than in the cache or * filesystem. The locks will be dropped in netfs_rreq_unlock(). 
*/ @@ -199,6 +221,7 @@ void netfs_readahead(struct readahead_control *ractl) ; netfs_begin_read(rreq, false); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); return; cleanup_free: @@ -223,12 +246,13 @@ EXPORT_SYMBOL(netfs_readahead); */ int netfs_read_folio(struct file *file, struct folio *folio) { - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct netfs_io_request *rreq; struct netfs_inode *ctx = netfs_inode(mapping->host); + struct folio *sink = NULL; int ret; - _enter("%lx", folio_index(folio)); + _enter("%lx", folio->index); rreq = netfs_alloc_request(mapping, file, folio_file_pos(folio), folio_size(folio), @@ -238,15 +262,64 @@ int netfs_read_folio(struct file *file, struct folio *folio) goto alloc_error; } - if (ctx->ops->begin_cache_operation) { - ret = ctx->ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto discard; - } + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto discard; netfs_stat(&netfs_n_rh_readpage); trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); - return netfs_begin_read(rreq, true); + + /* Set up the output buffer */ + if (folio_test_dirty(folio)) { + /* Handle someone trying to read from an unflushed streaming + * write. We fiddle the buffer so that a gap at the beginning + * and/or a gap at the end get copied to, but the middle is + * discarded. 
+ */ + struct netfs_folio *finfo = netfs_folio_info(folio); + struct bio_vec *bvec; + unsigned int from = finfo->dirty_offset; + unsigned int to = from + finfo->dirty_len; + unsigned int off = 0, i = 0; + size_t flen = folio_size(folio); + size_t nr_bvec = flen / PAGE_SIZE + 2; + size_t part; + + ret = -ENOMEM; + bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL); + if (!bvec) + goto discard; + + sink = folio_alloc(GFP_KERNEL, 0); + if (!sink) + goto discard; + + trace_netfs_folio(folio, netfs_folio_trace_read_gaps); + + rreq->direct_bv = bvec; + rreq->direct_bv_count = nr_bvec; + if (from > 0) { + bvec_set_folio(&bvec[i++], folio, from, 0); + off = from; + } + while (off < to) { + part = min_t(size_t, to - off, PAGE_SIZE); + bvec_set_folio(&bvec[i++], sink, part, 0); + off += part; + } + if (to < flen) + bvec_set_folio(&bvec[i++], folio, flen - to, to); + iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len); + } else { + iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, + rreq->start, rreq->len); + } + + ret = netfs_begin_read(rreq, true); + if (sink) + folio_put(sink); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + return ret < 0 ? 
ret : 0; discard: netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); @@ -387,14 +460,12 @@ retry: ret = PTR_ERR(rreq); goto error; } - rreq->no_unlock_folio = folio_index(folio); + rreq->no_unlock_folio = folio->index; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); - if (ctx->ops->begin_cache_operation) { - ret = ctx->ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto error_put; - } + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto error_put; netfs_stat(&netfs_n_rh_write_begin); trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); @@ -405,6 +476,10 @@ retry: ractl._nr_pages = folio_nr_pages(folio); netfs_rreq_expand(rreq, &ractl); + /* Set up the output buffer */ + iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, + rreq->start, rreq->len); + /* We hold the folio locks, so we can drop the references */ folio_get(folio); while (readahead_folio(&ractl)) @@ -413,6 +488,7 @@ retry: ret = netfs_begin_read(rreq, true); if (ret < 0) goto error; + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); have_folio: ret = folio_wait_fscache_killable(folio); @@ -434,3 +510,124 @@ error: return ret; } EXPORT_SYMBOL(netfs_write_begin); + +/* + * Preload the data into a page we're proposing to write into. 
+ */ +int netfs_prefetch_for_write(struct file *file, struct folio *folio, + size_t offset, size_t len) +{ + struct netfs_io_request *rreq; + struct address_space *mapping = folio->mapping; + struct netfs_inode *ctx = netfs_inode(mapping->host); + unsigned long long start = folio_pos(folio); + size_t flen = folio_size(folio); + int ret; + + _enter("%zx @%llx", flen, start); + + ret = -ENOMEM; + + rreq = netfs_alloc_request(mapping, file, start, flen, + NETFS_READ_FOR_WRITE); + if (IS_ERR(rreq)) { + ret = PTR_ERR(rreq); + goto error; + } + + rreq->no_unlock_folio = folio->index; + __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto error_put; + + netfs_stat(&netfs_n_rh_write_begin); + trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write); + + /* Set up the output buffer */ + iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, + rreq->start, rreq->len); + + ret = netfs_begin_read(rreq, true); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + return ret; + +error_put: + netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); +error: + _leave(" = %d", ret); + return ret; +} + +/** + * netfs_buffered_read_iter - Filesystem buffered I/O read routine + * @iocb: kernel I/O control block + * @iter: destination for the data read + * + * This is the ->read_iter() routine for all filesystems that can use the page + * cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be + * returned when no data can be read without waiting for I/O requests to + * complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests + * shall be made for the read or for readahead. When no data can be read, + * -EAGAIN shall be returned. When readahead would be triggered, a partial, + * possibly empty read shall be returned. 
+ * + * Return: + * * number of bytes copied, even for partial reads + * * negative error code (or 0 if IOCB_NOIO) if nothing was read + */ +ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct netfs_inode *ictx = netfs_inode(inode); + ssize_t ret; + + if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) || + test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))) + return -EINVAL; + + ret = netfs_start_io_read(inode); + if (ret == 0) { + ret = filemap_read(iocb, iter, 0); + netfs_end_io_read(inode); + } + return ret; +} +EXPORT_SYMBOL(netfs_buffered_read_iter); + +/** + * netfs_file_read_iter - Generic filesystem read routine + * @iocb: kernel I/O control block + * @iter: destination for the data read + * + * This is the ->read_iter() routine for all filesystems that can use the page + * cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be + * returned when no data can be read without waiting for I/O requests to + * complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests + * shall be made for the read or for readahead. When no data can be read, + * -EAGAIN shall be returned. When readahead would be triggered, a partial, + * possibly empty read shall be returned. 
+ * + * Return: + * * number of bytes copied, even for partial reads + * * negative error code (or 0 if IOCB_NOIO) if nothing was read + */ +ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host); + + if ((iocb->ki_flags & IOCB_DIRECT) || + test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)) + return netfs_unbuffered_read_iter(iocb, iter); + + return netfs_buffered_read_iter(iocb, iter); +} +EXPORT_SYMBOL(netfs_file_read_iter); diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c new file mode 100644 index 000000000000..9a0d32e4b422 --- /dev/null +++ b/fs/netfs/buffered_write.c @@ -0,0 +1,1257 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem high-level write support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/pagevec.h> +#include "internal.h" + +/* + * Determined write method. Adjust netfs_folio_traces if this is changed. + */ +enum netfs_how_to_modify { + NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */ + NETFS_JUST_PREFETCH, /* We have to read the folio anyway */ + NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */ + NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */ + NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */ + NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */ + NETFS_FLUSH_CONTENT, /* Flush incompatible content. 
*/ +}; + +static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq); + +static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) +{ + if (netfs_group && !folio_get_private(folio)) + folio_attach_private(folio, netfs_get_group(netfs_group)); +} + +#if IS_ENABLED(CONFIG_FSCACHE) +static void netfs_folio_start_fscache(bool caching, struct folio *folio) +{ + if (caching) + folio_start_fscache(folio); +} +#else +static void netfs_folio_start_fscache(bool caching, struct folio *folio) +{ +} +#endif + +/* + * Decide how we should modify a folio. We might be attempting to do + * write-streaming, in which case we don't want to a local RMW cycle if we can + * avoid it. If we're doing local caching or content crypto, we award that + * priority over avoiding RMW. If the file is open readably, then we also + * assume that we may want to read what we wrote. + */ +static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, + struct file *file, + struct folio *folio, + void *netfs_group, + size_t flen, + size_t offset, + size_t len, + bool maybe_trouble) +{ + struct netfs_folio *finfo = netfs_folio_info(folio); + loff_t pos = folio_file_pos(folio); + + _enter(""); + + if (netfs_folio_group(folio) != netfs_group) + return NETFS_FLUSH_CONTENT; + + if (folio_test_uptodate(folio)) + return NETFS_FOLIO_IS_UPTODATE; + + if (pos >= ctx->zero_point) + return NETFS_MODIFY_AND_CLEAR; + + if (!maybe_trouble && offset == 0 && len >= flen) + return NETFS_WHOLE_FOLIO_MODIFY; + + if (file->f_mode & FMODE_READ) + goto no_write_streaming; + if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags)) + goto no_write_streaming; + + if (netfs_is_cache_enabled(ctx)) { + /* We don't want to get a streaming write on a file that loses + * caching service temporarily because the backing store got + * culled. 
+ */ + if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags)) + set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags); + goto no_write_streaming; + } + + if (!finfo) + return NETFS_STREAMING_WRITE; + + /* We can continue a streaming write only if it continues on from the + * previous. If it overlaps, we must flush lest we suffer a partial + * copy and disjoint dirty regions. + */ + if (offset == finfo->dirty_offset + finfo->dirty_len) + return NETFS_STREAMING_WRITE_CONT; + return NETFS_FLUSH_CONTENT; + +no_write_streaming: + if (finfo) { + netfs_stat(&netfs_n_wh_wstream_conflict); + return NETFS_FLUSH_CONTENT; + } + return NETFS_JUST_PREFETCH; +} + +/* + * Grab a folio for writing and lock it. Attempt to allocate as large a folio + * as possible to hold as much of the remaining length as possible in one go. + */ +static struct folio *netfs_grab_folio_for_write(struct address_space *mapping, + loff_t pos, size_t part) +{ + pgoff_t index = pos / PAGE_SIZE; + fgf_t fgp_flags = FGP_WRITEBEGIN; + + if (mapping_large_folio_support(mapping)) + fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part); + + return __filemap_get_folio(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); +} + +/** + * netfs_perform_write - Copy data into the pagecache. + * @iocb: The operation parameters + * @iter: The source buffer + * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * + * Copy data into pagecache pages attached to the inode specified by @iocb. + * The caller must hold appropriate inode locks. + * + * Dirty pages are tagged with a netfs_folio struct if they're not up to date + * to indicate the range modified. Dirty pages may also be tagged with a + * netfs-specific grouping such that data from an old group gets flushed before + * a new one is started. 
+ */ +ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + struct netfs_group *netfs_group) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + struct netfs_inode *ctx = netfs_inode(inode); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .for_sync = true, + .nr_to_write = LONG_MAX, + .range_start = iocb->ki_pos, + .range_end = iocb->ki_pos + iter->count, + }; + struct netfs_io_request *wreq = NULL; + struct netfs_folio *finfo; + struct folio *folio; + enum netfs_how_to_modify howto; + enum netfs_folio_trace trace; + unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC; + ssize_t written = 0, ret; + loff_t i_size, pos = iocb->ki_pos, from, to; + size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; + bool maybe_trouble = false; + + if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) || + iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) + ) { + if (pos < i_size_read(inode)) { + ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count); + if (ret < 0) { + goto out; + } + } + + wbc_attach_fdatawrite_inode(&wbc, mapping->host); + + wreq = netfs_begin_writethrough(iocb, iter->count); + if (IS_ERR(wreq)) { + wbc_detach_inode(&wbc); + ret = PTR_ERR(wreq); + wreq = NULL; + goto out; + } + if (!is_sync_kiocb(iocb)) + wreq->iocb = iocb; + wreq->cleanup = netfs_cleanup_buffered_write; + } + + do { + size_t flen; + size_t offset; /* Offset into pagecache folio */ + size_t part; /* Bytes to write to folio */ + size_t copied; /* Bytes copied from user */ + + ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); + if (unlikely(ret < 0)) + break; + + offset = pos & (max_chunk - 1); + part = min(max_chunk - offset, iov_iter_count(iter)); + + /* Bring in the user pages that we will copy from _first_ lest + * we hit a nasty deadlock on copying from the same page as + * we're writing to, without it being marked uptodate. 
+ * + * Not only is this an optimisation, but it is also required to + * check that the address is actually valid, when atomic + * usercopies are used below. + * + * We rely on the page being held onto long enough by the LRU + * that we can grab it below if this causes it to be read. + */ + ret = -EFAULT; + if (unlikely(fault_in_iov_iter_readable(iter, part) == part)) + break; + + folio = netfs_grab_folio_for_write(mapping, pos, part); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + break; + } + + flen = folio_size(folio); + offset = pos & (flen - 1); + part = min_t(size_t, flen - offset, part); + + if (signal_pending(current)) { + ret = written ? -EINTR : -ERESTARTSYS; + goto error_folio_unlock; + } + + /* See if we need to prefetch the area we're going to modify. + * We need to do this before we get a lock on the folio in case + * there's more than one writer competing for the same cache + * block. + */ + howto = netfs_how_to_modify(ctx, file, folio, netfs_group, + flen, offset, part, maybe_trouble); + _debug("howto %u", howto); + switch (howto) { + case NETFS_JUST_PREFETCH: + ret = netfs_prefetch_for_write(file, folio, offset, part); + if (ret < 0) { + _debug("prefetch = %zd", ret); + goto error_folio_unlock; + } + break; + case NETFS_FOLIO_IS_UPTODATE: + case NETFS_WHOLE_FOLIO_MODIFY: + case NETFS_STREAMING_WRITE_CONT: + break; + case NETFS_MODIFY_AND_CLEAR: + zero_user_segment(&folio->page, 0, offset); + break; + case NETFS_STREAMING_WRITE: + ret = -EIO; + if (WARN_ON(folio_get_private(folio))) + goto error_folio_unlock; + break; + case NETFS_FLUSH_CONTENT: + trace_netfs_folio(folio, netfs_flush_content); + from = folio_pos(folio); + to = from + folio_size(folio) - 1; + folio_unlock(folio); + folio_put(folio); + ret = filemap_write_and_wait_range(mapping, from, to); + if (ret < 0) + goto error_folio_unlock; + continue; + } + + if (mapping_writably_mapped(mapping)) + flush_dcache_folio(folio); + + copied = copy_folio_from_iter_atomic(folio, offset, part, 
iter); + + flush_dcache_folio(folio); + + /* Deal with a (partially) failed copy */ + if (copied == 0) { + ret = -EFAULT; + goto error_folio_unlock; + } + + trace = (enum netfs_folio_trace)howto; + switch (howto) { + case NETFS_FOLIO_IS_UPTODATE: + case NETFS_JUST_PREFETCH: + netfs_set_group(folio, netfs_group); + break; + case NETFS_MODIFY_AND_CLEAR: + zero_user_segment(&folio->page, offset + copied, flen); + netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + break; + case NETFS_WHOLE_FOLIO_MODIFY: + if (unlikely(copied < part)) { + maybe_trouble = true; + iov_iter_revert(iter, copied); + copied = 0; + goto retry; + } + netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + break; + case NETFS_STREAMING_WRITE: + if (offset == 0 && copied == flen) { + netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + trace = netfs_streaming_filled_page; + break; + } + finfo = kzalloc(sizeof(*finfo), GFP_KERNEL); + if (!finfo) { + iov_iter_revert(iter, copied); + ret = -ENOMEM; + goto error_folio_unlock; + } + finfo->netfs_group = netfs_get_group(netfs_group); + finfo->dirty_offset = offset; + finfo->dirty_len = copied; + folio_attach_private(folio, (void *)((unsigned long)finfo | + NETFS_FOLIO_INFO)); + break; + case NETFS_STREAMING_WRITE_CONT: + finfo = netfs_folio_info(folio); + finfo->dirty_len += copied; + if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) { + if (finfo->netfs_group) + folio_change_private(folio, finfo->netfs_group); + else + folio_detach_private(folio); + folio_mark_uptodate(folio); + kfree(finfo); + trace = netfs_streaming_cont_filled_page; + } + break; + default: + WARN(true, "Unexpected modify type %u ix=%lx\n", + howto, folio->index); + ret = -EIO; + goto error_folio_unlock; + } + + trace_netfs_folio(folio, trace); + + /* Update the inode size if we moved the EOF marker */ + i_size = i_size_read(inode); + pos += copied; + if (pos > i_size) { + if (ctx->ops->update_i_size) { + 
ctx->ops->update_i_size(inode, pos); + } else { + i_size_write(inode, pos); +#if IS_ENABLED(CONFIG_FSCACHE) + fscache_update_cookie(ctx->cache, NULL, &pos); +#endif + } + } + written += copied; + + if (likely(!wreq)) { + folio_mark_dirty(folio); + } else { + if (folio_test_dirty(folio)) + /* Sigh. mmap. */ + folio_clear_dirty_for_io(folio); + /* We make multiple writes to the folio... */ + if (!folio_test_writeback(folio)) { + folio_wait_fscache(folio); + folio_start_writeback(folio); + folio_start_fscache(folio); + if (wreq->iter.count == 0) + trace_netfs_folio(folio, netfs_folio_trace_wthru); + else + trace_netfs_folio(folio, netfs_folio_trace_wthru_plus); + } + netfs_advance_writethrough(wreq, copied, + offset + copied == flen); + } + retry: + folio_unlock(folio); + folio_put(folio); + folio = NULL; + + cond_resched(); + } while (iov_iter_count(iter)); + +out: + if (unlikely(wreq)) { + ret = netfs_end_writethrough(wreq, iocb); + wbc_detach_inode(&wbc); + if (ret == -EIOCBQUEUED) + return ret; + } + + iocb->ki_pos += written; + _leave(" = %zd [%zd]", written, ret); + return written ? written : ret; + +error_folio_unlock: + folio_unlock(folio); + folio_put(folio); + goto out; +} +EXPORT_SYMBOL(netfs_perform_write); + +/** + * netfs_buffered_write_iter_locked - write data to a file + * @iocb: IO state structure (file, offset, etc.) + * @from: iov_iter with data to write + * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * + * This function does all the work needed for actually writing data to a + * file. It does all basic checks, removes SUID from the file, updates + * modification times and calls proper subroutines depending on whether we + * do direct IO or a standard buffered write. + * + * The caller must hold appropriate locks around this function and have called + * generic_write_checks() already. The caller is also responsible for doing + * any necessary syncing afterwards. 
+ * + * This function does *not* take care of syncing data in case of O_SYNC write. + * A caller has to handle it. This is mainly due to the fact that we want to + * avoid syncing under i_rwsem. + * + * Return: + * * number of bytes written, even for truncated writes + * * negative error code if no data has been written at all + */ +ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from, + struct netfs_group *netfs_group) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + trace_netfs_write_iter(iocb, from); + + ret = file_remove_privs(file); + if (ret) + return ret; + + ret = file_update_time(file); + if (ret) + return ret; + + return netfs_perform_write(iocb, from, netfs_group); +} +EXPORT_SYMBOL(netfs_buffered_write_iter_locked); + +/** + * netfs_file_write_iter - write data to a file + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * Perform a write to a file, writing into the pagecache if possible and doing + * an unbuffered write instead if not. 
+ * + * Return: + * * Negative error code if no data has been written at all or + * vfs_fsync_range() failed for a synchronous write + * * Number of bytes written, even for truncated writes + */ +ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct netfs_inode *ictx = netfs_inode(inode); + ssize_t ret; + + _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + + if (!iov_iter_count(from)) + return 0; + + if ((iocb->ki_flags & IOCB_DIRECT) || + test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)) + return netfs_unbuffered_write_iter(iocb, from); + + ret = netfs_start_io_write(inode); + if (ret < 0) + return ret; + + ret = generic_write_checks(iocb, from); + if (ret > 0) + ret = netfs_buffered_write_iter_locked(iocb, from, NULL); + netfs_end_io_write(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} +EXPORT_SYMBOL(netfs_file_write_iter); + +/* + * Notification that a previously read-only page is about to become writable. + * Note that the caller indicates a single page of a multipage folio. + */ +vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group) +{ + struct folio *folio = page_folio(vmf->page); + struct file *file = vmf->vma->vm_file; + struct inode *inode = file_inode(file); + vm_fault_t ret = VM_FAULT_RETRY; + int err; + + _enter("%lx", folio->index); + + sb_start_pagefault(inode->i_sb); + + if (folio_wait_writeback_killable(folio)) + goto out; + + if (folio_lock_killable(folio) < 0) + goto out; + + /* Can we see a streaming write here?
*/ + if (WARN_ON(!folio_test_uptodate(folio))) { + ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED; + goto out; + } + + if (netfs_folio_group(folio) != netfs_group) { + folio_unlock(folio); + err = filemap_fdatawait_range(inode->i_mapping, + folio_pos(folio), + folio_pos(folio) + folio_size(folio)); + switch (err) { + case 0: + ret = VM_FAULT_RETRY; + goto out; + case -ENOMEM: + ret = VM_FAULT_OOM; + goto out; + default: + ret = VM_FAULT_SIGBUS; + goto out; + } + } + + if (folio_test_dirty(folio)) + trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus); + else + trace_netfs_folio(folio, netfs_folio_trace_mkwrite); + netfs_set_group(folio, netfs_group); + file_update_time(file); + ret = VM_FAULT_LOCKED; +out: + sb_end_pagefault(inode->i_sb); + return ret; +} +EXPORT_SYMBOL(netfs_page_mkwrite); + +/* + * Kill all the pages in the given range + */ +static void netfs_kill_pages(struct address_space *mapping, + loff_t start, loff_t len) +{ + struct folio *folio; + pgoff_t index = start / PAGE_SIZE; + pgoff_t last = (start + len - 1) / PAGE_SIZE, next; + + _enter("%llx-%llx", start, start + len - 1); + + do { + _debug("kill %lx (to %lx)", index, last); + + folio = filemap_get_folio(mapping, index); + if (IS_ERR(folio)) { + next = index + 1; + continue; + } + + next = folio_next_index(folio); + + trace_netfs_folio(folio, netfs_folio_trace_kill); + folio_clear_uptodate(folio); + if (folio_test_fscache(folio)) + folio_end_fscache(folio); + folio_end_writeback(folio); + folio_lock(folio); + generic_error_remove_folio(mapping, folio); + folio_unlock(folio); + folio_put(folio); + + } while (index = next, index <= last); + + _leave(""); +} + +/* + * Redirty all the pages in a given range. 
+ */ +static void netfs_redirty_pages(struct address_space *mapping, + loff_t start, loff_t len) +{ + struct folio *folio; + pgoff_t index = start / PAGE_SIZE; + pgoff_t last = (start + len - 1) / PAGE_SIZE, next; + + _enter("%llx-%llx", start, start + len - 1); + + do { + _debug("redirty %llx @%llx", len, start); + + folio = filemap_get_folio(mapping, index); + if (IS_ERR(folio)) { + next = index + 1; + continue; + } + + next = folio_next_index(folio); + trace_netfs_folio(folio, netfs_folio_trace_redirty); + filemap_dirty_folio(mapping, folio); + if (folio_test_fscache(folio)) + folio_end_fscache(folio); + folio_end_writeback(folio); + folio_put(folio); + } while (index = next, index <= last); + + balance_dirty_pages_ratelimited(mapping); + + _leave(""); +} + +/* + * Completion of write to server + */ +static void netfs_pages_written_back(struct netfs_io_request *wreq) +{ + struct address_space *mapping = wreq->mapping; + struct netfs_folio *finfo; + struct netfs_group *group = NULL; + struct folio *folio; + pgoff_t last; + int gcount = 0; + + XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE); + + _enter("%llx-%llx", wreq->start, wreq->start + wreq->len); + + rcu_read_lock(); + + last = (wreq->start + wreq->len - 1) / PAGE_SIZE; + xas_for_each(&xas, folio, last) { + WARN(!folio_test_writeback(folio), + "bad %zx @%llx page %lx %lx\n", + wreq->len, wreq->start, folio->index, last); + + if ((finfo = netfs_folio_info(folio))) { + /* Streaming writes cannot be redirtied whilst under + * writeback, so discard the streaming record. + */ + folio_detach_private(folio); + group = finfo->netfs_group; + gcount++; + trace_netfs_folio(folio, netfs_folio_trace_clear_s); + kfree(finfo); + } else if ((group = netfs_folio_group(folio))) { + /* Need to detach the group pointer if the page didn't + * get redirtied. If it has been redirtied, then it + * must be within the same group. 
+ */ + if (folio_test_dirty(folio)) { + trace_netfs_folio(folio, netfs_folio_trace_redirtied); + goto end_wb; + } + if (folio_trylock(folio)) { + if (!folio_test_dirty(folio)) { + folio_detach_private(folio); + gcount++; + trace_netfs_folio(folio, netfs_folio_trace_clear_g); + } else { + trace_netfs_folio(folio, netfs_folio_trace_redirtied); + } + folio_unlock(folio); + goto end_wb; + } + + xas_pause(&xas); + rcu_read_unlock(); + folio_lock(folio); + if (!folio_test_dirty(folio)) { + folio_detach_private(folio); + gcount++; + trace_netfs_folio(folio, netfs_folio_trace_clear_g); + } else { + trace_netfs_folio(folio, netfs_folio_trace_redirtied); + } + folio_unlock(folio); + rcu_read_lock(); + } else { + trace_netfs_folio(folio, netfs_folio_trace_clear); + } + end_wb: + if (folio_test_fscache(folio)) + folio_end_fscache(folio); + xas_advance(&xas, folio_next_index(folio) - 1); + folio_end_writeback(folio); + } + + rcu_read_unlock(); + netfs_put_group_many(group, gcount); + _leave(""); +} + +/* + * Deal with the disposition of the folios that are under writeback to close + * out the operation. 
+ */ +static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq) +{ + struct address_space *mapping = wreq->mapping; + + _enter(""); + + switch (wreq->error) { + case 0: + netfs_pages_written_back(wreq); + break; + + default: + pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error); + fallthrough; + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + case -ENETRESET: + case -EDQUOT: + case -ENOSPC: + netfs_redirty_pages(mapping, wreq->start, wreq->len); + break; + + case -EROFS: + case -EIO: + case -EREMOTEIO: + case -EFBIG: + case -ENOENT: + case -ENOMEDIUM: + case -ENXIO: + netfs_kill_pages(mapping, wreq->start, wreq->len); + break; + } + + if (wreq->error) + mapping_set_error(mapping, wreq->error); + if (wreq->netfs_ops->done) + wreq->netfs_ops->done(wreq); +} + +/* + * Extend the region to be written back to include subsequent contiguously + * dirty pages if possible, but don't sleep while doing so. + * + * If this page holds new content, then we can include filler zeros in the + * writeback. + */ +static void netfs_extend_writeback(struct address_space *mapping, + struct netfs_group *group, + struct xa_state *xas, + long *_count, + loff_t start, + loff_t max_len, + bool caching, + size_t *_len, + size_t *_top) +{ + struct netfs_folio *finfo; + struct folio_batch fbatch; + struct folio *folio; + unsigned int i; + pgoff_t index = (start + *_len) / PAGE_SIZE; + size_t len; + void *priv; + bool stop = true; + + folio_batch_init(&fbatch); + + do { + /* Firstly, we gather up a batch of contiguous dirty pages + * under the RCU read lock - but we can't clear the dirty flags + * there if any of those pages are mapped. 
+ */ + rcu_read_lock(); + + xas_for_each(xas, folio, ULONG_MAX) { + stop = true; + if (xas_retry(xas, folio)) + continue; + if (xa_is_value(folio)) + break; + if (folio->index != index) { + xas_reset(xas); + break; + } + + if (!folio_try_get_rcu(folio)) { + xas_reset(xas); + continue; + } + + /* Has the folio moved or been split? */ + if (unlikely(folio != xas_reload(xas))) { + folio_put(folio); + xas_reset(xas); + break; + } + + if (!folio_trylock(folio)) { + folio_put(folio); + xas_reset(xas); + break; + } + if (!folio_test_dirty(folio) || + folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + folio_put(folio); + xas_reset(xas); + break; + } + + stop = false; + len = folio_size(folio); + priv = folio_get_private(folio); + if ((const struct netfs_group *)priv != group) { + stop = true; + finfo = netfs_folio_info(folio); + if (finfo->netfs_group != group || + finfo->dirty_offset > 0) { + folio_unlock(folio); + folio_put(folio); + xas_reset(xas); + break; + } + len = finfo->dirty_len; + } + + *_top += folio_size(folio); + index += folio_nr_pages(folio); + *_count -= folio_nr_pages(folio); + *_len += len; + if (*_len >= max_len || *_count <= 0) + stop = true; + + if (!folio_batch_add(&fbatch, folio)) + break; + if (stop) + break; + } + + xas_pause(xas); + rcu_read_unlock(); + + /* Now, if we obtained any folios, we can shift them to being + * writable and mark them for caching. + */ + if (!folio_batch_count(&fbatch)) + break; + + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; + trace_netfs_folio(folio, netfs_folio_trace_store_plus); + + if (!folio_clear_dirty_for_io(folio)) + BUG(); + folio_start_writeback(folio); + netfs_folio_start_fscache(caching, folio); + folio_unlock(folio); + } + + folio_batch_release(&fbatch); + cond_resched(); + } while (!stop); +} + +/* + * Synchronously write back the locked page and any subsequent non-locked dirty + * pages. 
+ */ +static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping, + struct writeback_control *wbc, + struct netfs_group *group, + struct xa_state *xas, + struct folio *folio, + unsigned long long start, + unsigned long long end) +{ + struct netfs_io_request *wreq; + struct netfs_folio *finfo; + struct netfs_inode *ctx = netfs_inode(mapping->host); + unsigned long long i_size = i_size_read(&ctx->inode); + size_t len, max_len; + bool caching = netfs_is_cache_enabled(ctx); + long count = wbc->nr_to_write; + int ret; + + _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching); + + wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio), + NETFS_WRITEBACK); + if (IS_ERR(wreq)) { + folio_unlock(folio); + return PTR_ERR(wreq); + } + + if (!folio_clear_dirty_for_io(folio)) + BUG(); + folio_start_writeback(folio); + netfs_folio_start_fscache(caching, folio); + + count -= folio_nr_pages(folio); + + /* Find all consecutive lockable dirty pages that have contiguous + * written regions, stopping when we find a page that is not + * immediately lockable, is not dirty or is missing, or we reach the + * end of the range. + */ + trace_netfs_folio(folio, netfs_folio_trace_store); + + len = wreq->len; + finfo = netfs_folio_info(folio); + if (finfo) { + start += finfo->dirty_offset; + if (finfo->dirty_offset + finfo->dirty_len != len) { + len = finfo->dirty_len; + goto cant_expand; + } + len = finfo->dirty_len; + } + + if (start < i_size) { + /* Trim the write to the EOF; the extra data is ignored. Also + * put an upper limit on the size of a single storedata op. 
+ */ + max_len = 65536 * 4096; + max_len = min_t(unsigned long long, max_len, end - start + 1); + max_len = min_t(unsigned long long, max_len, i_size - start); + + if (len < max_len) + netfs_extend_writeback(mapping, group, xas, &count, start, + max_len, caching, &len, &wreq->upper_len); + } + +cant_expand: + len = min_t(unsigned long long, len, i_size - start); + + /* We now have a contiguous set of dirty pages, each with writeback + * set; the first page is still locked at this point, but all the rest + * have been unlocked. + */ + folio_unlock(folio); + wreq->start = start; + wreq->len = len; + + if (start < i_size) { + _debug("write back %zx @%llx [%llx]", len, start, i_size); + + /* Speculatively write to the cache. We have to fix this up + * later if the store fails. + */ + wreq->cleanup = netfs_cleanup_buffered_write; + + iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start, + wreq->upper_len); + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback); + if (ret == 0 || ret == -EIOCBQUEUED) + wbc->nr_to_write -= len / PAGE_SIZE; + } else { + _debug("write discard %zx @%llx [%llx]", len, start, i_size); + + /* The dirty region was entirely beyond the EOF. */ + fscache_clear_page_bits(mapping, start, len, caching); + netfs_pages_written_back(wreq); + ret = 0; + } + + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + _leave(" = 1"); + return 1; +} + +/* + * Write a region of pages back to the server + */ +static ssize_t netfs_writepages_begin(struct address_space *mapping, + struct writeback_control *wbc, + struct netfs_group *group, + struct xa_state *xas, + unsigned long long *_start, + unsigned long long end) +{ + const struct netfs_folio *finfo; + struct folio *folio; + unsigned long long start = *_start; + ssize_t ret; + void *priv; + int skips = 0; + + _enter("%llx,%llx,", start, end); + +search_again: + /* Find the first dirty page in the group. 
*/ + rcu_read_lock(); + + for (;;) { + folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY); + if (xas_retry(xas, folio) || xa_is_value(folio)) + continue; + if (!folio) + break; + + if (!folio_try_get_rcu(folio)) { + xas_reset(xas); + continue; + } + + if (unlikely(folio != xas_reload(xas))) { + folio_put(folio); + xas_reset(xas); + continue; + } + + /* Skip any dirty folio that's not in the group of interest. */ + priv = folio_get_private(folio); + if ((const struct netfs_group *)priv != group) { + finfo = netfs_folio_info(folio); + if (finfo->netfs_group != group) { + folio_put(folio); + continue; + } + } + + xas_pause(xas); + break; + } + rcu_read_unlock(); + if (!folio) + return 0; + + start = folio_pos(folio); /* May regress with THPs */ + + _debug("wback %lx", folio->index); + + /* At this point we hold neither the i_pages lock nor the page lock: + * the page may be truncated or invalidated (changing page->mapping to + * NULL), or even swizzled back from swapper_space to tmpfs file + * mapping + */ +lock_again: + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = folio_lock_killable(folio); + if (ret < 0) + return ret; + } else { + if (!folio_trylock(folio)) + goto search_again; + } + + if (folio->mapping != mapping || + !folio_test_dirty(folio)) { + start += folio_size(folio); + folio_unlock(folio); + goto search_again; + } + + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + if (wbc->sync_mode != WB_SYNC_NONE) { + folio_wait_writeback(folio); +#ifdef CONFIG_FSCACHE + folio_wait_fscache(folio); +#endif + goto lock_again; + } + + start += folio_size(folio); + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) { + ret = 0; + goto out; + } + skips++; + } + goto search_again; + } + + ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas, + folio, start, end); +out: + if (ret > 0) + *_start = start + ret; + _leave(" = %zd [%llx]", ret, *_start); + return ret; +} + +/* + * Write 
a region of pages back to the server + */ +static int netfs_writepages_region(struct address_space *mapping, + struct writeback_control *wbc, + struct netfs_group *group, + unsigned long long *_start, + unsigned long long end) +{ + ssize_t ret; + + XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE); + + do { + ret = netfs_writepages_begin(mapping, wbc, group, &xas, + _start, end); + if (ret > 0 && wbc->nr_to_write > 0) + cond_resched(); + } while (ret > 0 && wbc->nr_to_write > 0); + + return ret > 0 ? 0 : ret; +} + +/* + * write some of the pending data back to the server + */ +int netfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct netfs_group *group = NULL; + loff_t start, end; + int ret; + + _enter(""); + + /* We have to be careful as we can end up racing with setattr() + * truncating the pagecache since the caller doesn't take a lock here + * to prevent it. + */ + + if (wbc->range_cyclic && mapping->writeback_index) { + start = mapping->writeback_index * PAGE_SIZE; + ret = netfs_writepages_region(mapping, wbc, group, + &start, LLONG_MAX); + if (ret < 0) + goto out; + + if (wbc->nr_to_write <= 0) { + mapping->writeback_index = start / PAGE_SIZE; + goto out; + } + + start = 0; + end = mapping->writeback_index * PAGE_SIZE; + mapping->writeback_index = 0; + ret = netfs_writepages_region(mapping, wbc, group, &start, end); + if (ret == 0) + mapping->writeback_index = start / PAGE_SIZE; + } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { + start = 0; + ret = netfs_writepages_region(mapping, wbc, group, + &start, LLONG_MAX); + if (wbc->nr_to_write > 0 && ret == 0) + mapping->writeback_index = start / PAGE_SIZE; + } else { + start = wbc->range_start; + ret = netfs_writepages_region(mapping, wbc, group, + &start, wbc->range_end); + } + +out: + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(netfs_writepages); + +/* + * Deal with the disposition of a laundered folio. 
+ */ +static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq) +{ + if (wreq->error) { + pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error); + mapping_set_error(wreq->mapping, wreq->error); + } +} + +/** + * netfs_launder_folio - Clean up a dirty folio that's being invalidated + * @folio: The folio to clean + * + * This is called to write back a folio that's being invalidated when an inode + * is getting torn down. Ideally, writepages would be used instead. + */ +int netfs_launder_folio(struct folio *folio) +{ + struct netfs_io_request *wreq; + struct address_space *mapping = folio->mapping; + struct netfs_folio *finfo = netfs_folio_info(folio); + struct netfs_group *group = netfs_folio_group(folio); + struct bio_vec bvec; + unsigned long long i_size = i_size_read(mapping->host); + unsigned long long start = folio_pos(folio); + size_t offset = 0, len; + int ret = 0; + + if (finfo) { + offset = finfo->dirty_offset; + start += offset; + len = finfo->dirty_len; + } else { + len = folio_size(folio); + } + len = min_t(unsigned long long, len, i_size - start); + + wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE); + if (IS_ERR(wreq)) { + ret = PTR_ERR(wreq); + goto out; + } + + if (!folio_clear_dirty_for_io(folio)) + goto out_put; + + trace_netfs_folio(folio, netfs_folio_trace_launder); + + _debug("launder %llx-%llx", start, start + len - 1); + + /* Speculatively write to the cache. We have to fix this up later if + * the store fails. 
+ */ + wreq->cleanup = netfs_cleanup_launder_folio; + + bvec_set_folio(&bvec, folio, len, offset); + iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len); + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + ret = netfs_begin_write(wreq, true, netfs_write_trace_launder); + +out_put: + folio_detach_private(folio); + netfs_put_group(group); + kfree(finfo); + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); +out: + folio_wait_fscache(folio); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(netfs_launder_folio); diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c new file mode 100644 index 000000000000..ad4370b3935d --- /dev/null +++ b/fs/netfs/direct_read.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Direct I/O support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/uio.h> +#include <linux/sched/mm.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/netfs.h> +#include "internal.h" + +/** + * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read + * @iocb: The I/O control descriptor describing the read + * @iter: The output buffer (also specifies read length) + * + * Perform an unbuffered I/O or direct I/O from the file in @iocb to the + * output buffer. No use is made of the pagecache. + * + * The caller must hold any appropriate locks. 
+ */ +static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter) +{ + struct netfs_io_request *rreq; + ssize_t ret; + size_t orig_count = iov_iter_count(iter); + bool async = !is_sync_kiocb(iocb); + + _enter(""); + + if (!orig_count) + return 0; /* Don't update atime */ + + ret = kiocb_write_and_wait(iocb, orig_count); + if (ret < 0) + return ret; + file_accessed(iocb->ki_filp); + + rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, + iocb->ki_pos, orig_count, + NETFS_DIO_READ); + if (IS_ERR(rreq)) + return PTR_ERR(rreq); + + netfs_stat(&netfs_n_rh_dio_read); + trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read); + + /* If this is an async op, we have to keep track of the destination + * buffer for ourselves as the caller's iterator will be trashed when + * we return. + * + * In such a case, extract an iterator to represent as much of the + * output buffer as we can manage. Note that the extraction might not + * be able to allocate a sufficiently large bvec array and may shorten + * the request.
+ */ + if (user_backed_iter(iter)) { + ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0); + if (ret < 0) + goto out; + rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec; + rreq->direct_bv_count = ret; + rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); + rreq->len = iov_iter_count(&rreq->iter); + } else { + rreq->iter = *iter; + rreq->len = orig_count; + rreq->direct_bv_unpin = false; + iov_iter_advance(iter, orig_count); + } + + // TODO: Set up bounce buffer if needed + + if (async) + rreq->iocb = iocb; + + ret = netfs_begin_read(rreq, is_sync_kiocb(iocb)); + if (ret < 0) + goto out; /* May be -EIOCBQUEUED */ + if (!async) { + // TODO: Copy from bounce buffer + iocb->ki_pos += rreq->transferred; + ret = rreq->transferred; + } + +out: + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + if (ret > 0) + orig_count -= ret; + if (ret != -EIOCBQUEUED) + iov_iter_revert(iter, orig_count - iov_iter_count(iter)); + return ret; +} + +/** + * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read + * @iocb: The I/O control descriptor describing the read + * @iter: The output buffer (also specifies read length) + * + * Perform an unbuffered I/O or direct I/O from the file in @iocb to the + * output buffer. No use is made of the pagecache. + */ +ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (!iter->count) + return 0; /* Don't update atime */ + + ret = netfs_start_io_direct(inode); + if (ret == 0) { + ret = netfs_unbuffered_read_iter_locked(iocb, iter); + netfs_end_io_direct(inode); + } + return ret; +} +EXPORT_SYMBOL(netfs_unbuffered_read_iter); diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c new file mode 100644 index 000000000000..bee047e20f5d --- /dev/null +++ b/fs/netfs/direct_write.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Unbuffered and direct write support. 
+ * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/uio.h> +#include "internal.h" + +static void netfs_cleanup_dio_write(struct netfs_io_request *wreq) +{ + struct inode *inode = wreq->inode; + unsigned long long end = wreq->start + wreq->len; + + if (!wreq->error && + i_size_read(inode) < end) { + if (wreq->netfs_ops->update_i_size) + wreq->netfs_ops->update_i_size(inode, end); + else + i_size_write(inode, end); + } +} + +/* + * Perform an unbuffered write where we may have to do an RMW operation on an + * encrypted file. This can also be used for direct I/O writes. + */ +static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, + struct netfs_group *netfs_group) +{ + struct netfs_io_request *wreq; + unsigned long long start = iocb->ki_pos; + unsigned long long end = start + iov_iter_count(iter); + ssize_t ret, n; + bool async = !is_sync_kiocb(iocb); + + _enter(""); + + /* We're going to need a bounce buffer if what we transmit is going to + * be different in some way to the source buffer, e.g. because it gets + * encrypted/compressed or because it needs expanding to a block size. + */ + // TODO + + _debug("uw %llx-%llx", start, end); + + wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, + start, end - start, + iocb->ki_flags & IOCB_DIRECT ? + NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE); + if (IS_ERR(wreq)) + return PTR_ERR(wreq); + + { + /* If this is an async op and we're not using a bounce buffer, + * we have to save the source buffer as the iterator is only + * good until we return. In such a case, extract an iterator + * to represent as much of the source buffer as we can + * manage. Note that the extraction might not be able to + * allocate a sufficiently large bvec array and may shorten the + * request.
+ */ + if (async || user_backed_iter(iter)) { + n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0); + if (n < 0) { + ret = n; + goto out; + } + wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec; + wreq->direct_bv_count = n; + wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); + wreq->len = iov_iter_count(&wreq->iter); + } else { + wreq->iter = *iter; + } + + wreq->io_iter = wreq->iter; + } + + /* Copy the data into the bounce buffer and encrypt it. */ + // TODO + + /* Dispatch the write. */ + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + if (async) + wreq->iocb = iocb; + wreq->cleanup = netfs_cleanup_dio_write; + ret = netfs_begin_write(wreq, is_sync_kiocb(iocb), + iocb->ki_flags & IOCB_DIRECT ? + netfs_write_trace_dio_write : + netfs_write_trace_unbuffered_write); + if (ret < 0) { + _debug("begin = %zd", ret); + goto out; + } + + if (!async) { + trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + + ret = wreq->error; + _debug("waited = %zd", ret); + if (ret == 0) { + ret = wreq->transferred; + iocb->ki_pos += ret; + } + } else { + ret = -EIOCBQUEUED; + } + +out: + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + return ret; +} + +/** + * netfs_unbuffered_write_iter - Unbuffered write to a file + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * Do an unbuffered write to a file, writing the data directly to the server + * and not lodging the data in the pagecache. 
+ * + * Return: + * * Negative error code if no data has been written at all or + * vfs_fsync_range() failed for a synchronous write + * * Number of bytes written, even for truncated writes + */ +ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct netfs_inode *ictx = netfs_inode(inode); + unsigned long long end; + ssize_t ret; + + _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + + if (!iov_iter_count(from)) + return 0; + + trace_netfs_write_iter(iocb, from); + netfs_stat(&netfs_n_rh_dio_write); + + ret = netfs_start_io_direct(inode); + if (ret < 0) + return ret; + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + ret = file_remove_privs(file); + if (ret < 0) + goto out; + ret = file_update_time(file); + if (ret < 0) + goto out; + ret = kiocb_invalidate_pages(iocb, iov_iter_count(from)); + if (ret < 0) + goto out; + end = iocb->ki_pos + iov_iter_count(from); + if (end > ictx->zero_point) + ictx->zero_point = end; + + fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode), + FSCACHE_INVAL_DIO_WRITE); + ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL); +out: + netfs_end_io_direct(inode); + return ret; +} +EXPORT_SYMBOL(netfs_unbuffered_write_iter); diff --git a/fs/fscache/cache.c b/fs/netfs/fscache_cache.c index d645f8b302a2..9397ed39b0b4 100644 --- a/fs/fscache/cache.c +++ b/fs/netfs/fscache_cache.c @@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache); void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where) { - unsigned int debug_id = cache->debug_id; + unsigned int debug_id; bool zero; int ref; if (IS_ERR_OR_NULL(cache)) return; + debug_id = cache->debug_id; zero = __refcount_dec_and_test(&cache->ref, &ref); trace_fscache_cache(debug_id, ref - 1, where); diff --git a/fs/fscache/cookie.c b/fs/netfs/fscache_cookie.c index
bce2492186d0..bce2492186d0 100644 --- a/fs/fscache/cookie.c +++ b/fs/netfs/fscache_cookie.c diff --git a/fs/netfs/fscache_internal.h b/fs/netfs/fscache_internal.h new file mode 100644 index 000000000000..a09b948fcef2 --- /dev/null +++ b/fs/netfs/fscache_internal.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Internal definitions for FS-Cache + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include "internal.h" + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "FS-Cache: " fmt diff --git a/fs/fscache/io.c b/fs/netfs/fscache_io.c index 0d2b8dec8f82..ad572f7ee897 100644 --- a/fs/fscache/io.c +++ b/fs/netfs/fscache_io.c @@ -158,46 +158,6 @@ int __fscache_begin_write_operation(struct netfs_cache_resources *cres, } EXPORT_SYMBOL(__fscache_begin_write_operation); -/** - * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback - * @mapping: The mapping the folio belongs to. - * @folio: The folio being dirtied. - * @cookie: The cookie referring to the cache object - * - * Set the dirty flag on a folio and pin an in-use cache object in memory - * so that writeback can later write to it. This is intended - * to be called from the filesystem's ->dirty_folio() method. - * - * Return: true if the dirty flag was set on the folio, false otherwise. 
- */ -bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio, - struct fscache_cookie *cookie) -{ - struct inode *inode = mapping->host; - bool need_use = false; - - _enter(""); - - if (!filemap_dirty_folio(mapping, folio)) - return false; - if (!fscache_cookie_valid(cookie)) - return true; - - if (!(inode->i_state & I_PINNING_FSCACHE_WB)) { - spin_lock(&inode->i_lock); - if (!(inode->i_state & I_PINNING_FSCACHE_WB)) { - inode->i_state |= I_PINNING_FSCACHE_WB; - need_use = true; - } - spin_unlock(&inode->i_lock); - - if (need_use) - fscache_use_cookie(cookie, true); - } - return true; -} -EXPORT_SYMBOL(fscache_dirty_folio); - struct fscache_write_request { struct netfs_cache_resources cache_resources; struct address_space *mapping; @@ -277,7 +237,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, fscache_access_io_write) < 0) goto abandon_free; - ret = cres->ops->prepare_write(cres, &start, &len, i_size, false); + ret = cres->ops->prepare_write(cres, &start, &len, len, i_size, false); if (ret < 0) goto abandon_end; diff --git a/fs/fscache/main.c b/fs/netfs/fscache_main.c index dad85fd84f6f..42e98bb523e3 100644 --- a/fs/fscache/main.c +++ b/fs/netfs/fscache_main.c @@ -8,18 +8,9 @@ #define FSCACHE_DEBUG_LEVEL CACHE #include <linux/module.h> #include <linux/init.h> -#define CREATE_TRACE_POINTS #include "internal.h" - -MODULE_DESCRIPTION("FS Cache Manager"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - -unsigned fscache_debug; -module_param_named(debug, fscache_debug, uint, - S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(fscache_debug, - "FS-Cache debugging mask"); +#define CREATE_TRACE_POINTS +#include <trace/events/fscache.h> EXPORT_TRACEPOINT_SYMBOL(fscache_access_cache); EXPORT_TRACEPOINT_SYMBOL(fscache_access_volume); @@ -71,7 +62,7 @@ unsigned int fscache_hash(unsigned int salt, const void *data, size_t len) /* * initialise the fs caching module */ -static int __init fscache_init(void) +int __init fscache_init(void) { int 
ret = -ENOMEM; @@ -92,7 +83,7 @@ static int __init fscache_init(void) goto error_cookie_jar; } - pr_notice("Loaded\n"); + pr_notice("FS-Cache loaded\n"); return 0; error_cookie_jar: @@ -103,19 +94,15 @@ error_wq: return ret; } -fs_initcall(fscache_init); - /* * clean up on module removal */ -static void __exit fscache_exit(void) +void __exit fscache_exit(void) { _enter(""); kmem_cache_destroy(fscache_cookie_jar); fscache_proc_cleanup(); destroy_workqueue(fscache_wq); - pr_notice("Unloaded\n"); + pr_notice("FS-Cache unloaded\n"); } - -module_exit(fscache_exit); diff --git a/fs/fscache/proc.c b/fs/netfs/fscache_proc.c index dc3b0e9c8cce..874d951bc390 100644 --- a/fs/fscache/proc.c +++ b/fs/netfs/fscache_proc.c @@ -12,41 +12,34 @@ #include "internal.h" /* - * initialise the /proc/fs/fscache/ directory + * Add files to /proc/fs/netfs/. */ int __init fscache_proc_init(void) { - if (!proc_mkdir("fs/fscache", NULL)) - goto error_dir; + if (!proc_symlink("fs/fscache", NULL, "netfs")) + goto error_sym; - if (!proc_create_seq("fs/fscache/caches", S_IFREG | 0444, NULL, + if (!proc_create_seq("fs/netfs/caches", S_IFREG | 0444, NULL, &fscache_caches_seq_ops)) goto error; - if (!proc_create_seq("fs/fscache/volumes", S_IFREG | 0444, NULL, + if (!proc_create_seq("fs/netfs/volumes", S_IFREG | 0444, NULL, &fscache_volumes_seq_ops)) goto error; - if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL, + if (!proc_create_seq("fs/netfs/cookies", S_IFREG | 0444, NULL, &fscache_cookies_seq_ops)) goto error; - -#ifdef CONFIG_FSCACHE_STATS - if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL, - fscache_stats_show)) - goto error; -#endif - return 0; error: remove_proc_entry("fs/fscache", NULL); -error_dir: +error_sym: return -ENOMEM; } /* - * clean up the /proc/fs/fscache/ directory + * Clean up the /proc/fs/fscache symlink. 
*/ void fscache_proc_cleanup(void) { diff --git a/fs/fscache/stats.c b/fs/netfs/fscache_stats.c index fc94e5e79f1c..add21abdf713 100644 --- a/fs/fscache/stats.c +++ b/fs/netfs/fscache_stats.c @@ -48,13 +48,15 @@ atomic_t fscache_n_no_create_space; EXPORT_SYMBOL(fscache_n_no_create_space); atomic_t fscache_n_culled; EXPORT_SYMBOL(fscache_n_culled); +atomic_t fscache_n_dio_misfit; +EXPORT_SYMBOL(fscache_n_dio_misfit); /* * display the general statistics */ -int fscache_stats_show(struct seq_file *m, void *v) +int fscache_stats_show(struct seq_file *m) { - seq_puts(m, "FS-Cache statistics\n"); + seq_puts(m, "-- FS-Cache statistics --\n"); seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n", atomic_read(&fscache_n_cookies), atomic_read(&fscache_n_volumes), @@ -93,10 +95,9 @@ int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_no_create_space), atomic_read(&fscache_n_culled)); - seq_printf(m, "IO : rd=%u wr=%u\n", + seq_printf(m, "IO : rd=%u wr=%u mis=%u\n", atomic_read(&fscache_n_read), - atomic_read(&fscache_n_write)); - - netfs_stats_show(m); + atomic_read(&fscache_n_write), + atomic_read(&fscache_n_dio_misfit)); return 0; } diff --git a/fs/fscache/volume.c b/fs/netfs/fscache_volume.c index cdf991bdd9de..cdf991bdd9de 100644 --- a/fs/fscache/volume.c +++ b/fs/netfs/fscache_volume.c diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 43fac1b14e40..ec7045d24400 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -5,9 +5,13 @@ * Written by David Howells (dhowells@redhat.com) */ +#include <linux/slab.h> +#include <linux/seq_file.h> #include <linux/netfs.h> #include <linux/fscache.h> +#include <linux/fscache-cache.h> #include <trace/events/netfs.h> +#include <trace/events/fscache.h> #ifdef pr_fmt #undef pr_fmt @@ -19,6 +23,8 @@ * buffered_read.c */ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq); +int netfs_prefetch_for_write(struct file *file, struct folio *folio, + size_t offset, size_t len); /* * io.c @@ -29,6 
+35,41 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync); * main.c */ extern unsigned int netfs_debug; +extern struct list_head netfs_io_requests; +extern spinlock_t netfs_proc_lock; + +#ifdef CONFIG_PROC_FS +static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) +{ + spin_lock(&netfs_proc_lock); + list_add_tail_rcu(&rreq->proc_link, &netfs_io_requests); + spin_unlock(&netfs_proc_lock); +} +static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) +{ + if (!list_empty(&rreq->proc_link)) { + spin_lock(&netfs_proc_lock); + list_del_rcu(&rreq->proc_link); + spin_unlock(&netfs_proc_lock); + } +} +#else +static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) {} +static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} +#endif + +/* + * misc.c + */ +#define NETFS_FLAG_PUT_MARK BIT(0) +#define NETFS_FLAG_PAGECACHE_MARK BIT(1) +int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, + struct folio *folio, unsigned int flags, + gfp_t gfp_mask); +int netfs_add_folios_to_buffer(struct xarray *buffer, + struct address_space *mapping, + pgoff_t index, pgoff_t to, gfp_t gfp_mask); +void netfs_clear_buffer(struct xarray *buffer); /* * objects.c @@ -50,9 +91,20 @@ static inline void netfs_see_request(struct netfs_io_request *rreq, } /* + * output.c + */ +int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, + enum netfs_write_trace what); +struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len); +int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end); +int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb); + +/* * stats.c */ #ifdef CONFIG_NETFS_STATS +extern atomic_t netfs_n_rh_dio_read; +extern atomic_t netfs_n_rh_dio_write; extern atomic_t netfs_n_rh_readahead; extern atomic_t netfs_n_rh_readpage; extern atomic_t netfs_n_rh_rreq; @@ -71,7 +123,15 @@ extern atomic_t netfs_n_rh_write_begin; 
extern atomic_t netfs_n_rh_write_done; extern atomic_t netfs_n_rh_write_failed; extern atomic_t netfs_n_rh_write_zskip; +extern atomic_t netfs_n_wh_wstream_conflict; +extern atomic_t netfs_n_wh_upload; +extern atomic_t netfs_n_wh_upload_done; +extern atomic_t netfs_n_wh_upload_failed; +extern atomic_t netfs_n_wh_write; +extern atomic_t netfs_n_wh_write_done; +extern atomic_t netfs_n_wh_write_failed; +int netfs_stats_show(struct seq_file *m, void *v); static inline void netfs_stat(atomic_t *stat) { @@ -103,6 +163,176 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx) #endif } +/* + * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap). + */ +static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group) +{ + if (netfs_group) + refcount_inc(&netfs_group->ref); + return netfs_group; +} + +/* + * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap). + */ +static inline void netfs_put_group(struct netfs_group *netfs_group) +{ + if (netfs_group && refcount_dec_and_test(&netfs_group->ref)) + netfs_group->free(netfs_group); +} + +/* + * Dispose of multiple refs on a netfs group attached to a dirty page (e.g. a ceph snap). 
+ */ +static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr) +{ + if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref)) + netfs_group->free(netfs_group); +} + +/* + * fscache-cache.c + */ +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_caches_seq_ops; +#endif +bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); +void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); +struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache); +void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where); + +static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache) +{ + return smp_load_acquire(&cache->state); +} + +static inline bool fscache_cache_is_live(const struct fscache_cache *cache) +{ + return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE; +} + +static inline void fscache_set_cache_state(struct fscache_cache *cache, + enum fscache_cache_state new_state) +{ + smp_store_release(&cache->state, new_state); + +} + +static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache, + enum fscache_cache_state old_state, + enum fscache_cache_state new_state) +{ + return try_cmpxchg_release(&cache->state, &old_state, new_state); +} + +/* + * fscache-cookie.c + */ +extern struct kmem_cache *fscache_cookie_jar; +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_cookies_seq_ops; +#endif +extern struct timer_list fscache_cookie_lru_timer; + +extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix); +extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie, + enum fscache_access_trace why); + +static inline void fscache_see_cookie(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) +{ + trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref), + where); +} + +/* + * fscache-main.c + */ +extern 
unsigned int fscache_hash(unsigned int salt, const void *data, size_t len); +#ifdef CONFIG_FSCACHE +int __init fscache_init(void); +void __exit fscache_exit(void); +#else +static inline int fscache_init(void) { return 0; } +static inline void fscache_exit(void) {} +#endif + +/* + * fscache-proc.c + */ +#ifdef CONFIG_PROC_FS +extern int __init fscache_proc_init(void); +extern void fscache_proc_cleanup(void); +#else +#define fscache_proc_init() (0) +#define fscache_proc_cleanup() do {} while (0) +#endif + +/* + * fscache-stats.c + */ +#ifdef CONFIG_FSCACHE_STATS +extern atomic_t fscache_n_volumes; +extern atomic_t fscache_n_volumes_collision; +extern atomic_t fscache_n_volumes_nomem; +extern atomic_t fscache_n_cookies; +extern atomic_t fscache_n_cookies_lru; +extern atomic_t fscache_n_cookies_lru_expired; +extern atomic_t fscache_n_cookies_lru_removed; +extern atomic_t fscache_n_cookies_lru_dropped; + +extern atomic_t fscache_n_acquires; +extern atomic_t fscache_n_acquires_ok; +extern atomic_t fscache_n_acquires_oom; + +extern atomic_t fscache_n_invalidates; + +extern atomic_t fscache_n_relinquishes; +extern atomic_t fscache_n_relinquishes_retire; +extern atomic_t fscache_n_relinquishes_dropped; + +extern atomic_t fscache_n_resizes; +extern atomic_t fscache_n_resizes_null; + +static inline void fscache_stat(atomic_t *stat) +{ + atomic_inc(stat); +} + +static inline void fscache_stat_d(atomic_t *stat) +{ + atomic_dec(stat); +} + +#define __fscache_stat(stat) (stat) + +int fscache_stats_show(struct seq_file *m); +#else + +#define __fscache_stat(stat) (NULL) +#define fscache_stat(stat) do {} while (0) +#define fscache_stat_d(stat) do {} while (0) + +static inline int fscache_stats_show(struct seq_file *m) { return 0; } +#endif + +/* + * fscache-volume.c + */ +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_volumes_seq_ops; +#endif + +struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +void 
fscache_put_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +bool fscache_begin_volume_access(struct fscache_volume *volume, + struct fscache_cookie *cookie, + enum fscache_access_trace why); +void fscache_create_volume(struct fscache_volume *volume, bool wait); + /*****************************************************************************/ /* * debug tracing @@ -143,3 +373,57 @@ do { \ #define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) #define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) #endif + +/* + * assertions + */ +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif /* assert or not */ diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 7f753380e047..4261ad6c55b6 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -21,12 +21,7 @@ */ static void netfs_clear_unread(struct netfs_io_subrequest *subreq) { - struct iov_iter iter; - - iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - 
subreq->transferred); - iov_iter_zero(iov_iter_count(&iter), &iter); + iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter); } static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, @@ -46,14 +41,9 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq, enum netfs_read_from_hole read_hole) { struct netfs_cache_resources *cres = &rreq->cache_resources; - struct iov_iter iter; netfs_stat(&netfs_n_rh_read); - iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - subreq->transferred); - - cres->ops->read(cres, subreq->start, &iter, read_hole, + cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole, netfs_cache_read_terminated, subreq); } @@ -88,6 +78,13 @@ static void netfs_read_from_server(struct netfs_io_request *rreq, struct netfs_io_subrequest *subreq) { netfs_stat(&netfs_n_rh_download); + + if (rreq->origin != NETFS_DIO_READ && + iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) + pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n", + rreq->debug_id, subreq->debug_index, + iov_iter_count(&subreq->io_iter), subreq->len, + subreq->transferred, subreq->flags); rreq->netfs_ops->issue_read(subreq); } @@ -127,9 +124,10 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, /* We might have multiple writes from the same huge * folio, but we mustn't unlock a folio more than once. 
*/ - if (have_unlocked && folio_index(folio) <= unlocked) + if (have_unlocked && folio->index <= unlocked) continue; - unlocked = folio_index(folio); + unlocked = folio_next_index(folio) - 1; + trace_netfs_folio(folio, netfs_folio_trace_end_copy); folio_end_fscache(folio); have_unlocked = true; } @@ -201,7 +199,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) } ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, - rreq->i_size, true); + subreq->len, rreq->i_size, true); if (ret < 0) { trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); @@ -260,6 +258,30 @@ static void netfs_rreq_short_read(struct netfs_io_request *rreq, } /* + * Reset the subrequest iterator prior to resubmission. + */ +static void netfs_reset_subreq_iter(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + size_t remaining = subreq->len - subreq->transferred; + size_t count = iov_iter_count(&subreq->io_iter); + + if (count == remaining) + return; + + _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", + rreq->debug_id, subreq->debug_index, + iov_iter_count(&subreq->io_iter), subreq->transferred, + subreq->len, rreq->i_size, + subreq->io_iter.iter_type); + + if (count < remaining) + iov_iter_revert(&subreq->io_iter, remaining - count); + else + iov_iter_advance(&subreq->io_iter, count - remaining); +} + +/* * Resubmit any short or failed operations. Returns true if we got the rreq * ref back. 
*/ @@ -287,6 +309,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); atomic_inc(&rreq->nr_outstanding); + netfs_reset_subreq_iter(rreq, subreq); netfs_read_from_server(rreq, subreq); } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { netfs_rreq_short_read(rreq, subreq); @@ -321,6 +344,43 @@ static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq) } /* + * Determine how much we can admit to having read from a DIO read. + */ +static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + unsigned int i; + size_t transferred = 0; + + for (i = 0; i < rreq->direct_bv_count; i++) + flush_dcache_page(rreq->direct_bv[i].bv_page); + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + if (subreq->error || subreq->transferred == 0) + break; + transferred += subreq->transferred; + if (subreq->transferred < subreq->len) + break; + } + + for (i = 0; i < rreq->direct_bv_count; i++) + flush_dcache_page(rreq->direct_bv[i].bv_page); + + rreq->transferred = transferred; + task_io_account_read(transferred); + + if (rreq->iocb) { + rreq->iocb->ki_pos += transferred; + if (rreq->iocb->ki_complete) + rreq->iocb->ki_complete( + rreq->iocb, rreq->error ? rreq->error : transferred); + } + if (rreq->netfs_ops->done) + rreq->netfs_ops->done(rreq); + inode_dio_end(rreq->inode); +} + +/* * Assess the state of a read request and decide what to do next. 
* * Note that we could be in an ordinary kernel thread, on a workqueue or in @@ -340,8 +400,12 @@ again: return; } - netfs_rreq_unlock_folios(rreq); + if (rreq->origin != NETFS_DIO_READ) + netfs_rreq_unlock_folios(rreq); + else + netfs_rreq_assess_dio(rreq); + trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); @@ -399,9 +463,9 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, struct netfs_io_request *rreq = subreq->rreq; int u; - _enter("[%u]{%llx,%lx},%zd", - subreq->debug_index, subreq->start, subreq->flags, - transferred_or_error); + _enter("R=%x[%x]{%llx,%lx},%zd", + rreq->debug_id, subreq->debug_index, + subreq->start, subreq->flags, transferred_or_error); switch (subreq->source) { case NETFS_READ_FROM_CACHE: @@ -501,15 +565,20 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest */ static enum netfs_io_source netfs_rreq_prepare_read(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) + struct netfs_io_subrequest *subreq, + struct iov_iter *io_iter) { - enum netfs_io_source source; + enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; + struct netfs_inode *ictx = netfs_inode(rreq->inode); + size_t lsize; _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); - source = netfs_cache_prepare_read(subreq, rreq->i_size); - if (source == NETFS_INVALID_READ) - goto out; + if (rreq->origin != NETFS_DIO_READ) { + source = netfs_cache_prepare_read(subreq, rreq->i_size); + if (source == NETFS_INVALID_READ) + goto out; + } if (source == NETFS_DOWNLOAD_FROM_SERVER) { /* Call out to the netfs to let it shrink the request to fit @@ -518,19 +587,52 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, * to make serial calls, it can indicate a short read and then * we will call it again. 
*/ + if (rreq->origin != NETFS_DIO_READ) { + if (subreq->start >= ictx->zero_point) { + source = NETFS_FILL_WITH_ZEROES; + goto set; + } + if (subreq->len > ictx->zero_point - subreq->start) + subreq->len = ictx->zero_point - subreq->start; + } if (subreq->len > rreq->i_size - subreq->start) subreq->len = rreq->i_size - subreq->start; + if (rreq->rsize && subreq->len > rreq->rsize) + subreq->len = rreq->rsize; if (rreq->netfs_ops->clamp_length && !rreq->netfs_ops->clamp_length(subreq)) { source = NETFS_INVALID_READ; goto out; } + + if (subreq->max_nr_segs) { + lsize = netfs_limit_iter(io_iter, 0, subreq->len, + subreq->max_nr_segs); + if (subreq->len > lsize) { + subreq->len = lsize; + trace_netfs_sreq(subreq, netfs_sreq_trace_limited); + } + } } - if (WARN_ON(subreq->len == 0)) +set: + if (subreq->len > rreq->len) + pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n", + rreq->debug_id, subreq->debug_index, + subreq->len, rreq->len); + + if (WARN_ON(subreq->len == 0)) { source = NETFS_INVALID_READ; + goto out; + } + subreq->source = source; + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + + subreq->io_iter = *io_iter; + iov_iter_truncate(&subreq->io_iter, subreq->len); + iov_iter_advance(io_iter, subreq->len); out: subreq->source = source; trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); @@ -541,6 +643,7 @@ out: * Slice off a piece of a read request and submit an I/O request for it. 
*/ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, + struct iov_iter *io_iter, unsigned int *_debug_index) { struct netfs_io_subrequest *subreq; @@ -552,7 +655,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, subreq->debug_index = (*_debug_index)++; subreq->start = rreq->start + rreq->submitted; - subreq->len = rreq->len - rreq->submitted; + subreq->len = io_iter->count; _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted); list_add_tail(&subreq->rreq_link, &rreq->subrequests); @@ -565,7 +668,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, * (the starts must coincide), in which case, we go around the loop * again and ask it to download the next piece. */ - source = netfs_rreq_prepare_read(rreq, subreq); + source = netfs_rreq_prepare_read(rreq, subreq, io_iter); if (source == NETFS_INVALID_READ) goto subreq_failed; @@ -603,6 +706,7 @@ subreq_failed: */ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) { + struct iov_iter io_iter; unsigned int debug_index = 0; int ret; @@ -611,50 +715,73 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) if (rreq->len == 0) { pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); - netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len); return -EIO; } - INIT_WORK(&rreq->work, netfs_rreq_work); + if (rreq->origin == NETFS_DIO_READ) + inode_dio_begin(rreq->inode); - if (sync) - netfs_get_request(rreq, netfs_rreq_trace_get_hold); + // TODO: Use bounce buffer if requested + rreq->io_iter = rreq->iter; + + INIT_WORK(&rreq->work, netfs_rreq_work); /* Chop the read into slices according to what the cache and the netfs * want and submit each one. 
*/ + netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding); atomic_set(&rreq->nr_outstanding, 1); + io_iter = rreq->io_iter; do { - if (!netfs_rreq_submit_slice(rreq, &debug_index)) + _debug("submit %llx + %zx >= %llx", + rreq->start, rreq->submitted, rreq->i_size); + if (rreq->origin == NETFS_DIO_READ && + rreq->start + rreq->submitted >= rreq->i_size) + break; + if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index)) + break; + if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && + test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) break; } while (rreq->submitted < rreq->len); + if (!rreq->submitted) { + netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); + if (rreq->origin == NETFS_DIO_READ) + inode_dio_end(rreq->inode); + ret = 0; + goto out; + } + if (sync) { - /* Keep nr_outstanding incremented so that the ref always belongs to - * us, and the service code isn't punted off to a random thread pool to - * process. + /* Keep nr_outstanding incremented so that the ref always + * belongs to us, and the service code isn't punted off to a + * random thread pool to process. Note that this might start + * further work, such as writing to the cache. 
*/ - for (;;) { - wait_var_event(&rreq->nr_outstanding, - atomic_read(&rreq->nr_outstanding) == 1); + wait_var_event(&rreq->nr_outstanding, + atomic_read(&rreq->nr_outstanding) == 1); + if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_assess(rreq, false); - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) - break; - cond_resched(); - } + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); + wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len) { + if (ret == 0 && rreq->submitted < rreq->len && + rreq->origin != NETFS_DIO_READ) { trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); ret = -EIO; } - netfs_put_request(rreq, false, netfs_rreq_trace_put_hold); } else { /* If we decrement nr_outstanding to 0, the ref belongs to us. */ if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_assess(rreq, false); - ret = 0; + ret = -EIOCBQUEUED; } + +out: return ret; } diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c index 2ff07ba655a0..b781bbbf1d8d 100644 --- a/fs/netfs/iterator.c +++ b/fs/netfs/iterator.c @@ -101,3 +101,100 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, return npages; } EXPORT_SYMBOL_GPL(netfs_extract_user_iter); + +/* + * Select the span of a bvec iterator we're going to use. Limit it by both maximum + * size and maximum number of segments. Returns the size of the span in bytes. 
+ */ +static size_t netfs_limit_bvec(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + const struct bio_vec *bvecs = iter->bvec; + unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0; + size_t len, span = 0, n = iter->count; + size_t skip = iter->iov_offset + start_offset; + + if (WARN_ON(!iov_iter_is_bvec(iter)) || + WARN_ON(start_offset > n) || + n == 0) + return 0; + + while (n && ix < nbv && skip) { + len = bvecs[ix].bv_len; + if (skip < len) + break; + skip -= len; + n -= len; + ix++; + } + + while (n && ix < nbv) { + len = min3(n, bvecs[ix].bv_len - skip, max_size); + span += len; + nsegs++; + ix++; + if (span >= max_size || nsegs >= max_segs) + break; + skip = 0; + n -= len; + } + + return min(span, max_size); +} + +/* + * Select the span of an xarray iterator we're going to use. Limit it by both + * maximum size and maximum number of segments. It is assumed that segments + * can be larger than a page in size, provided they're physically contiguous. + * Returns the size of the span in bytes. 
+ */ +static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + struct folio *folio; + unsigned int nsegs = 0; + loff_t pos = iter->xarray_start + iter->iov_offset; + pgoff_t index = pos / PAGE_SIZE; + size_t span = 0, n = iter->count; + + XA_STATE(xas, iter->xarray, index); + + if (WARN_ON(!iov_iter_is_xarray(iter)) || + WARN_ON(start_offset > n) || + n == 0) + return 0; + max_size = min(max_size, n - start_offset); + + rcu_read_lock(); + xas_for_each(&xas, folio, ULONG_MAX) { + size_t offset, flen, len; + if (xas_retry(&xas, folio)) + continue; + if (WARN_ON(xa_is_value(folio))) + break; + if (WARN_ON(folio_test_hugetlb(folio))) + break; + + flen = folio_size(folio); + offset = offset_in_folio(folio, pos); + len = min(max_size, flen - offset); + span += len; + nsegs++; + if (span >= max_size || nsegs >= max_segs) + break; + } + + rcu_read_unlock(); + return min(span, max_size); +} + +size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + if (iov_iter_is_bvec(iter)) + return netfs_limit_bvec(iter, start_offset, max_size, max_segs); + if (iov_iter_is_xarray(iter)) + return netfs_limit_xarray(iter, start_offset, max_size, max_segs); + BUG(); +} +EXPORT_SYMBOL(netfs_limit_iter); diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c new file mode 100644 index 000000000000..75dc52a49b3a --- /dev/null +++ b/fs/netfs/locking.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * I/O and data path helper functionality. + * + * Borrowed from NFS Copyright (c) 2016 Trond Myklebust + */ + +#include <linux/kernel.h> +#include <linux/netfs.h> +#include "internal.h" + +/* + * inode_dio_wait_interruptible - wait for outstanding DIO requests to finish + * @inode: inode to wait for + * + * Waits for all pending direct I/O requests to finish so that we can + * proceed with a truncate or equivalent operation. 
+ * + * Must be called under a lock that serializes taking new references + * to i_dio_count, usually by inode->i_mutex. + */ +static int inode_dio_wait_interruptible(struct inode *inode) +{ + if (!atomic_read(&inode->i_dio_count)) + return 0; + + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); + DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); + + for (;;) { + prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE); + if (!atomic_read(&inode->i_dio_count)) + break; + if (signal_pending(current)) + break; + schedule(); + } + finish_wait(wq, &q.wq_entry); + + return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0; +} + +/* Call with exclusively locked inode->i_rwsem */ +static int netfs_block_o_direct(struct netfs_inode *ictx) +{ + if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) + return 0; + clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags); + return inode_dio_wait_interruptible(&ictx->inode); +} + +/** + * netfs_start_io_read - declare the file is being used for buffered reads + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * NETFS_ICTX_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +int netfs_start_io_read(struct inode *inode) + __acquires(inode->i_rwsem) +{ + struct netfs_inode *ictx = netfs_inode(inode); + + /* Be an optimist! 
*/ + if (down_read_interruptible(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) == 0) + return 0; + up_read(&inode->i_rwsem); + + /* Slow path: the O_DIRECT flag is set, so retake the lock exclusively, clear the flag and wait for direct I/O, then downgrade to a shared lock. */ + if (down_write_killable(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (netfs_block_o_direct(ictx) < 0) { + up_write(&inode->i_rwsem); + return -ERESTARTSYS; + } + downgrade_write(&inode->i_rwsem); + return 0; +} +EXPORT_SYMBOL(netfs_start_io_read); + +/** + * netfs_end_io_read - declare that the buffered read operation is done + * @inode: file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void netfs_end_io_read(struct inode *inode) + __releases(inode->i_rwsem) +{ + up_read(&inode->i_rwsem); +} +EXPORT_SYMBOL(netfs_end_io_read); + +/** + * netfs_start_io_write - declare the file is being used for buffered writes + * @inode: file inode + * + * Declare that a buffered write operation is about to start, and ensure + * that we block all direct I/O. Returns 0 with inode->i_rwsem held exclusively, or -ERESTARTSYS with the lock not held. + */ +int netfs_start_io_write(struct inode *inode) + __acquires(inode->i_rwsem) +{ + struct netfs_inode *ictx = netfs_inode(inode); + + if (down_write_killable(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (netfs_block_o_direct(ictx) < 0) { + up_write(&inode->i_rwsem); + return -ERESTARTSYS; + } + return 0; +} +EXPORT_SYMBOL(netfs_start_io_write); + +/** + * netfs_end_io_write - declare that the buffered write operation is done + * @inode: file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. 
+ */ +void netfs_end_io_write(struct inode *inode) + __releases(inode->i_rwsem) +{ + up_write(&inode->i_rwsem); +} +EXPORT_SYMBOL(netfs_end_io_write); + +/* Call with exclusively locked inode->i_rwsem */ +static int netfs_block_buffered(struct inode *inode) +{ + struct netfs_inode *ictx = netfs_inode(inode); + int ret; + + if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) { + set_bit(NETFS_ICTX_ODIRECT, &ictx->flags); + if (inode->i_mapping->nrpages != 0) { + unmap_mapping_range(inode->i_mapping, 0, 0, 0); + ret = filemap_fdatawait(inode->i_mapping); + if (ret < 0) { + clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags); + return ret; + } + } + } + return 0; +} + +/** + * netfs_start_io_direct - declare the file is being used for direct i/o + * @inode: file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * NETFS_ICTX_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +int netfs_start_io_direct(struct inode *inode) + __acquires(inode->i_rwsem) +{ + struct netfs_inode *ictx = netfs_inode(inode); + int ret; + + /* Be an optimist! */ + if (down_read_interruptible(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) != 0) + return 0; + up_read(&inode->i_rwsem); + + /* Slow path.... 
*/ + if (down_write_killable(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + ret = netfs_block_buffered(inode); + if (ret < 0) { + up_write(&inode->i_rwsem); + return ret; + } + downgrade_write(&inode->i_rwsem); + return 0; +} +EXPORT_SYMBOL(netfs_start_io_direct); + +/** + * netfs_end_io_direct - declare that the direct i/o operation is done + * @inode: file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void netfs_end_io_direct(struct inode *inode) + __releases(inode->i_rwsem) +{ + up_read(&inode->i_rwsem); +} +EXPORT_SYMBOL(netfs_end_io_direct); diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 068568702957..5e77618a7940 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -7,6 +7,8 @@ #include <linux/module.h> #include <linux/export.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/netfs.h> @@ -15,6 +17,113 @@ MODULE_DESCRIPTION("Network fs support"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); +EXPORT_TRACEPOINT_SYMBOL(netfs_sreq); + unsigned netfs_debug; module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); + +#ifdef CONFIG_PROC_FS +LIST_HEAD(netfs_io_requests); +DEFINE_SPINLOCK(netfs_proc_lock); + +static const char *netfs_origins[nr__netfs_io_origin] = { + [NETFS_READAHEAD] = "RA", + [NETFS_READPAGE] = "RP", + [NETFS_READ_FOR_WRITE] = "RW", + [NETFS_WRITEBACK] = "WB", + [NETFS_WRITETHROUGH] = "WT", + [NETFS_LAUNDER_WRITE] = "LW", + [NETFS_UNBUFFERED_WRITE] = "UW", + [NETFS_DIO_READ] = "DR", + [NETFS_DIO_WRITE] = "DW", +}; + +/* + * Generate a list of I/O requests in /proc/fs/netfs/requests + */ +static int netfs_requests_seq_show(struct seq_file *m, void *v) +{ + struct netfs_io_request *rreq; + + if (v == &netfs_io_requests) { + seq_puts(m, + "REQUEST OR REF FL ERR OPS COVERAGE\n" + "======== == === == ==== === 
=========\n" + ); + return 0; + } + + rreq = list_entry(v, struct netfs_io_request, proc_link); + seq_printf(m, + "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx", + rreq->debug_id, + netfs_origins[rreq->origin], + refcount_read(&rreq->ref), + rreq->flags, + rreq->error, + atomic_read(&rreq->nr_outstanding), + rreq->start, rreq->submitted, rreq->len); + seq_putc(m, '\n'); + return 0; +} + +static void *netfs_requests_seq_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) +{ + rcu_read_lock(); + return seq_list_start_head(&netfs_io_requests, *_pos); +} + +static void *netfs_requests_seq_next(struct seq_file *m, void *v, loff_t *_pos) +{ + return seq_list_next(v, &netfs_io_requests, _pos); +} + +static void netfs_requests_seq_stop(struct seq_file *m, void *v) + __releases(rcu) +{ + rcu_read_unlock(); +} + +static const struct seq_operations netfs_requests_seq_ops = { + .start = netfs_requests_seq_start, + .next = netfs_requests_seq_next, + .stop = netfs_requests_seq_stop, + .show = netfs_requests_seq_show, +}; +#endif /* CONFIG_PROC_FS */ + +static int __init netfs_init(void) +{ + int ret = -ENOMEM; + + if (!proc_mkdir("fs/netfs", NULL)) + goto error; + if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL, + &netfs_requests_seq_ops)) + goto error_proc; +#ifdef CONFIG_FSCACHE_STATS + if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL, + netfs_stats_show)) + goto error_proc; +#endif + + ret = fscache_init(); + if (ret < 0) + goto error_proc; + return 0; + +error_proc: + remove_proc_entry("fs/netfs", NULL); +error: + return ret; +} +fs_initcall(netfs_init); + +static void __exit netfs_exit(void) +{ + fscache_exit(); + remove_proc_entry("fs/netfs", NULL); +} +module_exit(netfs_exit); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c new file mode 100644 index 000000000000..90051ced8e2a --- /dev/null +++ b/fs/netfs/misc.c @@ -0,0 +1,260 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Miscellaneous routines. + * + * Copyright (C) 2023 Red Hat, Inc. 
All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/swap.h> +#include "internal.h" + +/* + * Attach a folio to the buffer and maybe set marks on it to say that we need + * to put the folio later and twiddle the pagecache flags. + */ +int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, + struct folio *folio, unsigned int flags, + gfp_t gfp_mask) +{ + XA_STATE_ORDER(xas, xa, index, folio_order(folio)); + +retry: + xas_lock(&xas); + for (;;) { + xas_store(&xas, folio); + if (!xas_error(&xas)) + break; + xas_unlock(&xas); + if (!xas_nomem(&xas, gfp_mask)) + return xas_error(&xas); + goto retry; + } + + if (flags & NETFS_FLAG_PUT_MARK) + xas_set_mark(&xas, NETFS_BUF_PUT_MARK); + if (flags & NETFS_FLAG_PAGECACHE_MARK) + xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK); + xas_unlock(&xas); + return xas_error(&xas); +} + +/* + * Create the specified range of folios in the buffer attached to the read + * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that + * these need freeing later. + */ +int netfs_add_folios_to_buffer(struct xarray *buffer, + struct address_space *mapping, + pgoff_t index, pgoff_t to, gfp_t gfp_mask) +{ + struct folio *folio; + int ret; + + if (to + 1 == index) /* Page range is inclusive */ + return 0; + + do { + /* TODO: Figure out what order folio can be allocated here */ + folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0); + if (!folio) + return -ENOMEM; + folio->index = index; + ret = netfs_xa_store_and_mark(buffer, index, folio, + NETFS_FLAG_PUT_MARK, gfp_mask); + if (ret < 0) { + folio_put(folio); + return ret; + } + + index += folio_nr_pages(folio); + } while (index <= to && index != 0); + + return 0; +} + +/* + * Clear an xarray buffer, putting a ref on the folios that have + * NETFS_BUF_PUT_MARK set. 
+ */ +void netfs_clear_buffer(struct xarray *buffer) +{ + struct folio *folio; + XA_STATE(xas, buffer, 0); + + rcu_read_lock(); + xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) { + folio_put(folio); + } + rcu_read_unlock(); + xa_destroy(buffer); +} + +/** + * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback + * @mapping: The mapping the folio belongs to. + * @folio: The folio being dirtied. + * + * Set the dirty flag on a folio and pin an in-use cache object in memory so + * that writeback can later write to it. This is intended to be called from + * the filesystem's ->dirty_folio() method. + * + * Return: true if the dirty flag was set on the folio, false otherwise. + */ +bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + struct inode *inode = mapping->host; + struct netfs_inode *ictx = netfs_inode(inode); + struct fscache_cookie *cookie = netfs_i_cookie(ictx); + bool need_use = false; + + _enter(""); + + if (!filemap_dirty_folio(mapping, folio)) + return false; + if (!fscache_cookie_valid(cookie)) + return true; + + if (!(inode->i_state & I_PINNING_NETFS_WB)) { + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_PINNING_NETFS_WB)) { + inode->i_state |= I_PINNING_NETFS_WB; + need_use = true; + } + spin_unlock(&inode->i_lock); + + if (need_use) + fscache_use_cookie(cookie, true); + } + return true; +} +EXPORT_SYMBOL(netfs_dirty_folio); + +/** + * netfs_unpin_writeback - Unpin writeback resources + * @inode: The inode on which the cookie resides + * @wbc: The writeback control + * + * Unpin the writeback resources pinned by netfs_dirty_folio(). This is + * intended to be called as/by the netfs's ->write_inode() method. 
+ */ +int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc) +{ + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); + + if (wbc->unpinned_netfs_wb) + fscache_unuse_cookie(cookie, NULL, NULL); + return 0; +} +EXPORT_SYMBOL(netfs_unpin_writeback); + +/** + * netfs_clear_inode_writeback - Clear writeback resources pinned by an inode + * @inode: The inode to clean up + * @aux: Auxiliary data to apply to the inode + * + * Clear any writeback resources held by an inode when the inode is evicted. + * This must be called before clear_inode() is called. + */ +void netfs_clear_inode_writeback(struct inode *inode, const void *aux) +{ + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); + + if (inode->i_state & I_PINNING_NETFS_WB) { + loff_t i_size = i_size_read(inode); + fscache_unuse_cookie(cookie, aux, &i_size); + } +} +EXPORT_SYMBOL(netfs_clear_inode_writeback); + +/** + * netfs_invalidate_folio - Invalidate or partially invalidate a folio + * @folio: Folio proposed for release + * @offset: Offset of the invalidated region + * @length: Length of the invalidated region + * + * Invalidate part or all of a folio for a network filesystem. The folio will + * be removed afterwards if the invalidated region covers the entire folio. + */ +void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) +{ + struct netfs_folio *finfo = NULL; + size_t flen = folio_size(folio); + + _enter("{%lx},%zx,%zx", folio->index, offset, length); + + folio_wait_fscache(folio); + + if (!folio_test_private(folio)) + return; + + finfo = netfs_folio_info(folio); + + if (offset == 0 && length >= flen) + goto erase_completely; + + if (finfo) { + /* We have a partially uptodate page from a streaming write. 
*/ + unsigned int fstart = finfo->dirty_offset; + unsigned int fend = fstart + finfo->dirty_len; + unsigned int end = offset + length; + + if (offset >= fend) + return; + if (end <= fstart) + return; + if (offset <= fstart && end >= fend) + goto erase_completely; + if (offset <= fstart && end > fstart) + goto move_start; + if (offset > fstart && end >= fend) + goto reduce_len; + /* A partial write was split. The caller has already zeroed + * it, so just absorb the hole. + */ + } + return; + +erase_completely: + netfs_put_group(netfs_folio_group(folio)); + folio_detach_private(folio); + folio_clear_uptodate(folio); + kfree(finfo); + return; +reduce_len: /* the tail of the dirty data was invalidated: what survives is [dirty_offset, offset) */ + finfo->dirty_len = offset - finfo->dirty_offset; + return; +move_start: /* the head of the dirty data was invalidated: what survives is [offset + length, old end) */ + finfo->dirty_len -= offset + length - finfo->dirty_offset; + finfo->dirty_offset = offset + length; +} +EXPORT_SYMBOL(netfs_invalidate_folio); + +/** + * netfs_release_folio - Try to release a folio + * @folio: Folio proposed for release + * @gfp: Flags qualifying the release + * + * Request release of a folio and clean up its private state if it's not busy. + * Returns true if the folio can now be released, false if not + */ +bool netfs_release_folio(struct folio *folio, gfp_t gfp) +{ + struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); + unsigned long long end; + + end = folio_pos(folio) + folio_size(folio); + if (end > ctx->zero_point) /* raise zero_point to at least the end of this folio */ + ctx->zero_point = end; + + if (folio_test_private(folio)) + return false; + if (folio_test_fscache(folio)) { + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + folio_wait_fscache(folio); + } + + fscache_note_page_release(netfs_i_cookie(ctx)); + return true; +} +EXPORT_SYMBOL(netfs_release_folio); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index e17cdf53f6a7..610ceb5bd86c 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -20,14 +20,20 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct inode *inode = file ? 
file_inode(file) : mapping->host; struct netfs_inode *ctx = netfs_inode(inode); struct netfs_io_request *rreq; + bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE || + origin == NETFS_DIO_READ || + origin == NETFS_DIO_WRITE); + bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx); int ret; - rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL); + rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request), + GFP_KERNEL); if (!rreq) return ERR_PTR(-ENOMEM); rreq->start = start; rreq->len = len; + rreq->upper_len = len; rreq->origin = origin; rreq->netfs_ops = ctx->ops; rreq->mapping = mapping; @@ -35,8 +41,14 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, rreq->i_size = i_size_read(inode); rreq->debug_id = atomic_inc_return(&debug_ids); INIT_LIST_HEAD(&rreq->subrequests); + INIT_WORK(&rreq->work, NULL); refcount_set(&rreq->ref, 1); + __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + if (cached) + __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); + if (file && file->f_flags & O_NONBLOCK) + __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); if (rreq->netfs_ops->init_request) { ret = rreq->netfs_ops->init_request(rreq, file); if (ret < 0) { @@ -45,6 +57,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } } + trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); + netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); return rreq; } @@ -74,33 +88,47 @@ static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); + netfs_proc_del_rreq(rreq); netfs_clear_subrequests(rreq, false); if (rreq->netfs_ops->free_request) rreq->netfs_ops->free_request(rreq); if (rreq->cache_resources.ops) rreq->cache_resources.ops->end_operation(&rreq->cache_resources); - kfree(rreq); + if (rreq->direct_bv) { + for (i = 0; i < 
rreq->direct_bv_count; i++) { + if (rreq->direct_bv[i].bv_page) { + if (rreq->direct_bv_unpin) + unpin_user_page(rreq->direct_bv[i].bv_page); + } + } + kvfree(rreq->direct_bv); + } + kfree_rcu(rreq, rcu); netfs_stat_d(&netfs_n_rh_rreq); } void netfs_put_request(struct netfs_io_request *rreq, bool was_async, enum netfs_rreq_ref_trace what) { - unsigned int debug_id = rreq->debug_id; + unsigned int debug_id; bool dead; int r; - dead = __refcount_dec_and_test(&rreq->ref, &r); - trace_netfs_rreq_ref(debug_id, r - 1, what); - if (dead) { - if (was_async) { - rreq->work.func = netfs_free_request; - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); - } else { - netfs_free_request(&rreq->work); + if (rreq) { + debug_id = rreq->debug_id; + dead = __refcount_dec_and_test(&rreq->ref, &r); + trace_netfs_rreq_ref(debug_id, r - 1, what); + if (dead) { + if (was_async) { + rreq->work.func = netfs_free_request; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); + } else { + netfs_free_request(&rreq->work); + } } } } @@ -112,8 +140,11 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq { struct netfs_io_subrequest *subreq; - subreq = kzalloc(sizeof(struct netfs_io_subrequest), GFP_KERNEL); + subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?: + sizeof(struct netfs_io_subrequest), + GFP_KERNEL); if (subreq) { + INIT_WORK(&subreq->work, NULL); INIT_LIST_HEAD(&subreq->rreq_link); refcount_set(&subreq->ref, 2); subreq->rreq = rreq; @@ -140,6 +171,8 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, struct netfs_io_request *rreq = subreq->rreq; trace_netfs_sreq(subreq, netfs_sreq_trace_free); + if (rreq->netfs_ops->free_subrequest) + rreq->netfs_ops->free_subrequest(subreq); kfree(subreq); netfs_stat_d(&netfs_n_rh_sreq); netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq); diff --git a/fs/netfs/output.c b/fs/netfs/output.c new file mode 100644 index 000000000000..625eb68f3e5a --- /dev/null +++ 
b/fs/netfs/output.c @@ -0,0 +1,478 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem high-level write support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include "internal.h" + +/** + * netfs_create_write_request - Create a write operation. + * @wreq: The write request this is storing from. + * @dest: The destination type + * @start: Start of the region this write will modify + * @len: Length of the modification + * @worker: The worker function to handle the write(s) + * + * Allocate a write operation, set it up and add it to the list on a write + * request. + */ +struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq, + enum netfs_io_source dest, + loff_t start, size_t len, + work_func_t worker) +{ + struct netfs_io_subrequest *subreq; + + subreq = netfs_alloc_subrequest(wreq); + if (subreq) { + INIT_WORK(&subreq->work, worker); + subreq->source = dest; + subreq->start = start; + subreq->len = len; + subreq->debug_index = wreq->subreq_counter++; + + switch (subreq->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write); + break; + default: + BUG(); + } + + subreq->io_iter = wreq->io_iter; + iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start); + iov_iter_truncate(&subreq->io_iter, subreq->len); + + trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, + refcount_read(&subreq->ref), + netfs_sreq_trace_new); + atomic_inc(&wreq->nr_outstanding); + list_add_tail(&subreq->rreq_link, &wreq->subrequests); + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + } + + return subreq; +} +EXPORT_SYMBOL(netfs_create_write_request); + +/* + * Process a completed write request once all the 
component operations have + * been completed. + */ +static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async) +{ + struct netfs_io_subrequest *subreq; + struct netfs_inode *ctx = netfs_inode(wreq->inode); + size_t transferred = 0; + + _enter("R=%x[]", wreq->debug_id); + + trace_netfs_rreq(wreq, netfs_rreq_trace_write_done); + + list_for_each_entry(subreq, &wreq->subrequests, rreq_link) { + if (subreq->error || subreq->transferred == 0) + break; + transferred += subreq->transferred; + if (subreq->transferred < subreq->len) + break; + } + wreq->transferred = transferred; + + list_for_each_entry(subreq, &wreq->subrequests, rreq_link) { + if (!subreq->error) + continue; + switch (subreq->source) { + case NETFS_UPLOAD_TO_SERVER: + /* Depending on the type of failure, this may prevent + * writeback completion unless we're in disconnected + * mode. + */ + if (!wreq->error) + wreq->error = subreq->error; + break; + + case NETFS_WRITE_TO_CACHE: + /* Failure doesn't prevent writeback completion unless + * we're in disconnected mode. + */ + if (subreq->error != -ENOBUFS) + ctx->ops->invalidate_cache(wreq); + break; + + default: + WARN_ON_ONCE(1); + if (!wreq->error) + wreq->error = -EIO; + return; + } + } + + wreq->cleanup(wreq); + + if (wreq->origin == NETFS_DIO_WRITE && + wreq->mapping->nrpages) { + pgoff_t first = wreq->start >> PAGE_SHIFT; + pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT; + invalidate_inode_pages2_range(wreq->mapping, first, last); + } + + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_end(wreq->inode); + + _debug("finished"); + trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); + clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); + wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); + + if (wreq->iocb) { + wreq->iocb->ki_pos += transferred; + if (wreq->iocb->ki_complete) + wreq->iocb->ki_complete( + wreq->iocb, wreq->error ? 
wreq->error : transferred); + } + + netfs_clear_subrequests(wreq, was_async); + netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete); +} + +/* + * Deal with the completion of a write subrequest, whether it was an upload to the server or a write to the cache. A negative transferred_or_error is recorded as the subrequest's error; otherwise it is added to the amount transferred. + */ +void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_io_subrequest *subreq = _op; + struct netfs_io_request *wreq = subreq->rreq; + unsigned int u; + + _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); + + switch (subreq->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload_done); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write_done); + break; + case NETFS_INVALID_WRITE: + break; + default: + BUG(); + } + + if (IS_ERR_VALUE(transferred_or_error)) { + subreq->error = transferred_or_error; + trace_netfs_failure(wreq, subreq, transferred_or_error, + netfs_fail_write); + goto failed; + } + + /* Clamp a completion that claims more than was still outstanding on this subrequest. */ + if (WARN(transferred_or_error > subreq->len - subreq->transferred, + "Subreq excess write: R%x[%x] %zd > %zu - %zu", + wreq->debug_id, subreq->debug_index, + transferred_or_error, subreq->len, subreq->transferred)) + transferred_or_error = subreq->len - subreq->transferred; + + subreq->error = 0; + subreq->transferred += transferred_or_error; + + if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) + pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n", + wreq->debug_id, subreq->debug_index, + iov_iter_count(&subreq->io_iter), subreq->len, + subreq->transferred, subreq->io_iter.iter_type); + + if (subreq->transferred < subreq->len) + goto incomplete; + + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); +out: + trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); + + /* If we decrement nr_outstanding to 0, the ref belongs to us. 
*/ + u = atomic_dec_return(&wreq->nr_outstanding); + if (u == 0) + netfs_write_terminated(wreq, was_async); + else if (u == 1) + wake_up_var(&wreq->nr_outstanding); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); + return; + +incomplete: + if (transferred_or_error == 0) { + if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + subreq->error = -ENODATA; + goto failed; + } + } else { + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + } + + __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags); + goto out; + +failed: + switch (subreq->source) { + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write_failed); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags); + break; + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload_failed); + set_bit(NETFS_RREQ_FAILED, &wreq->flags); + wreq->error = subreq->error; + break; + default: + break; + } + goto out; +} +EXPORT_SYMBOL(netfs_write_subrequest_terminated); + +static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *wreq = subreq->rreq; + struct netfs_cache_resources *cres = &wreq->cache_resources; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + + cres->ops->write(cres, subreq->start, &subreq->io_iter, + netfs_write_subrequest_terminated, subreq); +} + +static void netfs_write_to_cache_op_worker(struct work_struct *work) +{ + struct netfs_io_subrequest *subreq = + container_of(work, struct netfs_io_subrequest, work); + + netfs_write_to_cache_op(subreq); +} + +/** + * netfs_queue_write_request - Queue a write request for attention + * @subreq: The write request to be queued + * + * Queue the specified write request for processing by a worker thread. We + * pass the caller's ref on the request to the worker thread. 
+ */ +void netfs_queue_write_request(struct netfs_io_subrequest *subreq) +{ + if (!queue_work(system_unbound_wq, &subreq->work)) + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip); +} +EXPORT_SYMBOL(netfs_queue_write_request); + +/* + * Set up a op for writing to the cache. + */ +static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq) +{ + struct netfs_cache_resources *cres = &wreq->cache_resources; + struct netfs_io_subrequest *subreq; + struct netfs_inode *ctx = netfs_inode(wreq->inode); + struct fscache_cookie *cookie = netfs_i_cookie(ctx); + loff_t start = wreq->start; + size_t len = wreq->len; + int ret; + + if (!fscache_cookie_enabled(cookie)) { + clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags); + return; + } + + _debug("write to cache"); + ret = fscache_begin_write_operation(cres, cookie); + if (ret < 0) + return; + + ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len, + i_size_read(wreq->inode), true); + if (ret < 0) + return; + + subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len, + netfs_write_to_cache_op_worker); + if (!subreq) + return; + + netfs_write_to_cache_op(subreq); +} + +/* + * Begin the process of writing out a chunk of data. + * + * We are given a write request that holds a series of dirty regions and + * (partially) covers a sequence of folios, all of which are present. The + * pages must have been marked as writeback as appropriate. + * + * We need to perform the following steps: + * + * (1) If encrypting, create an output buffer and encrypt each block of the + * data into it, otherwise the output buffer will point to the original + * folios. + * + * (2) If the data is to be cached, set up a write op for the entire output + * buffer to the cache, if the cache wants to accept it. + * + * (3) If the data is to be uploaded (ie. not merely cached): + * + * (a) If the data is to be compressed, create a compression buffer and + * compress the data into it. 
+ * + * (b) For each destination we want to upload to, set up write ops to write + * to that destination. We may need multiple writes if the data is not + * contiguous or the span exceeds wsize for a server. + */ +int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, + enum netfs_write_trace what) +{ + struct netfs_inode *ctx = netfs_inode(wreq->inode); + + _enter("R=%x %llx-%llx f=%lx", + wreq->debug_id, wreq->start, wreq->start + wreq->len - 1, + wreq->flags); + + trace_netfs_write(wreq, what); + if (wreq->len == 0 || wreq->iter.count == 0) { + pr_err("Zero-sized write [R=%x]\n", wreq->debug_id); + return -EIO; + } + + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_begin(wreq->inode); + + wreq->io_iter = wreq->iter; + + /* ->outstanding > 0 carries a ref */ + netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding); + atomic_set(&wreq->nr_outstanding, 1); + + /* Start the encryption/compression going. We can do that in the + * background whilst we generate a list of write ops that we want to + * perform. + */ + // TODO: Encrypt or compress the region as appropriate + + /* We need to write all of the region to the cache */ + if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) + netfs_set_up_write_to_cache(wreq); + + /* However, we don't necessarily write all of the region to the server. + * Caching of reads is being managed this way also. + */ + if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) + ctx->ops->create_write_requests(wreq, wreq->start, wreq->len); + + if (atomic_dec_and_test(&wreq->nr_outstanding)) + netfs_write_terminated(wreq, false); + + if (!may_wait) + return -EIOCBQUEUED; + + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + return wreq->error; +} + +/* + * Begin a write operation for writing through the pagecache. 
+ */ +struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len) +{ + struct netfs_io_request *wreq; + struct file *file = iocb->ki_filp; + + wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len, + NETFS_WRITETHROUGH); + if (IS_ERR(wreq)) + return wreq; + + trace_netfs_write(wreq, netfs_write_trace_writethrough); + + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0); + wreq->io_iter = wreq->iter; + + /* ->outstanding > 0 carries a ref */ + netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding); + atomic_set(&wreq->nr_outstanding, 1); + return wreq; +} + +static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final) +{ + struct netfs_inode *ictx = netfs_inode(wreq->inode); + unsigned long long start; + size_t len; + + if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) + return; + + start = wreq->start + wreq->submitted; + len = wreq->iter.count - wreq->submitted; + if (!final) { + len /= wreq->wsize; /* Round to number of maximum packets */ + len *= wreq->wsize; + } + + ictx->ops->create_write_requests(wreq, start, len); + wreq->submitted += len; +} + +/* + * Advance the state of the write operation used when writing through the + * pagecache. Data has been copied into the pagecache that we need to append + * to the request. If we've added more than wsize then we need to create a new + * subrequest. + */ +int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end) +{ + _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u", + wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end); + + wreq->iter.count += copied; + wreq->io_iter.count += copied; + if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize) + netfs_submit_writethrough(wreq, false); + + return wreq->error; +} + +/* + * End a write operation used when writing through the pagecache. 
+ */ +int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb) +{ + int ret = -EIOCBQUEUED; + + _enter("ic=%zu sb=%zu ws=%u", + wreq->iter.count, wreq->submitted, wreq->wsize); + + if (wreq->submitted < wreq->io_iter.count) + netfs_submit_writethrough(wreq, true); + + if (atomic_dec_and_test(&wreq->nr_outstanding)) + netfs_write_terminated(wreq, false); + + if (is_sync_kiocb(iocb)) { + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + ret = wreq->error; + } + + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + return ret; +} diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index 5510a7a14a40..deeba9f9dcf5 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -9,6 +9,8 @@ #include <linux/seq_file.h> #include "internal.h" +atomic_t netfs_n_rh_dio_read; +atomic_t netfs_n_rh_dio_write; atomic_t netfs_n_rh_readahead; atomic_t netfs_n_rh_readpage; atomic_t netfs_n_rh_rreq; @@ -27,32 +29,48 @@ atomic_t netfs_n_rh_write_begin; atomic_t netfs_n_rh_write_done; atomic_t netfs_n_rh_write_failed; atomic_t netfs_n_rh_write_zskip; +atomic_t netfs_n_wh_wstream_conflict; +atomic_t netfs_n_wh_upload; +atomic_t netfs_n_wh_upload_done; +atomic_t netfs_n_wh_upload_failed; +atomic_t netfs_n_wh_write; +atomic_t netfs_n_wh_write_done; +atomic_t netfs_n_wh_write_failed; -void netfs_stats_show(struct seq_file *m) +int netfs_stats_show(struct seq_file *m, void *v) { - seq_printf(m, "RdHelp : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n", + seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n", + atomic_read(&netfs_n_rh_dio_read), + atomic_read(&netfs_n_rh_dio_write), atomic_read(&netfs_n_rh_readahead), atomic_read(&netfs_n_rh_readpage), atomic_read(&netfs_n_rh_write_begin), - atomic_read(&netfs_n_rh_write_zskip), - atomic_read(&netfs_n_rh_rreq), - atomic_read(&netfs_n_rh_sreq)); - seq_printf(m, "RdHelp : ZR=%u sh=%u sk=%u\n", + atomic_read(&netfs_n_rh_write_zskip)); + seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n", 
atomic_read(&netfs_n_rh_zero), atomic_read(&netfs_n_rh_short_read), atomic_read(&netfs_n_rh_write_zskip)); - seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n", + seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n", atomic_read(&netfs_n_rh_download), atomic_read(&netfs_n_rh_download_done), atomic_read(&netfs_n_rh_download_failed), atomic_read(&netfs_n_rh_download_instead)); - seq_printf(m, "RdHelp : RD=%u rs=%u rf=%u\n", + seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n", atomic_read(&netfs_n_rh_read), atomic_read(&netfs_n_rh_read_done), atomic_read(&netfs_n_rh_read_failed)); - seq_printf(m, "RdHelp : WR=%u ws=%u wf=%u\n", - atomic_read(&netfs_n_rh_write), - atomic_read(&netfs_n_rh_write_done), - atomic_read(&netfs_n_rh_write_failed)); + seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n", + atomic_read(&netfs_n_wh_upload), + atomic_read(&netfs_n_wh_upload_done), + atomic_read(&netfs_n_wh_upload_failed)); + seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n", + atomic_read(&netfs_n_wh_write), + atomic_read(&netfs_n_wh_write_done), + atomic_read(&netfs_n_wh_write_failed)); + seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n", + atomic_read(&netfs_n_rh_rreq), + atomic_read(&netfs_n_rh_sreq), + atomic_read(&netfs_n_wh_wstream_conflict)); + return fscache_stats_show(m); } EXPORT_SYMBOL(netfs_stats_show); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 01ac733a6320..f7e32d76e34d 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -169,8 +169,8 @@ config ROOT_NFS config NFS_FSCACHE bool "Provide NFS client caching support" - depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y - select NETFS_SUPPORT + depends on NFS_FS=m && NETFS_SUPPORT || NFS_FS=y && NETFS_SUPPORT=y + select FSCACHE help Say Y here if you want NFS data to be cached locally on disc through the general filesystem cache manager diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index b4294a8aa2d4..f1eeb4914199 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ 
-108,7 +108,7 @@ struct pnfs_block_dev { struct pnfs_block_dev *children; u64 chunk_size; - struct bdev_handle *bdev_handle; + struct file *bdev_file; u64 disk_offset; u64 pr_key; diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index c97ebc42ec0f..93ef7f864980 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -25,17 +25,17 @@ bl_free_device(struct pnfs_block_dev *dev) } else { if (dev->pr_registered) { const struct pr_ops *ops = - dev->bdev_handle->bdev->bd_disk->fops->pr_ops; + file_bdev(dev->bdev_file)->bd_disk->fops->pr_ops; int error; - error = ops->pr_register(dev->bdev_handle->bdev, + error = ops->pr_register(file_bdev(dev->bdev_file), dev->pr_key, 0, false); if (error) pr_err("failed to unregister PR key.\n"); } - if (dev->bdev_handle) - bdev_release(dev->bdev_handle); + if (dev->bdev_file) + fput(dev->bdev_file); } } @@ -169,7 +169,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset, map->start = dev->start; map->len = dev->len; map->disk_offset = dev->disk_offset; - map->bdev = dev->bdev_handle->bdev; + map->bdev = file_bdev(dev->bdev_file); return true; } @@ -236,26 +236,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; - struct bdev_handle *bdev_handle; + struct file *bdev_file; dev_t dev; dev = bl_resolve_deviceid(server, v, gfp_mask); if (!dev) return -EIO; - bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", - MAJOR(dev), MINOR(dev), PTR_ERR(bdev_handle)); - return PTR_ERR(bdev_handle); + MAJOR(dev), MINOR(dev), PTR_ERR(bdev_file)); + return PTR_ERR(bdev_file); } - d->bdev_handle = bdev_handle; - d->len = bdev_nr_bytes(bdev_handle->bdev); + 
d->bdev_file = bdev_file; + d->len = bdev_nr_bytes(file_bdev(bdev_file)); d->map = bl_map_simple; printk(KERN_INFO "pNFS: using block device %s\n", - bdev_handle->bdev->bd_disk->disk_name); + file_bdev(bdev_file)->bd_disk->disk_name); return 0; } @@ -300,10 +300,10 @@ bl_validate_designator(struct pnfs_block_volume *v) } } -static struct bdev_handle * +static struct file * bl_open_path(struct pnfs_block_volume *v, const char *prefix) { - struct bdev_handle *bdev_handle; + struct file *bdev_file; const char *devname; devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN", @@ -311,15 +311,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix) if (!devname) return ERR_PTR(-ENOMEM); - bdev_handle = bdev_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { pr_warn("pNFS: failed to open device %s (%ld)\n", - devname, PTR_ERR(bdev_handle)); + devname, PTR_ERR(bdev_file)); } kfree(devname); - return bdev_handle; + return bdev_file; } static int @@ -327,7 +327,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; - struct bdev_handle *bdev_handle; + struct file *bdev_file; const struct pr_ops *ops; int error; @@ -340,14 +340,14 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, * On other distributions like Debian, the default SCSI by-id path will * point to the dm-multipath device if one exists. 
*/ - bdev_handle = bl_open_path(v, "dm-uuid-mpath-0x"); - if (IS_ERR(bdev_handle)) - bdev_handle = bl_open_path(v, "wwn-0x"); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); - d->bdev_handle = bdev_handle; - - d->len = bdev_nr_bytes(d->bdev_handle->bdev); + bdev_file = bl_open_path(v, "dm-uuid-mpath-0x"); + if (IS_ERR(bdev_file)) + bdev_file = bl_open_path(v, "wwn-0x"); + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); + d->bdev_file = bdev_file; + + d->len = bdev_nr_bytes(file_bdev(d->bdev_file)); d->map = bl_map_simple; d->pr_key = v->scsi.pr_key; @@ -355,20 +355,20 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return -ENODEV; pr_info("pNFS: using block device %s (reservation key 0x%llx)\n", - d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key); + file_bdev(d->bdev_file)->bd_disk->disk_name, d->pr_key); - ops = d->bdev_handle->bdev->bd_disk->fops->pr_ops; + ops = file_bdev(d->bdev_file)->bd_disk->fops->pr_ops; if (!ops) { pr_err("pNFS: block device %s does not support reservations.", - d->bdev_handle->bdev->bd_disk->disk_name); + file_bdev(d->bdev_file)->bd_disk->disk_name); error = -EINVAL; goto out_blkdev_put; } - error = ops->pr_register(d->bdev_handle->bdev, 0, d->pr_key, true); + error = ops->pr_register(file_bdev(d->bdev_file), 0, d->pr_key, true); if (error) { pr_err("pNFS: failed to register key for block device %s.", - d->bdev_handle->bdev->bd_disk->disk_name); + file_bdev(d->bdev_file)->bd_disk->disk_name); goto out_blkdev_put; } @@ -376,7 +376,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return 0; out_blkdev_put: - bdev_release(d->bdev_handle); + fput(d->bdev_file); return error; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 44eca51b2808..fbdc9ca80f71 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -246,7 +246,7 @@ void nfs_free_client(struct nfs_client *clp) put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); - kfree(clp); + kfree_rcu(clp, 
rcu); } EXPORT_SYMBOL_GPL(nfs_free_client); @@ -1006,6 +1006,14 @@ struct nfs_server *nfs_alloc_server(void) } EXPORT_SYMBOL_GPL(nfs_alloc_server); +static void delayed_free(struct rcu_head *p) +{ + struct nfs_server *server = container_of(p, struct nfs_server, rcu); + + nfs_free_iostats(server->io_stats); + kfree(server); +} + /* * Free up a server record */ @@ -1031,10 +1039,9 @@ void nfs_free_server(struct nfs_server *server) ida_destroy(&server->lockowner_id); ida_destroy(&server->openowner_id); - nfs_free_iostats(server->io_stats); put_cred(server->cred); - kfree(server); nfs_release_automount_timer(); + call_rcu(&server->rcu, delayed_free); } EXPORT_SYMBOL_GPL(nfs_free_server); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index fa1a14def45c..d4a42ce0c7e3 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -156,8 +156,8 @@ static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state list = &flctx->flc_posix; spin_lock(&flctx->flc_lock); restart: - list_for_each_entry(fl, list, fl_list) { - if (nfs_file_open_context(fl->fl_file)->state != state) + for_each_file_lock(fl, list) { + if (nfs_file_open_context(fl->c.flc_file)->state != state) continue; spin_unlock(&flctx->flc_lock); status = nfs4_lock_delegation_recall(fl, state, stateid); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c8ecbe999059..ac505671efbd 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1431,9 +1431,9 @@ static bool nfs_verifier_is_delegated(struct dentry *dentry) static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf) { struct inode *inode = d_inode(dentry); - struct inode *dir = d_inode(dentry->d_parent); + struct inode *dir = d_inode_rcu(dentry->d_parent); - if (!nfs_verify_change_attribute(dir, verf)) + if (!dir || !nfs_verify_change_attribute(dir, verf)) return; if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) nfs_set_verifier_delegated(&verf); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 
8577ccf621f5..407c6e15afe2 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -720,15 +720,15 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status = 0; - unsigned int saved_type = fl->fl_type; + unsigned int saved_type = fl->c.flc_type; /* Try local locking first */ posix_test_lock(filp, fl); - if (fl->fl_type != F_UNLCK) { + if (fl->c.flc_type != F_UNLCK) { /* found a conflict */ goto out; } - fl->fl_type = saved_type; + fl->c.flc_type = saved_type; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) goto out_noconflict; @@ -740,7 +740,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) out: return status; out_noconflict: - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; goto out; } @@ -765,7 +765,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * If we're signalled while cleaning up locks on process exit, we * still need to complete the unlock. 
*/ - if (status < 0 && !(fl->fl_flags & FL_CLOSE)) + if (status < 0 && !(fl->c.flc_flags & FL_CLOSE)) return status; } @@ -832,12 +832,12 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) int is_local = 0; dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n", - filp, fl->fl_type, fl->fl_flags, + filp, fl->c.flc_type, fl->c.flc_flags, (long long)fl->fl_start, (long long)fl->fl_end); nfs_inc_stats(inode, NFSIOS_VFSLOCK); - if (fl->fl_flags & FL_RECLAIM) + if (fl->c.flc_flags & FL_RECLAIM) return -ENOGRACE; if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) @@ -851,7 +851,7 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) if (IS_GETLK(cmd)) ret = do_getlk(filp, cmd, fl, is_local); - else if (fl->fl_type == F_UNLCK) + else if (lock_is_unlock(fl)) ret = do_unlk(filp, cmd, fl, is_local); else ret = do_setlk(filp, cmd, fl, is_local); @@ -869,16 +869,16 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) int is_local = 0; dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n", - filp, fl->fl_type, fl->fl_flags); + filp, fl->c.flc_type, fl->c.flc_flags); - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; /* We're simulating flock() locks using posix locks on the server */ - if (fl->fl_type == F_UNLCK) + if (lock_is_unlock(fl)) return do_unlk(filp, cmd, fl, is_local); return do_setlk(filp, cmd, fl, is_local); } diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index b05717fe0d4e..2d1bfee225c3 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -274,12 +274,6 @@ static void nfs_netfs_free_request(struct netfs_io_request *rreq) put_nfs_open_context(rreq->netfs_priv); } -static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq) -{ - return fscache_begin_read_operation(&rreq->cache_resources, - netfs_i_cookie(netfs_inode(rreq->inode))); -} - static struct nfs_netfs_io_data *nfs_netfs_alloc(struct 
netfs_io_subrequest *sreq) { struct nfs_netfs_io_data *netfs; @@ -387,7 +381,6 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) const struct netfs_request_ops nfs_netfs_ops = { .init_request = nfs_netfs_init_request, .free_request = nfs_netfs_free_request, - .begin_cache_operation = nfs_netfs_begin_cache_operation, .issue_read = nfs_netfs_issue_read, .clamp_length = nfs_netfs_clamp_length }; diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 5407ab8c8783..e3cb4923316b 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -80,7 +80,7 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) } static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) { - netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops); + netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false); } extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 2de66e4e8280..cbbe3f0193b8 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -963,7 +963,7 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) struct nfs_open_context *ctx = nfs_file_open_context(filp); int status; - if (fl->fl_flags & FL_CLOSE) { + if (fl->c.flc_flags & FL_CLOSE) { l_ctx = nfs_get_lock_context(ctx); if (IS_ERR(l_ctx)) l_ctx = NULL; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 581698f1b7b2..6ff41ceb9f1c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -330,7 +330,7 @@ extern int update_open_stateid(struct nfs4_state *state, const nfs4_stateid *deleg_stateid, fmode_t fmode); extern int nfs4_proc_setlease(struct file *file, int arg, - struct file_lock **lease, void **priv); + struct file_lease **lease, void **priv); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); extern void nfs4_update_changeattr(struct inode *dir, diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 
e238abc78a13..1cd9652f3c28 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -439,7 +439,7 @@ void nfs42_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ -static int nfs4_setlease(struct file *file, int arg, struct file_lock **lease, +static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease, void **priv) { return nfs4_proc_setlease(file, arg, lease, priv); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 23819a756508..815996cb27fc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6800,7 +6800,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); switch (status) { case 0: - request->fl_type = F_UNLCK; + request->c.flc_type = F_UNLCK; break; case -NFS4ERR_DENIED: status = 0; @@ -7018,8 +7018,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, /* Ensure this is an unlock - when canceling a lock, the * canceled lock is passed in, and it won't be an unlock. */ - fl->fl_type = F_UNLCK; - if (fl->fl_flags & FL_CLOSE) + fl->c.flc_type = F_UNLCK; + if (fl->c.flc_flags & FL_CLOSE) set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags); data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); @@ -7045,11 +7045,11 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * struct rpc_task *task; struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); int status = 0; - unsigned char fl_flags = request->fl_flags; + unsigned char saved_flags = request->c.flc_flags; status = nfs4_set_lock_state(state, request); /* Unlock _before_ we do the RPC call */ - request->fl_flags |= FL_EXISTS; + request->c.flc_flags |= FL_EXISTS; /* Exclude nfs_delegation_claim_locks() */ mutex_lock(&sp->so_delegreturn_mutex); /* Exclude nfs4_reclaim_open_stateid() - note nesting! 
*/ @@ -7073,14 +7073,16 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = -ENOMEM; if (IS_ERR(seqid)) goto out; - task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid); + task = nfs4_do_unlck(request, + nfs_file_open_context(request->c.flc_file), + lsp, seqid); status = PTR_ERR(task); if (IS_ERR(task)) goto out; status = rpc_wait_for_completion_task(task); rpc_put_task(task); out: - request->fl_flags = fl_flags; + request->c.flc_flags = saved_flags; trace_nfs4_unlock(request, state, F_SETLK, status); return status; } @@ -7191,7 +7193,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)), data->timestamp); if (data->arg.new_lock && !data->cancelled) { - data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); + data->fl.c.flc_flags &= ~(FL_SLEEP | FL_ACCESS); if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) goto out_restart; } @@ -7292,7 +7294,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; - data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), + data = nfs4_alloc_lockdata(fl, + nfs_file_open_context(fl->c.flc_file), fl->fl_u.nfs4_fl.owner, GFP_KERNEL); if (data == NULL) return -ENOMEM; @@ -7398,10 +7401,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock { struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs4_state_owner *sp = state->owner; - unsigned char fl_flags = request->fl_flags; + unsigned char flags = request->c.flc_flags; int status; - request->fl_flags |= FL_ACCESS; + request->c.flc_flags |= FL_ACCESS; status = locks_lock_inode_wait(state->inode, request); if (status < 0) goto out; @@ -7410,7 +7413,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock if (test_bit(NFS_DELEGATED_STATE, 
&state->flags)) { /* Yes: cache locks! */ /* ...but avoid races with delegation recall... */ - request->fl_flags = fl_flags & ~FL_SLEEP; + request->c.flc_flags = flags & ~FL_SLEEP; status = locks_lock_inode_wait(state->inode, request); up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); @@ -7420,7 +7423,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock mutex_unlock(&sp->so_delegreturn_mutex); status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); out: - request->fl_flags = fl_flags; + request->c.flc_flags = flags; return status; } @@ -7562,7 +7565,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) return -EINVAL; - if (request->fl_type == F_UNLCK) { + if (lock_is_unlock(request)) { if (state != NULL) return nfs4_proc_unlck(state, cmd, request); return 0; @@ -7571,7 +7574,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (state == NULL) return -ENOLCK; - if ((request->fl_flags & FL_POSIX) && + if ((request->c.flc_flags & FL_POSIX) && !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) return -ENOLCK; @@ -7579,7 +7582,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) * Don't rely on the VFS having checked the file open mode, * since it won't do this for flock() locks. 
*/ - switch (request->fl_type) { + switch (request->c.flc_type) { case F_RDLCK: if (!(filp->f_mode & FMODE_READ)) return -EBADF; @@ -7601,7 +7604,7 @@ static int nfs4_delete_lease(struct file *file, void **priv) return generic_setlease(file, F_UNLCK, NULL, priv); } -static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease, +static int nfs4_add_lease(struct file *file, int arg, struct file_lease **lease, void **priv) { struct inode *inode = file_inode(file); @@ -7619,7 +7622,7 @@ static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease, return -EAGAIN; } -int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease, +int nfs4_proc_setlease(struct file *file, int arg, struct file_lease **lease, void **priv) { switch (arg) { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9a5d911a7edc..8cfabdbda336 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -847,15 +847,15 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) */ static struct nfs4_lock_state * __nfs4_find_lock_state(struct nfs4_state *state, - fl_owner_t fl_owner, fl_owner_t fl_owner2) + fl_owner_t owner, fl_owner_t owner2) { struct nfs4_lock_state *pos, *ret = NULL; list_for_each_entry(pos, &state->lock_states, ls_locks) { - if (pos->ls_owner == fl_owner) { + if (pos->ls_owner == owner) { ret = pos; break; } - if (pos->ls_owner == fl_owner2) + if (pos->ls_owner == owner2) ret = pos; } if (ret) @@ -868,7 +868,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, * exists, return an uninitialized one. 
* */ -static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t owner) { struct nfs4_lock_state *lsp; struct nfs_server *server = state->owner->so_server; @@ -879,7 +879,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f nfs4_init_seqid_counter(&lsp->ls_seqid); refcount_set(&lsp->ls_count, 1); lsp->ls_state = state; - lsp->ls_owner = fl_owner; + lsp->ls_owner = owner; lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT); if (lsp->ls_seqid.owner_id < 0) goto out_free; @@ -980,7 +980,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) if (fl->fl_ops != NULL) return 0; - lsp = nfs4_get_lock_state(state, fl->fl_owner); + lsp = nfs4_get_lock_state(state, fl->c.flc_owner); if (lsp == NULL) return -ENOMEM; fl->fl_u.nfs4_fl.owner = lsp; @@ -993,7 +993,7 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, const struct nfs_lock_context *l_ctx) { struct nfs4_lock_state *lsp; - fl_owner_t fl_owner, fl_flock_owner; + fl_owner_t owner, fl_flock_owner; int ret = -ENOENT; if (l_ctx == NULL) @@ -1002,11 +1002,11 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) goto out; - fl_owner = l_ctx->lockowner; + owner = l_ctx->lockowner; fl_flock_owner = l_ctx->open_context->flock_owner; spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner); + lsp = __nfs4_find_lock_state(state, owner, fl_flock_owner); if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) ret = -EIO; else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { @@ -1529,8 +1529,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ down_write(&nfsi->rwsem); spin_lock(&flctx->flc_lock); restart: - list_for_each_entry(fl, list, fl_list) { - if 
(nfs_file_open_context(fl->fl_file)->state != state) + for_each_file_lock(fl, list) { + if (nfs_file_open_context(fl->c.flc_file)->state != state) continue; spin_unlock(&flctx->flc_lock); status = ops->recover_lock(state, fl); diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index d27919d7241d..fd7cb15b08b2 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -699,7 +699,7 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, __entry->error = error < 0 ? -error : 0; __entry->cmd = cmd; - __entry->type = request->fl_type; + __entry->type = request->c.flc_type; __entry->start = request->fl_start; __entry->end = request->fl_end; __entry->dev = inode->i_sb->s_dev; @@ -771,7 +771,7 @@ TRACE_EVENT(nfs4_set_lock, __entry->error = error < 0 ? -error : 0; __entry->cmd = cmd; - __entry->type = request->fl_type; + __entry->type = request->c.flc_type; __entry->start = request->fl_start; __entry->end = request->fl_end; __entry->dev = inode->i_sb->s_dev; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 69406e60f391..1416099dfcd1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1305,7 +1305,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct static inline int nfs4_lock_type(struct file_lock *fl, int block) { - if (fl->fl_type == F_RDLCK) + if (lock_is_read(fl)) return block ? NFS4_READW_LT : NFS4_READ_LT; return block ? 
NFS4_WRITEW_LT : NFS4_WRITE_LT; } @@ -5052,10 +5052,10 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_end = fl->fl_start + (loff_t)length - 1; if (length == ~(uint64_t)0) fl->fl_end = OFFSET_MAX; - fl->fl_type = F_WRLCK; + fl->c.flc_type = F_WRLCK; if (type & 1) - fl->fl_type = F_RDLCK; - fl->fl_pid = 0; + fl->c.flc_type = F_RDLCK; + fl->c.flc_pid = 0; } p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */ namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bb79d3a886ae..84bb85264572 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1301,7 +1301,7 @@ static bool is_whole_file_wrlock(struct file_lock *fl) { return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX && - fl->fl_type == F_WRLCK; + lock_is_write(fl); } /* If we know the page is up to date, and we're not using byte range locks (or @@ -1335,13 +1335,13 @@ static int nfs_can_extend_write(struct file *file, struct folio *folio, spin_lock(&flctx->flc_lock); if (!list_empty(&flctx->flc_posix)) { fl = list_first_entry(&flctx->flc_posix, struct file_lock, - fl_list); + c.flc_list); if (is_whole_file_wrlock(fl)) ret = 1; } else if (!list_empty(&flctx->flc_flock)) { fl = list_first_entry(&flctx->flc_flock, struct file_lock, - fl_list); - if (fl->fl_type == F_WRLCK) + c.flc_list); + if (lock_is_write(fl)) ret = 1; } spin_unlock(&flctx->flc_lock); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 9cb7f0c33df5..b86d8494052c 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -662,8 +662,8 @@ nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg, struct file_lock *fl = data; /* Only close files for F_SETLEASE leases */ - if (fl->fl_flags & FL_LEASE) - nfsd_file_close_inode(file_inode(fl->fl_file)); + if (fl->c.flc_flags & FL_LEASE) + nfsd_file_close_inode(file_inode(fl->c.flc_file)); return 0; } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c 
index 926c29879c6a..32d23ef3e5de 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -674,7 +674,7 @@ static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req, const struct nfsd4_callback *cb = data; const struct nfsd4_blocked_lock *nbl = container_of(cb, struct nfsd4_blocked_lock, nbl_cb); - struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner; + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.c.flc_owner; struct nfs4_cb_compound_hdr hdr = { .ident = 0, .minorversion = cb->cb_clp->cl_minorversion, diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 5e8096bc5eaa..4c0d00bdfbb1 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -25,7 +25,7 @@ static struct kmem_cache *nfs4_layout_cache; static struct kmem_cache *nfs4_layout_stateid_cache; static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; -static const struct lock_manager_operations nfsd4_layouts_lm_ops; +static const struct lease_manager_operations nfsd4_layouts_lm_ops; const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { #ifdef CONFIG_NFSD_FLEXFILELAYOUT @@ -170,7 +170,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) spin_unlock(&fp->fi_lock); if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) - vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls); + kernel_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls); nfsd_file_put(ls->ls_file); if (ls->ls_recalled) @@ -182,27 +182,26 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) static int nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) { - struct file_lock *fl; + struct file_lease *fl; int status; if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) return 0; - fl = locks_alloc_lock(); + fl = locks_alloc_lease(); if (!fl) return -ENOMEM; - locks_init_lock(fl); + locks_init_lease(fl); fl->fl_lmops = &nfsd4_layouts_lm_ops; - fl->fl_flags = FL_LAYOUT; - fl->fl_type = F_RDLCK; - fl->fl_end = OFFSET_MAX; - 
fl->fl_owner = ls; - fl->fl_pid = current->tgid; - fl->fl_file = ls->ls_file->nf_file; - - status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL); + fl->c.flc_flags = FL_LAYOUT; + fl->c.flc_type = F_RDLCK; + fl->c.flc_owner = ls; + fl->c.flc_pid = current->tgid; + fl->c.flc_file = ls->ls_file->nf_file; + + status = kernel_setlease(fl->c.flc_file, fl->c.flc_type, &fl, NULL); if (status) { - locks_free_lock(fl); + locks_free_lease(fl); return status; } BUG_ON(fl != NULL); @@ -723,7 +722,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = { }; static bool -nfsd4_layout_lm_break(struct file_lock *fl) +nfsd4_layout_lm_break(struct file_lease *fl) { /* * We don't want the locks code to timeout the lease for us; @@ -731,19 +730,19 @@ nfsd4_layout_lm_break(struct file_lock *fl) * in time: */ fl->fl_break_time = 0; - nfsd4_recall_file_layout(fl->fl_owner); + nfsd4_recall_file_layout(fl->c.flc_owner); return false; } static int -nfsd4_layout_lm_change(struct file_lock *onlist, int arg, +nfsd4_layout_lm_change(struct file_lease *onlist, int arg, struct list_head *dispose) { BUG_ON(!(arg & F_UNLCK)); return lease_modify(onlist, arg, dispose); } -static const struct lock_manager_operations nfsd4_layouts_lm_ops = { +static const struct lease_manager_operations nfsd4_layouts_lm_ops = { .lm_break = nfsd4_layout_lm_break, .lm_change = nfsd4_layout_lm_change, }; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2fa54cfd4882..9257425cbd1a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1249,7 +1249,7 @@ static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) WARN_ON_ONCE(!fp->fi_delegees); - vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); + kernel_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); put_deleg_file(fp); } @@ -4922,9 +4922,9 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) /* Called from break_lease() with flc_lock held. 
*/ static bool -nfsd_break_deleg_cb(struct file_lock *fl) +nfsd_break_deleg_cb(struct file_lease *fl) { - struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; + struct nfs4_delegation *dp = (struct nfs4_delegation *) fl->c.flc_owner; struct nfs4_file *fp = dp->dl_stid.sc_file; struct nfs4_client *clp = dp->dl_stid.sc_client; struct nfsd_net *nn; @@ -4945,10 +4945,8 @@ nfsd_break_deleg_cb(struct file_lock *fl) */ fl->fl_break_time = 0; - spin_lock(&fp->fi_lock); fp->fi_had_conflict = true; nfsd_break_one_deleg(dp); - spin_unlock(&fp->fi_lock); return false; } @@ -4960,9 +4958,9 @@ nfsd_break_deleg_cb(struct file_lock *fl) * %true: Lease conflict was resolved * %false: Lease conflict was not resolved. */ -static bool nfsd_breaker_owns_lease(struct file_lock *fl) +static bool nfsd_breaker_owns_lease(struct file_lease *fl) { - struct nfs4_delegation *dl = fl->fl_owner; + struct nfs4_delegation *dl = fl->c.flc_owner; struct svc_rqst *rqst; struct nfs4_client *clp; @@ -4977,10 +4975,10 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl) } static int -nfsd_change_deleg_cb(struct file_lock *onlist, int arg, +nfsd_change_deleg_cb(struct file_lease *onlist, int arg, struct list_head *dispose) { - struct nfs4_delegation *dp = (struct nfs4_delegation *)onlist->fl_owner; + struct nfs4_delegation *dp = (struct nfs4_delegation *) onlist->c.flc_owner; struct nfs4_client *clp = dp->dl_stid.sc_client; if (arg & F_UNLCK) { @@ -4991,7 +4989,7 @@ nfsd_change_deleg_cb(struct file_lock *onlist, int arg, return -EAGAIN; } -static const struct lock_manager_operations nfsd_lease_mng_ops = { +static const struct lease_manager_operations nfsd_lease_mng_ops = { .lm_breaker_owns_lease = nfsd_breaker_owns_lease, .lm_break = nfsd_break_deleg_cb, .lm_change = nfsd_change_deleg_cb, @@ -5331,21 +5329,20 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; } -static struct file_lock 
*nfs4_alloc_init_lease(struct nfs4_delegation *dp, +static struct file_lease *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag) { - struct file_lock *fl; + struct file_lease *fl; - fl = locks_alloc_lock(); + fl = locks_alloc_lease(); if (!fl) return NULL; fl->fl_lmops = &nfsd_lease_mng_ops; - fl->fl_flags = FL_DELEG; - fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; - fl->fl_end = OFFSET_MAX; - fl->fl_owner = (fl_owner_t)dp; - fl->fl_pid = current->tgid; - fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; + fl->c.flc_flags = FL_DELEG; + fl->c.flc_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; + fl->c.flc_owner = (fl_owner_t)dp; + fl->c.flc_pid = current->tgid; + fl->c.flc_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; return fl; } @@ -5463,7 +5460,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; struct nfs4_delegation *dp; struct nfsd_file *nf = NULL; - struct file_lock *fl; + struct file_lease *fl; u32 dl_type; /* @@ -5533,9 +5530,10 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (!fl) goto out_clnt_odstate; - status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL); + status = kernel_setlease(fp->fi_deleg_file->nf_file, + fl->c.flc_type, &fl, NULL); if (fl) - locks_free_lock(fl); + locks_free_lease(fl); if (status) goto out_clnt_odstate; @@ -5557,12 +5555,13 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (status) goto out_unlock; + status = -EAGAIN; + if (fp->fi_had_conflict) + goto out_unlock; + spin_lock(&state_lock); spin_lock(&fp->fi_lock); - if (fp->fi_had_conflict) - status = -EAGAIN; - else - status = hash_delegation_locked(dp, fp); + status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); @@ -5571,7 +5570,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, return dp; 
out_unlock: - vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); + kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); out_clnt_odstate: put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_stid(&dp->dl_stid); @@ -7149,7 +7148,7 @@ nfsd4_lm_put_owner(fl_owner_t owner) static bool nfsd4_lm_lock_expirable(struct file_lock *cfl) { - struct nfs4_lockowner *lo = (struct nfs4_lockowner *)cfl->fl_owner; + struct nfs4_lockowner *lo = (struct nfs4_lockowner *) cfl->c.flc_owner; struct nfs4_client *clp = lo->lo_owner.so_client; struct nfsd_net *nn; @@ -7171,7 +7170,7 @@ nfsd4_lm_expire_lock(void) static void nfsd4_lm_notify(struct file_lock *fl) { - struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner; + struct nfs4_lockowner *lo = (struct nfs4_lockowner *) fl->c.flc_owner; struct net *net = lo->lo_owner.so_client->net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct nfsd4_blocked_lock *nbl = container_of(fl, @@ -7208,7 +7207,7 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) struct nfs4_lockowner *lo; if (fl->fl_lmops == &nfsd_posix_mng_ops) { - lo = (struct nfs4_lockowner *) fl->fl_owner; + lo = (struct nfs4_lockowner *) fl->c.flc_owner; xdr_netobj_dup(&deny->ld_owner, &lo->lo_owner.so_owner, GFP_KERNEL); if (!deny->ld_owner.data) @@ -7227,7 +7226,7 @@ nevermind: if (fl->fl_end != NFS4_MAX_UINT64) deny->ld_length = fl->fl_end - fl->fl_start + 1; deny->ld_type = NFS4_READ_LT; - if (fl->fl_type != F_RDLCK) + if (fl->c.flc_type != F_RDLCK) deny->ld_type = NFS4_WRITE_LT; } @@ -7493,8 +7492,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int lkflg; int err; bool new = false; - unsigned char fl_type; - unsigned int fl_flags = FL_POSIX; + unsigned char type; + unsigned int flags = FL_POSIX; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -7557,14 +7556,14 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state 
*cstate, goto out; if (lock->lk_reclaim) - fl_flags |= FL_RECLAIM; + flags |= FL_RECLAIM; fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: if (nfsd4_has_session(cstate) || exportfs_lock_op_is_async(sb->s_export_op)) - fl_flags |= FL_SLEEP; + flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: spin_lock(&fp->fi_lock); @@ -7572,12 +7571,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); spin_unlock(&fp->fi_lock); - fl_type = F_RDLCK; + type = F_RDLCK; break; case NFS4_WRITEW_LT: if (nfsd4_has_session(cstate) || exportfs_lock_op_is_async(sb->s_export_op)) - fl_flags |= FL_SLEEP; + flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); @@ -7585,7 +7584,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); spin_unlock(&fp->fi_lock); - fl_type = F_WRLCK; + type = F_WRLCK; break; default: status = nfserr_inval; @@ -7605,7 +7604,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * on those filesystems: */ if (!exportfs_lock_op_is_async(sb->s_export_op)) - fl_flags &= ~FL_SLEEP; + flags &= ~FL_SLEEP; nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); if (!nbl) { @@ -7615,11 +7614,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } file_lock = &nbl->nbl_lock; - file_lock->fl_type = fl_type; - file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); - file_lock->fl_pid = current->tgid; - file_lock->fl_file = nf->nf_file; - file_lock->fl_flags = fl_flags; + file_lock->c.flc_type = type; + file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); + file_lock->c.flc_pid = current->tgid; + file_lock->c.flc_file = nf->nf_file; + file_lock->c.flc_flags = flags; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = lock->lk_offset; 
file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); @@ -7632,7 +7631,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - if (fl_flags & FL_SLEEP) { + if (flags & FL_SLEEP) { nbl->nbl_time = ktime_get_boottime_seconds(); spin_lock(&nn->blocked_locks_lock); list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); @@ -7669,7 +7668,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: if (nbl) { /* dequeue it if we queued it before */ - if (fl_flags & FL_SLEEP) { + if (flags & FL_SLEEP) { spin_lock(&nn->blocked_locks_lock); if (!list_empty(&nbl->nbl_list) && !list_empty(&nbl->nbl_lru)) { @@ -7737,9 +7736,9 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct err = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (err) goto out; - lock->fl_file = nf->nf_file; + lock->c.flc_file = nf->nf_file; err = nfserrno(vfs_test_lock(nf->nf_file, lock)); - lock->fl_file = NULL; + lock->c.flc_file = NULL; out: inode_unlock(inode); nfsd_file_put(nf); @@ -7784,11 +7783,11 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (lockt->lt_type) { case NFS4_READ_LT: case NFS4_READW_LT: - file_lock->fl_type = F_RDLCK; + file_lock->c.flc_type = F_RDLCK; break; case NFS4_WRITE_LT: case NFS4_WRITEW_LT: - file_lock->fl_type = F_WRLCK; + file_lock->c.flc_type = F_WRLCK; break; default: dprintk("NFSD: nfs4_lockt: bad lock type!\n"); @@ -7798,9 +7797,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, lo = find_lockowner_str(cstate->clp, &lockt->lt_owner); if (lo) - file_lock->fl_owner = (fl_owner_t)lo; - file_lock->fl_pid = current->tgid; - file_lock->fl_flags = FL_POSIX; + file_lock->c.flc_owner = (fl_owner_t)lo; + file_lock->c.flc_pid = current->tgid; + file_lock->c.flc_flags = FL_POSIX; file_lock->fl_start = lockt->lt_offset; file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); @@ -7811,7 +7810,7 
@@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - if (file_lock->fl_type != F_UNLCK) { + if (file_lock->c.flc_type != F_UNLCK) { status = nfserr_denied; nfs4_set_lock_denied(file_lock, &lockt->lt_denied); } @@ -7867,11 +7866,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto put_file; } - file_lock->fl_type = F_UNLCK; - file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); - file_lock->fl_pid = current->tgid; - file_lock->fl_file = nf->nf_file; - file_lock->fl_flags = FL_POSIX; + file_lock->c.flc_type = F_UNLCK; + file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); + file_lock->c.flc_pid = current->tgid; + file_lock->c.flc_file = nf->nf_file; + file_lock->c.flc_flags = FL_POSIX; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = locku->lu_offset; @@ -7911,14 +7910,16 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock *fl; int status = false; - struct nfsd_file *nf = find_any_file(fp); + struct nfsd_file *nf; struct inode *inode; struct file_lock_context *flctx; + spin_lock(&fp->fi_lock); + nf = find_any_file_locked(fp); if (!nf) { /* Any valid lock stateid should have some sort of access */ WARN_ON_ONCE(1); - return status; + goto out; } inode = file_inode(nf->nf_file); @@ -7926,15 +7927,16 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) if (flctx && !list_empty_careful(&flctx->flc_posix)) { spin_lock(&flctx->flc_lock); - list_for_each_entry(fl, &flctx->flc_posix, fl_list) { - if (fl->fl_owner == (fl_owner_t)lowner) { + for_each_file_lock(fl, &flctx->flc_posix) { + if (fl->c.flc_owner == (fl_owner_t)lowner) { status = true; break; } } spin_unlock(&flctx->flc_lock); } - nfsd_file_put(nf); +out: + spin_unlock(&fp->fi_lock); return status; } @@ -7944,10 +7946,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) * @cstate: 
NFSv4 COMPOUND state * @u: RELEASE_LOCKOWNER arguments * - * The lockowner's so_count is bumped when a lock record is added - * or when copying a conflicting lock. The latter case is brief, - * but can lead to fleeting false positives when looking for - * locks-in-use. + * Check if there are any locks still held and if not - free the lockowner + * and any lock state that is owned. * * Return values: * %nfs_ok: lockowner released or not found @@ -7983,10 +7983,13 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, spin_unlock(&clp->cl_lock); return nfs_ok; } - if (atomic_read(&lo->lo_owner.so_count) != 2) { - spin_unlock(&clp->cl_lock); - nfs4_put_stateowner(&lo->lo_owner); - return nfserr_locks_held; + + list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { + if (check_for_locks(stp->st_stid.sc_file, lo)) { + spin_unlock(&clp->cl_lock); + nfs4_put_stateowner(&lo->lo_owner); + return nfserr_locks_held; + } } unhash_lockowner_locked(lo); while (!list_empty(&lo->lo_owner.so_stateids)) { @@ -8448,15 +8451,17 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) { __be32 status; struct file_lock_context *ctx; - struct file_lock *fl; + struct file_lease *fl; struct nfs4_delegation *dp; ctx = locks_inode_context(inode); if (!ctx) return 0; spin_lock(&ctx->flc_lock); - list_for_each_entry(fl, &ctx->flc_lease, fl_list) { - if (fl->fl_flags == FL_LAYOUT) + for_each_file_lock(fl, &ctx->flc_lease) { + unsigned char type = fl->c.flc_type; + + if (fl->c.flc_flags == FL_LAYOUT) continue; if (fl->fl_lmops != &nfsd_lease_mng_ops) { /* @@ -8464,12 +8469,12 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) * we are done; there isn't any write delegation * on this inode */ - if (fl->fl_type == F_RDLCK) + if (type == F_RDLCK) break; goto break_lease; } - if (fl->fl_type == F_WRLCK) { - dp = fl->fl_owner; + if (type == F_WRLCK) { + dp = fl->c.flc_owner; if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { 
spin_unlock(&ctx->flc_lock); return 0; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index bec33b89a075..0e3fc5ba33c7 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -107,7 +107,13 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) nilfs_transaction_commit(inode->i_sb); mapped: - folio_wait_stable(folio); + /* + * Since checksumming including data blocks is performed to determine + * the validity of the log to be written and used for recovery, it is + * necessary to wait for writeback to finish here, regardless of the + * stable write requirement of the backing device. + */ + folio_wait_writeback(folio); out: sb_end_pagefault(inode->i_sb); return vmf_fs_error(ret); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 0955b657938f..a9b8d77c8c1d 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -472,9 +472,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, - struct page *page) + loff_t pos, struct page *page) { struct buffer_head *bh_org; + size_t from = pos & ~PAGE_MASK; void *kaddr; bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); @@ -482,7 +483,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, return -EIO; kaddr = kmap_atomic(page); - memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); + memcpy(kaddr + from, bh_org->b_data, bh_org->b_size); kunmap_atomic(kaddr); brelse(bh_org); return 0; @@ -521,7 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_inode; } - err = nilfs_recovery_copy_block(nilfs, rb, page); + err = nilfs_recovery_copy_block(nilfs, rb, pos, page); if (unlikely(err)) goto failed_page; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 2590a0860eab..2bfb08052d39 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct 
nilfs_sc_info *sci) list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - set_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { if (bh->b_folio != bd_folio) { folio_lock(bd_folio); @@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) } break; } + set_buffer_async_write(bh); if (bh->b_folio != fs_folio) { nilfs_begin_folio_io(fs_folio); fs_folio = bh->b_folio; @@ -1800,7 +1800,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - clear_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { clear_buffer_uptodate(bh); if (bh->b_folio != bd_folio) { @@ -1809,6 +1808,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) } break; } + clear_buffer_async_write(bh); if (bh->b_folio != fs_folio) { nilfs_end_folio_io(fs_folio, err); fs_folio = bh->b_folio; @@ -1896,8 +1896,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) BIT(BH_Delay) | BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Redirected)); - set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh == segbuf->sb_super_root) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); if (bh->b_folio != bd_folio) { folio_end_writeback(bd_folio); bd_folio = bh->b_folio; @@ -1905,6 +1906,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) update_sr = true; break; } + set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh->b_folio != fs_folio) { nilfs_end_folio_io(fs_folio, 0); fs_folio = bh->b_folio; diff --git a/fs/nsfs.c b/fs/nsfs.c index 34e1e3e36733..7aaafb5cb9fc 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -27,26 +27,17 @@ static const struct file_operations ns_file_operations = { static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = d_inode(dentry); - const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + struct ns_common *ns = inode->i_private; + const struct 
proc_ns_operations *ns_ops = ns->ops; return dynamic_dname(buffer, buflen, "%s:[%lu]", ns_ops->name, inode->i_ino); } -static void ns_prune_dentry(struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - if (inode) { - struct ns_common *ns = inode->i_private; - atomic_long_set(&ns->stashed, 0); - } -} - -const struct dentry_operations ns_dentry_operations = -{ - .d_prune = ns_prune_dentry, +const struct dentry_operations ns_dentry_operations = { .d_delete = always_delete_dentry, .d_dname = ns_dname, + .d_prune = stashed_dentry_prune, }; static void nsfs_evict(struct inode *inode) @@ -56,67 +47,16 @@ static void nsfs_evict(struct inode *inode) ns->ops->put(ns); } -static int __ns_get_path(struct path *path, struct ns_common *ns) -{ - struct vfsmount *mnt = nsfs_mnt; - struct dentry *dentry; - struct inode *inode; - unsigned long d; - - rcu_read_lock(); - d = atomic_long_read(&ns->stashed); - if (!d) - goto slow; - dentry = (struct dentry *)d; - if (!lockref_get_not_dead(&dentry->d_lockref)) - goto slow; - rcu_read_unlock(); - ns->ops->put(ns); -got_it: - path->mnt = mntget(mnt); - path->dentry = dentry; - return 0; -slow: - rcu_read_unlock(); - inode = new_inode_pseudo(mnt->mnt_sb); - if (!inode) { - ns->ops->put(ns); - return -ENOMEM; - } - inode->i_ino = ns->inum; - simple_inode_init_ts(inode); - inode->i_flags |= S_IMMUTABLE; - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &ns_file_operations; - inode->i_private = ns; - - dentry = d_make_root(inode); /* not the normal use, but... 
*/ - if (!dentry) - return -ENOMEM; - dentry->d_fsdata = (void *)ns->ops; - d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); - if (d) { - d_delete(dentry); /* make sure ->d_prune() does nothing */ - dput(dentry); - cpu_relax(); - return -EAGAIN; - } - goto got_it; -} - int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, void *private_data) { - int ret; + struct ns_common *ns; - do { - struct ns_common *ns = ns_get_cb(private_data); - if (!ns) - return -ENOENT; - ret = __ns_get_path(path, ns); - } while (ret == -EAGAIN); + ns = ns_get_cb(private_data); + if (!ns) + return -ENOENT; - return ret; + return path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt, ns, path); } struct ns_get_path_task_args { @@ -146,6 +86,7 @@ int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)) { struct path path = {}; + struct ns_common *relative; struct file *f; int err; int fd; @@ -154,19 +95,15 @@ int open_related_ns(struct ns_common *ns, if (fd < 0) return fd; - do { - struct ns_common *relative; - - relative = get_ns(ns); - if (IS_ERR(relative)) { - put_unused_fd(fd); - return PTR_ERR(relative); - } - - err = __ns_get_path(&path, relative); - } while (err == -EAGAIN); + relative = get_ns(ns); + if (IS_ERR(relative)) { + put_unused_fd(fd); + return PTR_ERR(relative); + } - if (err) { + err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt, + relative, &path); + if (err < 0) { put_unused_fd(fd); return err; } @@ -249,7 +186,8 @@ bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino) static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + const struct ns_common *ns = inode->i_private; + const struct proc_ns_operations *ns_ops = ns->ops; seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); return 0; @@ -261,6 +199,24 @@ static const struct super_operations nsfs_ops = { 
.show_path = nsfs_show_path, }; +static void nsfs_init_inode(struct inode *inode, void *data) +{ + inode->i_private = data; + inode->i_mode |= S_IRUGO; + inode->i_fop = &ns_file_operations; +} + +static void nsfs_put_data(void *data) +{ + struct ns_common *ns = data; + ns->ops->put(ns); +} + +static const struct stashed_operations nsfs_stashed_ops = { + .init_inode = nsfs_init_inode, + .put_data = nsfs_put_data, +}; + static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); @@ -268,6 +224,7 @@ static int nsfs_init_fs_context(struct fs_context *fc) return -ENOMEM; ctx->ops = &nsfs_ops; ctx->dops = &ns_dentry_operations; + fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; } diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig deleted file mode 100644 index 7b2509741735..000000000000 --- a/fs/ntfs/Kconfig +++ /dev/null @@ -1,81 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -config NTFS_FS - tristate "NTFS file system support" - select BUFFER_HEAD - select NLS - help - NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. - - Saying Y or M here enables read support. There is partial, but - safe, write support available. For write support you must also - say Y to "NTFS write support" below. - - There are also a number of user-space tools available, called - ntfsprogs. These include ntfsundelete and ntfsresize, that work - without NTFS support enabled in the kernel. - - This is a rewrite from scratch of Linux NTFS support and replaced - the old NTFS code starting with Linux 2.5.11. A backport to - the Linux 2.4 kernel series is separately available as a patch - from the project web site. - - For more information see <file:Documentation/filesystems/ntfs.rst> - and <http://www.linux-ntfs.org/>. - - To compile this file system support as a module, choose M here: the - module will be called ntfs. 
- - If you are not using Windows NT, 2000, XP or 2003 in addition to - Linux on your computer it is safe to say N. - -config NTFS_DEBUG - bool "NTFS debugging support" - depends on NTFS_FS - help - If you are experiencing any problems with the NTFS file system, say - Y here. This will result in additional consistency checks to be - performed by the driver as well as additional debugging messages to - be written to the system log. Note that debugging messages are - disabled by default. To enable them, supply the option debug_msgs=1 - at the kernel command line when booting the kernel or as an option - to insmod when loading the ntfs module. Once the driver is active, - you can enable debugging messages by doing (as root): - echo 1 > /proc/sys/fs/ntfs-debug - Replacing the "1" with "0" would disable debug messages. - - If you leave debugging messages disabled, this results in little - overhead, but enabling debug messages results in very significant - slowdown of the system. - - When reporting bugs, please try to have available a full dump of - debugging messages while the misbehaviour was occurring. - -config NTFS_RW - bool "NTFS write support" - depends on NTFS_FS - depends on PAGE_SIZE_LESS_THAN_64KB - help - This enables the partial, but safe, write support in the NTFS driver. - - The only supported operation is overwriting existing files, without - changing the file length. No file or directory creation, deletion or - renaming is possible. Note only non-resident files can be written to - so you may find that some very small files (<500 bytes or so) cannot - be written to. - - While we cannot guarantee that it will not damage any data, we have - so far not received a single report where the driver would have - damaged someones data so we assume it is perfectly safe to use. 
- - Note: While write support is safe in this version (a rewrite from - scratch of the NTFS support), it should be noted that the old NTFS - write support, included in Linux 2.5.10 and before (since 1997), - is not safe. - - This is currently useful with TopologiLinux. TopologiLinux is run - on top of any DOS/Microsoft Windows system without partitioning your - hard disk. Unlike other Linux distributions TopologiLinux does not - need its own partition. For more information see - <http://topologi-linux.sourceforge.net/> - - It is perfectly safe to say N here. diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile deleted file mode 100644 index 3e736572ed00..000000000000 --- a/fs/ntfs/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Rules for making the NTFS driver. - -obj-$(CONFIG_NTFS_FS) += ntfs.o - -ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ - index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ - unistr.o upcase.o - -ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o - -ccflags-y := -DNTFS_VERSION=\"2.1.32\" -ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG -ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW - diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c deleted file mode 100644 index 2d01517a2d59..000000000000 --- a/fs/ntfs/aops.c +++ /dev/null @@ -1,1744 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * aops.c - NTFS kernel address space operations and page cache handling. - * - * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. 
- * Copyright (c) 2002 Richard Russon - */ - -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/gfp.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/swap.h> -#include <linux/buffer_head.h> -#include <linux/writeback.h> -#include <linux/bit_spinlock.h> -#include <linux/bio.h> - -#include "aops.h" -#include "attrib.h" -#include "debug.h" -#include "inode.h" -#include "mft.h" -#include "runlist.h" -#include "types.h" -#include "ntfs.h" - -/** - * ntfs_end_buffer_async_read - async io completion for reading attributes - * @bh: buffer head on which io is completed - * @uptodate: whether @bh is now uptodate or not - * - * Asynchronous I/O completion handler for reading pages belonging to the - * attribute address space of an inode. The inodes can either be files or - * directories or they can be fake inodes describing some attribute. - * - * If NInoMstProtected(), perform the post read mst fixups when all IO on the - * page has been completed and mark the page uptodate or set the error bit on - * the page. To determine the size of the records that need fixing up, we - * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs - * record size, and index_block_size_bits, to the log(base 2) of the ntfs - * record size. - */ -static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) -{ - unsigned long flags; - struct buffer_head *first, *tmp; - struct page *page; - struct inode *vi; - ntfs_inode *ni; - int page_uptodate = 1; - - page = bh->b_page; - vi = page->mapping->host; - ni = NTFS_I(vi); - - if (likely(uptodate)) { - loff_t i_size; - s64 file_ofs, init_size; - - set_buffer_uptodate(bh); - - file_ofs = ((s64)page->index << PAGE_SHIFT) + - bh_offset(bh); - read_lock_irqsave(&ni->size_lock, flags); - init_size = ni->initialized_size; - i_size = i_size_read(vi); - read_unlock_irqrestore(&ni->size_lock, flags); - if (unlikely(init_size > i_size)) { - /* Race with shrinking truncate. 
*/ - init_size = i_size; - } - /* Check for the current buffer head overflowing. */ - if (unlikely(file_ofs + bh->b_size > init_size)) { - int ofs; - void *kaddr; - - ofs = 0; - if (file_ofs < init_size) - ofs = init_size - file_ofs; - kaddr = kmap_atomic(page); - memset(kaddr + bh_offset(bh) + ofs, 0, - bh->b_size - ofs); - flush_dcache_page(page); - kunmap_atomic(kaddr); - } - } else { - clear_buffer_uptodate(bh); - SetPageError(page); - ntfs_error(ni->vol->sb, "Buffer I/O error, logical block " - "0x%llx.", (unsigned long long)bh->b_blocknr); - } - first = page_buffers(page); - spin_lock_irqsave(&first->b_uptodate_lock, flags); - clear_buffer_async_read(bh); - unlock_buffer(bh); - tmp = bh; - do { - if (!buffer_uptodate(tmp)) - page_uptodate = 0; - if (buffer_async_read(tmp)) { - if (likely(buffer_locked(tmp))) - goto still_busy; - /* Async buffers must be locked. */ - BUG(); - } - tmp = tmp->b_this_page; - } while (tmp != bh); - spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - /* - * If none of the buffers had errors then we can set the page uptodate, - * but we first have to perform the post read mst fixups, if the - * attribute is mst protected, i.e. if NInoMstProteced(ni) is true. - * Note we ignore fixup errors as those are detected when - * map_mft_record() is called which gives us per record granularity - * rather than per page granularity. - */ - if (!NInoMstProtected(ni)) { - if (likely(page_uptodate && !PageError(page))) - SetPageUptodate(page); - } else { - u8 *kaddr; - unsigned int i, recs; - u32 rec_size; - - rec_size = ni->itype.index.block_size; - recs = PAGE_SIZE / rec_size; - /* Should have been verified before we got here... 
*/ - BUG_ON(!recs); - kaddr = kmap_atomic(page); - for (i = 0; i < recs; i++) - post_read_mst_fixup((NTFS_RECORD*)(kaddr + - i * rec_size), rec_size); - kunmap_atomic(kaddr); - flush_dcache_page(page); - if (likely(page_uptodate && !PageError(page))) - SetPageUptodate(page); - } - unlock_page(page); - return; -still_busy: - spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - return; -} - -/** - * ntfs_read_block - fill a @folio of an address space with data - * @folio: page cache folio to fill with data - * - * We read each buffer asynchronously and when all buffers are read in, our io - * completion handler ntfs_end_buffer_read_async(), if required, automatically - * applies the mst fixups to the folio before finally marking it uptodate and - * unlocking it. - * - * We only enforce allocated_size limit because i_size is checked for in - * generic_file_read(). - * - * Return 0 on success and -errno on error. - * - * Contains an adapted version of fs/buffer.c::block_read_full_folio(). - */ -static int ntfs_read_block(struct folio *folio) -{ - loff_t i_size; - VCN vcn; - LCN lcn; - s64 init_size; - struct inode *vi; - ntfs_inode *ni; - ntfs_volume *vol; - runlist_element *rl; - struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; - sector_t iblock, lblock, zblock; - unsigned long flags; - unsigned int blocksize, vcn_ofs; - int i, nr; - unsigned char blocksize_bits; - - vi = folio->mapping->host; - ni = NTFS_I(vi); - vol = ni->vol; - - /* $MFT/$DATA must have its complete runlist in memory at all times. */ - BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni)); - - blocksize = vol->sb->s_blocksize; - blocksize_bits = vol->sb->s_blocksize_bits; - - head = folio_buffers(folio); - if (!head) - head = create_empty_buffers(folio, blocksize, 0); - bh = head; - - /* - * We may be racing with truncate. To avoid some of the problems we - * now take a snapshot of the various sizes and use those for the whole - * of the function. 
In case of an extending truncate it just means we - * may leave some buffers unmapped which are now allocated. This is - * not a problem since these buffers will just get mapped when a write - * occurs. In case of a shrinking truncate, we will detect this later - * on due to the runlist being incomplete and if the folio is being - * fully truncated, truncate will throw it away as soon as we unlock - * it so no need to worry what we do with it. - */ - iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); - read_lock_irqsave(&ni->size_lock, flags); - lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits; - init_size = ni->initialized_size; - i_size = i_size_read(vi); - read_unlock_irqrestore(&ni->size_lock, flags); - if (unlikely(init_size > i_size)) { - /* Race with shrinking truncate. */ - init_size = i_size; - } - zblock = (init_size + blocksize - 1) >> blocksize_bits; - - /* Loop through all the buffers in the folio. */ - rl = NULL; - nr = i = 0; - do { - int err = 0; - - if (unlikely(buffer_uptodate(bh))) - continue; - if (unlikely(buffer_mapped(bh))) { - arr[nr++] = bh; - continue; - } - bh->b_bdev = vol->sb->s_bdev; - /* Is the block within the allowed limits? */ - if (iblock < lblock) { - bool is_retry = false; - - /* Convert iblock into corresponding vcn and offset. */ - vcn = (VCN)iblock << blocksize_bits >> - vol->cluster_size_bits; - vcn_ofs = ((VCN)iblock << blocksize_bits) & - vol->cluster_size_mask; - if (!rl) { -lock_retry_remap: - down_read(&ni->runlist.lock); - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - } else - lcn = LCN_RL_NOT_MAPPED; - /* Successful remap. */ - if (lcn >= 0) { - /* Setup buffer head to correct block. */ - bh->b_blocknr = ((lcn << vol->cluster_size_bits) - + vcn_ofs) >> blocksize_bits; - set_buffer_mapped(bh); - /* Only read initialized data blocks. 
*/ - if (iblock < zblock) { - arr[nr++] = bh; - continue; - } - /* Fully non-initialized data block, zero it. */ - goto handle_zblock; - } - /* It is a hole, need to zero it. */ - if (lcn == LCN_HOLE) - goto handle_hole; - /* If first try and runlist unmapped, map and retry. */ - if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { - is_retry = true; - /* - * Attempt to map runlist, dropping lock for - * the duration. - */ - up_read(&ni->runlist.lock); - err = ntfs_map_runlist(ni, vcn); - if (likely(!err)) - goto lock_retry_remap; - rl = NULL; - } else if (!rl) - up_read(&ni->runlist.lock); - /* - * If buffer is outside the runlist, treat it as a - * hole. This can happen due to concurrent truncate - * for example. - */ - if (err == -ENOENT || lcn == LCN_ENOENT) { - err = 0; - goto handle_hole; - } - /* Hard error, zero out region. */ - if (!err) - err = -EIO; - bh->b_blocknr = -1; - folio_set_error(folio); - ntfs_error(vol->sb, "Failed to read from inode 0x%lx, " - "attribute type 0x%x, vcn 0x%llx, " - "offset 0x%x because its location on " - "disk could not be determined%s " - "(error code %i).", ni->mft_no, - ni->type, (unsigned long long)vcn, - vcn_ofs, is_retry ? " even after " - "retrying" : "", err); - } - /* - * Either iblock was outside lblock limits or - * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion - * of the folio and set the buffer uptodate. - */ -handle_hole: - bh->b_blocknr = -1UL; - clear_buffer_mapped(bh); -handle_zblock: - folio_zero_range(folio, i * blocksize, blocksize); - if (likely(!err)) - set_buffer_uptodate(bh); - } while (i++, iblock++, (bh = bh->b_this_page) != head); - - /* Release the lock if we took it. */ - if (rl) - up_read(&ni->runlist.lock); - - /* Check we have at least one buffer ready for i/o. */ - if (nr) { - struct buffer_head *tbh; - - /* Lock the buffers. 
*/ - for (i = 0; i < nr; i++) { - tbh = arr[i]; - lock_buffer(tbh); - tbh->b_end_io = ntfs_end_buffer_async_read; - set_buffer_async_read(tbh); - } - /* Finally, start i/o on the buffers. */ - for (i = 0; i < nr; i++) { - tbh = arr[i]; - if (likely(!buffer_uptodate(tbh))) - submit_bh(REQ_OP_READ, tbh); - else - ntfs_end_buffer_async_read(tbh, 1); - } - return 0; - } - /* No i/o was scheduled on any of the buffers. */ - if (likely(!folio_test_error(folio))) - folio_mark_uptodate(folio); - else /* Signal synchronous i/o error. */ - nr = -EIO; - folio_unlock(folio); - return nr; -} - -/** - * ntfs_read_folio - fill a @folio of a @file with data from the device - * @file: open file to which the folio @folio belongs or NULL - * @folio: page cache folio to fill with data - * - * For non-resident attributes, ntfs_read_folio() fills the @folio of the open - * file @file by calling the ntfs version of the generic block_read_full_folio() - * function, ntfs_read_block(), which in turn creates and reads in the buffers - * associated with the folio asynchronously. - * - * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the - * data from the mft record (which at this stage is most likely in memory) and - * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as - * even if the mft record is not cached at this point in time, we need to wait - * for it to be read in before we can do the copy. - * - * Return 0 on success and -errno on error. - */ -static int ntfs_read_folio(struct file *file, struct folio *folio) -{ - struct page *page = &folio->page; - loff_t i_size; - struct inode *vi; - ntfs_inode *ni, *base_ni; - u8 *addr; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *mrec; - unsigned long flags; - u32 attr_len; - int err = 0; - -retry_readpage: - BUG_ON(!PageLocked(page)); - vi = page->mapping->host; - i_size = i_size_read(vi); - /* Is the page fully outside i_size? 
(truncate in progress) */ - if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >> - PAGE_SHIFT)) { - zero_user(page, 0, PAGE_SIZE); - ntfs_debug("Read outside i_size - truncated?"); - goto done; - } - /* - * This can potentially happen because we clear PageUptodate() during - * ntfs_writepage() of MstProtected() attributes. - */ - if (PageUptodate(page)) { - unlock_page(page); - return 0; - } - ni = NTFS_I(vi); - /* - * Only $DATA attributes can be encrypted and only unnamed $DATA - * attributes can be compressed. Index root can have the flags set but - * this means to create compressed/encrypted files, not that the - * attribute is compressed/encrypted. Note we need to check for - * AT_INDEX_ALLOCATION since this is the type of both directory and - * index inodes. - */ - if (ni->type != AT_INDEX_ALLOCATION) { - /* If attribute is encrypted, deny access, just like NT4. */ - if (NInoEncrypted(ni)) { - BUG_ON(ni->type != AT_DATA); - err = -EACCES; - goto err_out; - } - /* Compressed data streams are handled in compress.c. */ - if (NInoNonResident(ni) && NInoCompressed(ni)) { - BUG_ON(ni->type != AT_DATA); - BUG_ON(ni->name_len); - return ntfs_read_compressed_block(page); - } - } - /* NInoNonResident() == NInoIndexAllocPresent() */ - if (NInoNonResident(ni)) { - /* Normal, non-resident data stream. */ - return ntfs_read_block(folio); - } - /* - * Attribute is resident, implying it is not compressed or encrypted. - * This also means the attribute is smaller than an mft record and - * hence smaller than a page, so can simply zero out any pages with - * index above 0. Note the attribute can actually be marked compressed - * but if it is resident the actual data is not compressed so we are - * ok to ignore the compressed flag here. - */ - if (unlikely(page->index > 0)) { - zero_user(page, 0, PAGE_SIZE); - goto done; - } - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* Map, pin, and lock the mft record. 
*/ - mrec = map_mft_record(base_ni); - if (IS_ERR(mrec)) { - err = PTR_ERR(mrec); - goto err_out; - } - /* - * If a parallel write made the attribute non-resident, drop the mft - * record and retry the read_folio. - */ - if (unlikely(NInoNonResident(ni))) { - unmap_mft_record(base_ni); - goto retry_readpage; - } - ctx = ntfs_attr_get_search_ctx(base_ni, mrec); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto unm_err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) - goto put_unm_err_out; - attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); - read_lock_irqsave(&ni->size_lock, flags); - if (unlikely(attr_len > ni->initialized_size)) - attr_len = ni->initialized_size; - i_size = i_size_read(vi); - read_unlock_irqrestore(&ni->size_lock, flags); - if (unlikely(attr_len > i_size)) { - /* Race with shrinking truncate. */ - attr_len = i_size; - } - addr = kmap_atomic(page); - /* Copy the data to the page. */ - memcpy(addr, (u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset), - attr_len); - /* Zero the remainder of the page. */ - memset(addr + attr_len, 0, PAGE_SIZE - attr_len); - flush_dcache_page(page); - kunmap_atomic(addr); -put_unm_err_out: - ntfs_attr_put_search_ctx(ctx); -unm_err_out: - unmap_mft_record(base_ni); -done: - SetPageUptodate(page); -err_out: - unlock_page(page); - return err; -} - -#ifdef NTFS_RW - -/** - * ntfs_write_block - write a @folio to the backing store - * @folio: page cache folio to write out - * @wbc: writeback control structure - * - * This function is for writing folios belonging to non-resident, non-mst - * protected attributes to their backing store. - * - * For a folio with buffers, map and write the dirty buffers asynchronously - * under folio writeback. For a folio without buffers, create buffers for the - * folio, then proceed as above. - * - * If a folio doesn't have buffers the folio dirty state is definitive. 
If - * a folio does have buffers, the folio dirty state is just a hint, - * and the buffer dirty state is definitive. (A hint which has rules: - * dirty buffers against a clean folio is illegal. Other combinations are - * legal and need to be handled. In particular a dirty folio containing - * clean buffers for example.) - * - * Return 0 on success and -errno on error. - * - * Based on ntfs_read_block() and __block_write_full_folio(). - */ -static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc) -{ - VCN vcn; - LCN lcn; - s64 initialized_size; - loff_t i_size; - sector_t block, dblock, iblock; - struct inode *vi; - ntfs_inode *ni; - ntfs_volume *vol; - runlist_element *rl; - struct buffer_head *bh, *head; - unsigned long flags; - unsigned int blocksize, vcn_ofs; - int err; - bool need_end_writeback; - unsigned char blocksize_bits; - - vi = folio->mapping->host; - ni = NTFS_I(vi); - vol = ni->vol; - - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " - "0x%lx.", ni->mft_no, ni->type, folio->index); - - BUG_ON(!NInoNonResident(ni)); - BUG_ON(NInoMstProtected(ni)); - blocksize = vol->sb->s_blocksize; - blocksize_bits = vol->sb->s_blocksize_bits; - head = folio_buffers(folio); - if (!head) { - BUG_ON(!folio_test_uptodate(folio)); - head = create_empty_buffers(folio, blocksize, - (1 << BH_Uptodate) | (1 << BH_Dirty)); - } - bh = head; - - /* NOTE: Different naming scheme to ntfs_read_block()! */ - - /* The first block in the folio. */ - block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); - - read_lock_irqsave(&ni->size_lock, flags); - i_size = i_size_read(vi); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - - /* The first out of bounds block for the data size. */ - dblock = (i_size + blocksize - 1) >> blocksize_bits; - - /* The last (fully or partially) initialized block. */ - iblock = initialized_size >> blocksize_bits; - - /* - * Be very careful. 
We have no exclusion from block_dirty_folio - * here, and the (potentially unmapped) buffers may become dirty at - * any time. If a buffer becomes dirty here after we've inspected it - * then we just miss that fact, and the folio stays dirty. - * - * Buffers outside i_size may be dirtied by block_dirty_folio; - * handle that here by just cleaning them. - */ - - /* - * Loop through all the buffers in the folio, mapping all the dirty - * buffers to disk addresses and handling any aliases from the - * underlying block device's mapping. - */ - rl = NULL; - err = 0; - do { - bool is_retry = false; - - if (unlikely(block >= dblock)) { - /* - * Mapped buffers outside i_size will occur, because - * this folio can be outside i_size when there is a - * truncate in progress. The contents of such buffers - * were zeroed by ntfs_writepage(). - * - * FIXME: What about the small race window where - * ntfs_writepage() has not done any clearing because - * the folio was within i_size but before we get here, - * vmtruncate() modifies i_size? - */ - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - continue; - } - - /* Clean buffers are not written out, so no need to map them. */ - if (!buffer_dirty(bh)) - continue; - - /* Make sure we have enough initialized size. */ - if (unlikely((block >= iblock) && - (initialized_size < i_size))) { - /* - * If this folio is fully outside initialized - * size, zero out all folios between the current - * initialized size and the current folio. Just - * use ntfs_read_folio() to do the zeroing - * transparently. - */ - if (block > iblock) { - // TODO: - // For each folio do: - // - read_cache_folio() - // Again for each folio do: - // - wait_on_folio_locked() - // - Check (folio_test_uptodate(folio) && - // !folio_test_error(folio)) - // Update initialized size in the attribute and - // in the inode. - // Again, for each folio do: - // block_dirty_folio(); - // folio_put() - // We don't need to wait on the writes. - // Update iblock. 
- } - /* - * The current folio straddles initialized size. Zero - * all non-uptodate buffers and set them uptodate (and - * dirty?). Note, there aren't any non-uptodate buffers - * if the folio is uptodate. - * FIXME: For an uptodate folio, the buffers may need to - * be written out because they were not initialized on - * disk before. - */ - if (!folio_test_uptodate(folio)) { - // TODO: - // Zero any non-uptodate buffers up to i_size. - // Set them uptodate and dirty. - } - // TODO: - // Update initialized size in the attribute and in the - // inode (up to i_size). - // Update iblock. - // FIXME: This is inefficient. Try to batch the two - // size changes to happen in one go. - ntfs_error(vol->sb, "Writing beyond initialized size " - "is not supported yet. Sorry."); - err = -EOPNOTSUPP; - break; - // Do NOT set_buffer_new() BUT DO clear buffer range - // outside write request range. - // set_buffer_uptodate() on complete buffers as well as - // set_buffer_dirty(). - } - - /* No need to map buffers that are already mapped. */ - if (buffer_mapped(bh)) - continue; - - /* Unmapped, dirty buffer. Need to map it. */ - bh->b_bdev = vol->sb->s_bdev; - - /* Convert block into corresponding vcn and offset. */ - vcn = (VCN)block << blocksize_bits; - vcn_ofs = vcn & vol->cluster_size_mask; - vcn >>= vol->cluster_size_bits; - if (!rl) { -lock_retry_remap: - down_read(&ni->runlist.lock); - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - } else - lcn = LCN_RL_NOT_MAPPED; - /* Successful remap. */ - if (lcn >= 0) { - /* Setup buffer head to point to correct block. */ - bh->b_blocknr = ((lcn << vol->cluster_size_bits) + - vcn_ofs) >> blocksize_bits; - set_buffer_mapped(bh); - continue; - } - /* It is a hole, need to instantiate it. */ - if (lcn == LCN_HOLE) { - u8 *kaddr; - unsigned long *bpos, *bend; - - /* Check if the buffer is zero. 
*/ - kaddr = kmap_local_folio(folio, bh_offset(bh)); - bpos = (unsigned long *)kaddr; - bend = (unsigned long *)(kaddr + blocksize); - do { - if (unlikely(*bpos)) - break; - } while (likely(++bpos < bend)); - kunmap_local(kaddr); - if (bpos == bend) { - /* - * Buffer is zero and sparse, no need to write - * it. - */ - bh->b_blocknr = -1; - clear_buffer_dirty(bh); - continue; - } - // TODO: Instantiate the hole. - // clear_buffer_new(bh); - // clean_bdev_bh_alias(bh); - ntfs_error(vol->sb, "Writing into sparse regions is " - "not supported yet. Sorry."); - err = -EOPNOTSUPP; - break; - } - /* If first try and runlist unmapped, map and retry. */ - if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { - is_retry = true; - /* - * Attempt to map runlist, dropping lock for - * the duration. - */ - up_read(&ni->runlist.lock); - err = ntfs_map_runlist(ni, vcn); - if (likely(!err)) - goto lock_retry_remap; - rl = NULL; - } else if (!rl) - up_read(&ni->runlist.lock); - /* - * If buffer is outside the runlist, truncate has cut it out - * of the runlist. Just clean and clear the buffer and set it - * uptodate so it can get discarded by the VM. - */ - if (err == -ENOENT || lcn == LCN_ENOENT) { - bh->b_blocknr = -1; - clear_buffer_dirty(bh); - folio_zero_range(folio, bh_offset(bh), blocksize); - set_buffer_uptodate(bh); - err = 0; - continue; - } - /* Failed to map the buffer, even after retrying. */ - if (!err) - err = -EIO; - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " - "attribute type 0x%x, vcn 0x%llx, offset 0x%x " - "because its location on disk could not be " - "determined%s (error code %i).", ni->mft_no, - ni->type, (unsigned long long)vcn, - vcn_ofs, is_retry ? " even after " - "retrying" : "", err); - break; - } while (block++, (bh = bh->b_this_page) != head); - - /* Release the lock if we took it. */ - if (rl) - up_read(&ni->runlist.lock); - - /* For the error case, need to reset bh to the beginning. 
*/ - bh = head; - - /* Just an optimization, so ->read_folio() is not called later. */ - if (unlikely(!folio_test_uptodate(folio))) { - int uptodate = 1; - do { - if (!buffer_uptodate(bh)) { - uptodate = 0; - bh = head; - break; - } - } while ((bh = bh->b_this_page) != head); - if (uptodate) - folio_mark_uptodate(folio); - } - - /* Setup all mapped, dirty buffers for async write i/o. */ - do { - if (buffer_mapped(bh) && buffer_dirty(bh)) { - lock_buffer(bh); - if (test_clear_buffer_dirty(bh)) { - BUG_ON(!buffer_uptodate(bh)); - mark_buffer_async_write(bh); - } else - unlock_buffer(bh); - } else if (unlikely(err)) { - /* - * For the error case. The buffer may have been set - * dirty during attachment to a dirty folio. - */ - if (err != -ENOMEM) - clear_buffer_dirty(bh); - } - } while ((bh = bh->b_this_page) != head); - - if (unlikely(err)) { - // TODO: Remove the -EOPNOTSUPP check later on... - if (unlikely(err == -EOPNOTSUPP)) - err = 0; - else if (err == -ENOMEM) { - ntfs_warning(vol->sb, "Error allocating memory. " - "Redirtying folio so we try again " - "later."); - /* - * Put the folio back on mapping->dirty_pages, but - * leave its buffer's dirty state as-is. - */ - folio_redirty_for_writepage(wbc, folio); - err = 0; - } else - folio_set_error(folio); - } - - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); /* Keeps try_to_free_buffers() away. */ - - /* Submit the prepared buffers for i/o. */ - need_end_writeback = true; - do { - struct buffer_head *next = bh->b_this_page; - if (buffer_async_write(bh)) { - submit_bh(REQ_OP_WRITE, bh); - need_end_writeback = false; - } - bh = next; - } while (bh != head); - folio_unlock(folio); - - /* If no i/o was started, need to end writeback here. 
*/ - if (unlikely(need_end_writeback)) - folio_end_writeback(folio); - - ntfs_debug("Done."); - return err; -} - -/** - * ntfs_write_mst_block - write a @page to the backing store - * @page: page cache page to write out - * @wbc: writeback control structure - * - * This function is for writing pages belonging to non-resident, mst protected - * attributes to their backing store. The only supported attributes are index - * allocation and $MFT/$DATA. Both directory inodes and index inodes are - * supported for the index allocation case. - * - * The page must remain locked for the duration of the write because we apply - * the mst fixups, write, and then undo the fixups, so if we were to unlock the - * page before undoing the fixups, any other user of the page will see the - * page contents as corrupt. - * - * We clear the page uptodate flag for the duration of the function to ensure - * exclusion for the $MFT/$DATA case against someone mapping an mft record we - * are about to apply the mst fixups to. - * - * Return 0 on success and -errno on error. - * - * Based on ntfs_write_block(), ntfs_mft_writepage(), and - * write_mft_record_nolock(). 
- */ -static int ntfs_write_mst_block(struct page *page, - struct writeback_control *wbc) -{ - sector_t block, dblock, rec_block; - struct inode *vi = page->mapping->host; - ntfs_inode *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - u8 *kaddr; - unsigned int rec_size = ni->itype.index.block_size; - ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE]; - struct buffer_head *bh, *head, *tbh, *rec_start_bh; - struct buffer_head *bhs[MAX_BUF_PER_PAGE]; - runlist_element *rl; - int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2; - unsigned bh_size, rec_size_bits; - bool sync, is_mft, page_is_dirty, rec_is_dirty; - unsigned char bh_size_bits; - - if (WARN_ON(rec_size < NTFS_BLOCK_SIZE)) - return -EINVAL; - - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " - "0x%lx.", vi->i_ino, ni->type, page->index); - BUG_ON(!NInoNonResident(ni)); - BUG_ON(!NInoMstProtected(ni)); - is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino); - /* - * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page - * in its page cache were to be marked dirty. However this should - * never happen with the current driver and considering we do not - * handle this case here we do want to BUG(), at least for now. - */ - BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) || - (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION))); - bh_size = vol->sb->s_blocksize; - bh_size_bits = vol->sb->s_blocksize_bits; - max_bhs = PAGE_SIZE / bh_size; - BUG_ON(!max_bhs); - BUG_ON(max_bhs > MAX_BUF_PER_PAGE); - - /* Were we called for sync purposes? */ - sync = (wbc->sync_mode == WB_SYNC_ALL); - - /* Make sure we have mapped buffers. */ - bh = head = page_buffers(page); - BUG_ON(!bh); - - rec_size_bits = ni->itype.index.block_size_bits; - BUG_ON(!(PAGE_SIZE >> rec_size_bits)); - bhs_per_rec = rec_size >> bh_size_bits; - BUG_ON(!bhs_per_rec); - - /* The first block in the page. 
*/ - rec_block = block = (sector_t)page->index << - (PAGE_SHIFT - bh_size_bits); - - /* The first out of bounds block for the data size. */ - dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits; - - rl = NULL; - err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0; - page_is_dirty = rec_is_dirty = false; - rec_start_bh = NULL; - do { - bool is_retry = false; - - if (likely(block < rec_block)) { - if (unlikely(block >= dblock)) { - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - continue; - } - /* - * This block is not the first one in the record. We - * ignore the buffer's dirty state because we could - * have raced with a parallel mark_ntfs_record_dirty(). - */ - if (!rec_is_dirty) - continue; - if (unlikely(err2)) { - if (err2 != -ENOMEM) - clear_buffer_dirty(bh); - continue; - } - } else /* if (block == rec_block) */ { - BUG_ON(block > rec_block); - /* This block is the first one in the record. */ - rec_block += bhs_per_rec; - err2 = 0; - if (unlikely(block >= dblock)) { - clear_buffer_dirty(bh); - continue; - } - if (!buffer_dirty(bh)) { - /* Clean records are not written out. */ - rec_is_dirty = false; - continue; - } - rec_is_dirty = true; - rec_start_bh = bh; - } - /* Need to map the buffer if it is not mapped already. */ - if (unlikely(!buffer_mapped(bh))) { - VCN vcn; - LCN lcn; - unsigned int vcn_ofs; - - bh->b_bdev = vol->sb->s_bdev; - /* Obtain the vcn and offset of the current block. */ - vcn = (VCN)block << bh_size_bits; - vcn_ofs = vcn & vol->cluster_size_mask; - vcn >>= vol->cluster_size_bits; - if (!rl) { -lock_retry_remap: - down_read(&ni->runlist.lock); - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - } else - lcn = LCN_RL_NOT_MAPPED; - /* Successful remap. */ - if (likely(lcn >= 0)) { - /* Setup buffer head to correct block. 
*/ - bh->b_blocknr = ((lcn << - vol->cluster_size_bits) + - vcn_ofs) >> bh_size_bits; - set_buffer_mapped(bh); - } else { - /* - * Remap failed. Retry to map the runlist once - * unless we are working on $MFT which always - * has the whole of its runlist in memory. - */ - if (!is_mft && !is_retry && - lcn == LCN_RL_NOT_MAPPED) { - is_retry = true; - /* - * Attempt to map runlist, dropping - * lock for the duration. - */ - up_read(&ni->runlist.lock); - err2 = ntfs_map_runlist(ni, vcn); - if (likely(!err2)) - goto lock_retry_remap; - if (err2 == -ENOMEM) - page_is_dirty = true; - lcn = err2; - } else { - err2 = -EIO; - if (!rl) - up_read(&ni->runlist.lock); - } - /* Hard error. Abort writing this record. */ - if (!err || err == -ENOMEM) - err = err2; - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Cannot write ntfs record " - "0x%llx (inode 0x%lx, " - "attribute type 0x%x) because " - "its location on disk could " - "not be determined (error " - "code %lli).", - (long long)block << - bh_size_bits >> - vol->mft_record_size_bits, - ni->mft_no, ni->type, - (long long)lcn); - /* - * If this is not the first buffer, remove the - * buffers in this record from the list of - * buffers to write and clear their dirty bit - * if not error -ENOMEM. - */ - if (rec_start_bh != bh) { - while (bhs[--nr_bhs] != rec_start_bh) - ; - if (err2 != -ENOMEM) { - do { - clear_buffer_dirty( - rec_start_bh); - } while ((rec_start_bh = - rec_start_bh-> - b_this_page) != - bh); - } - } - continue; - } - } - BUG_ON(!buffer_uptodate(bh)); - BUG_ON(nr_bhs >= max_bhs); - bhs[nr_bhs++] = bh; - } while (block++, (bh = bh->b_this_page) != head); - if (unlikely(rl)) - up_read(&ni->runlist.lock); - /* If there were no dirty buffers, we are done. */ - if (!nr_bhs) - goto done; - /* Map the page so we can access its contents. */ - kaddr = kmap(page); - /* Clear the page uptodate flag whilst the mst fixups are applied. 
*/ - BUG_ON(!PageUptodate(page)); - ClearPageUptodate(page); - for (i = 0; i < nr_bhs; i++) { - unsigned int ofs; - - /* Skip buffers which are not at the beginning of records. */ - if (i % bhs_per_rec) - continue; - tbh = bhs[i]; - ofs = bh_offset(tbh); - if (is_mft) { - ntfs_inode *tni; - unsigned long mft_no; - - /* Get the mft record number. */ - mft_no = (((s64)page->index << PAGE_SHIFT) + ofs) - >> rec_size_bits; - /* Check whether to write this mft record. */ - tni = NULL; - if (!ntfs_may_write_mft_record(vol, mft_no, - (MFT_RECORD*)(kaddr + ofs), &tni)) { - /* - * The record should not be written. This - * means we need to redirty the page before - * returning. - */ - page_is_dirty = true; - /* - * Remove the buffers in this mft record from - * the list of buffers to write. - */ - do { - bhs[i] = NULL; - } while (++i % bhs_per_rec); - continue; - } - /* - * The record should be written. If a locked ntfs - * inode was returned, add it to the array of locked - * ntfs inodes. - */ - if (tni) - locked_nis[nr_locked_nis++] = tni; - } - /* Apply the mst protection fixups. */ - err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs), - rec_size); - if (unlikely(err2)) { - if (!err || err == -ENOMEM) - err = -EIO; - ntfs_error(vol->sb, "Failed to apply mst fixups " - "(inode 0x%lx, attribute type 0x%x, " - "page index 0x%lx, page offset 0x%x)!" - " Unmount and run chkdsk.", vi->i_ino, - ni->type, page->index, ofs); - /* - * Mark all the buffers in this record clean as we do - * not want to write corrupt data to disk. - */ - do { - clear_buffer_dirty(bhs[i]); - bhs[i] = NULL; - } while (++i % bhs_per_rec); - continue; - } - nr_recs++; - } - /* If no records are to be written out, we are done. */ - if (!nr_recs) - goto unm_done; - flush_dcache_page(page); - /* Lock buffers and start synchronous write i/o on them. 
*/ - for (i = 0; i < nr_bhs; i++) { - tbh = bhs[i]; - if (!tbh) - continue; - if (!trylock_buffer(tbh)) - BUG(); - /* The buffer dirty state is now irrelevant, just clean it. */ - clear_buffer_dirty(tbh); - BUG_ON(!buffer_uptodate(tbh)); - BUG_ON(!buffer_mapped(tbh)); - get_bh(tbh); - tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, tbh); - } - /* Synchronize the mft mirror now if not @sync. */ - if (is_mft && !sync) - goto do_mirror; -do_wait: - /* Wait on i/o completion of buffers. */ - for (i = 0; i < nr_bhs; i++) { - tbh = bhs[i]; - if (!tbh) - continue; - wait_on_buffer(tbh); - if (unlikely(!buffer_uptodate(tbh))) { - ntfs_error(vol->sb, "I/O error while writing ntfs " - "record buffer (inode 0x%lx, " - "attribute type 0x%x, page index " - "0x%lx, page offset 0x%lx)! Unmount " - "and run chkdsk.", vi->i_ino, ni->type, - page->index, bh_offset(tbh)); - if (!err || err == -ENOMEM) - err = -EIO; - /* - * Set the buffer uptodate so the page and buffer - * states do not become out of sync. - */ - set_buffer_uptodate(tbh); - } - } - /* If @sync, now synchronize the mft mirror. */ - if (is_mft && sync) { -do_mirror: - for (i = 0; i < nr_bhs; i++) { - unsigned long mft_no; - unsigned int ofs; - - /* - * Skip buffers which are not at the beginning of - * records. - */ - if (i % bhs_per_rec) - continue; - tbh = bhs[i]; - /* Skip removed buffers (and hence records). */ - if (!tbh) - continue; - ofs = bh_offset(tbh); - /* Get the mft record number. */ - mft_no = (((s64)page->index << PAGE_SHIFT) + ofs) - >> rec_size_bits; - if (mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, mft_no, - (MFT_RECORD*)(kaddr + ofs), - sync); - } - if (!sync) - goto do_wait; - } - /* Remove the mst protection fixups again. */ - for (i = 0; i < nr_bhs; i++) { - if (!(i % bhs_per_rec)) { - tbh = bhs[i]; - if (!tbh) - continue; - post_write_mst_fixup((NTFS_RECORD*)(kaddr + - bh_offset(tbh))); - } - } - flush_dcache_page(page); -unm_done: - /* Unlock any locked inodes. 
*/ - while (nr_locked_nis-- > 0) { - ntfs_inode *tni, *base_tni; - - tni = locked_nis[nr_locked_nis]; - /* Get the base inode. */ - mutex_lock(&tni->extent_lock); - if (tni->nr_extents >= 0) - base_tni = tni; - else { - base_tni = tni->ext.base_ntfs_ino; - BUG_ON(!base_tni); - } - mutex_unlock(&tni->extent_lock); - ntfs_debug("Unlocking %s inode 0x%lx.", - tni == base_tni ? "base" : "extent", - tni->mft_no); - mutex_unlock(&tni->mrec_lock); - atomic_dec(&tni->count); - iput(VFS_I(base_tni)); - } - SetPageUptodate(page); - kunmap(page); -done: - if (unlikely(err && err != -ENOMEM)) { - /* - * Set page error if there is only one ntfs record in the page. - * Otherwise we would loose per-record granularity. - */ - if (ni->itype.index.block_size == PAGE_SIZE) - SetPageError(page); - NVolSetErrors(vol); - } - if (page_is_dirty) { - ntfs_debug("Page still contains one or more dirty ntfs " - "records. Redirtying the page starting at " - "record 0x%lx.", page->index << - (PAGE_SHIFT - rec_size_bits)); - redirty_page_for_writepage(wbc, page); - unlock_page(page); - } else { - /* - * Keep the VM happy. This must be done otherwise the - * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though - * the page is clean. - */ - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); - end_page_writeback(page); - } - if (likely(!err)) - ntfs_debug("Done."); - return err; -} - -/** - * ntfs_writepage - write a @page to the backing store - * @page: page cache page to write out - * @wbc: writeback control structure - * - * This is called from the VM when it wants to have a dirty ntfs page cache - * page cleaned. The VM has already locked the page and marked it clean. - * - * For non-resident attributes, ntfs_writepage() writes the @page by calling - * the ntfs version of the generic block_write_full_folio() function, - * ntfs_write_block(), which in turn if necessary creates and writes the - * buffers associated with the page asynchronously. 
- * - * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying - * the data to the mft record (which at this stage is most likely in memory). - * The mft record is then marked dirty and written out asynchronously via the - * vfs inode dirty code path for the inode the mft record belongs to or via the - * vm page dirty code path for the page the mft record is in. - * - * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio(). - * - * Return 0 on success and -errno on error. - */ -static int ntfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - loff_t i_size; - struct inode *vi = folio->mapping->host; - ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); - char *addr; - ntfs_attr_search_ctx *ctx = NULL; - MFT_RECORD *m = NULL; - u32 attr_len; - int err; - -retry_writepage: - BUG_ON(!folio_test_locked(folio)); - i_size = i_size_read(vi); - /* Is the folio fully outside i_size? (truncate in progress) */ - if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >> - PAGE_SHIFT)) { - /* - * The folio may have dirty, unmapped buffers. Make them - * freeable here, so the page does not leak. - */ - block_invalidate_folio(folio, 0, folio_size(folio)); - folio_unlock(folio); - ntfs_debug("Write outside i_size - truncated?"); - return 0; - } - /* - * Only $DATA attributes can be encrypted and only unnamed $DATA - * attributes can be compressed. Index root can have the flags set but - * this means to create compressed/encrypted files, not that the - * attribute is compressed/encrypted. Note we need to check for - * AT_INDEX_ALLOCATION since this is the type of both directory and - * index inodes. - */ - if (ni->type != AT_INDEX_ALLOCATION) { - /* If file is encrypted, deny access, just like NT4. 
*/ - if (NInoEncrypted(ni)) { - folio_unlock(folio); - BUG_ON(ni->type != AT_DATA); - ntfs_debug("Denying write access to encrypted file."); - return -EACCES; - } - /* Compressed data streams are handled in compress.c. */ - if (NInoNonResident(ni) && NInoCompressed(ni)) { - BUG_ON(ni->type != AT_DATA); - BUG_ON(ni->name_len); - // TODO: Implement and replace this with - // return ntfs_write_compressed_block(page); - folio_unlock(folio); - ntfs_error(vi->i_sb, "Writing to compressed files is " - "not supported yet. Sorry."); - return -EOPNOTSUPP; - } - // TODO: Implement and remove this check. - if (NInoNonResident(ni) && NInoSparse(ni)) { - folio_unlock(folio); - ntfs_error(vi->i_sb, "Writing to sparse files is not " - "supported yet. Sorry."); - return -EOPNOTSUPP; - } - } - /* NInoNonResident() == NInoIndexAllocPresent() */ - if (NInoNonResident(ni)) { - /* We have to zero every time due to mmap-at-end-of-file. */ - if (folio->index >= (i_size >> PAGE_SHIFT)) { - /* The folio straddles i_size. */ - unsigned int ofs = i_size & (folio_size(folio) - 1); - folio_zero_segment(folio, ofs, folio_size(folio)); - } - /* Handle mst protected attributes. */ - if (NInoMstProtected(ni)) - return ntfs_write_mst_block(page, wbc); - /* Normal, non-resident data stream. */ - return ntfs_write_block(folio, wbc); - } - /* - * Attribute is resident, implying it is not compressed, encrypted, or - * mst protected. This also means the attribute is smaller than an mft - * record and hence smaller than a folio, so can simply return error on - * any folios with index above 0. Note the attribute can actually be - * marked compressed but if it is resident the actual data is not - * compressed so we are ok to ignore the compressed flag here. - */ - BUG_ON(folio_buffers(folio)); - BUG_ON(!folio_test_uptodate(folio)); - if (unlikely(folio->index > 0)) { - ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0. 
" - "Aborting write.", folio->index); - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - folio_unlock(folio); - folio_end_writeback(folio); - return -EIO; - } - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* Map, pin, and lock the mft record. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - /* - * If a parallel write made the attribute non-resident, drop the mft - * record and retry the writepage. - */ - if (unlikely(NInoNonResident(ni))) { - unmap_mft_record(base_ni); - goto retry_writepage; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) - goto err_out; - /* - * Keep the VM happy. This must be done otherwise - * PAGECACHE_TAG_DIRTY remains set even though the folio is clean. - */ - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - folio_unlock(folio); - attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); - i_size = i_size_read(vi); - if (unlikely(attr_len > i_size)) { - /* Race with shrinking truncate or a failed truncate. */ - attr_len = i_size; - /* - * If the truncate failed, fix it up now. If a concurrent - * truncate, we do its job, so it does not have to do anything. - */ - err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr, - attr_len); - /* Shrinking cannot fail. */ - BUG_ON(err); - } - addr = kmap_local_folio(folio, 0); - /* Copy the data from the folio to the mft record. */ - memcpy((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset), - addr, attr_len); - /* Zero out of bounds area in the page cache folio. 
*/ - memset(addr + attr_len, 0, folio_size(folio) - attr_len); - kunmap_local(addr); - flush_dcache_folio(folio); - flush_dcache_mft_record_page(ctx->ntfs_ino); - /* We are done with the folio. */ - folio_end_writeback(folio); - /* Finally, mark the mft record dirty, so it gets written back. */ - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - return 0; -err_out: - if (err == -ENOMEM) { - ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying " - "page so we try again later."); - /* - * Put the folio back on mapping->dirty_pages, but leave its - * buffers' dirty state as-is. - */ - folio_redirty_for_writepage(wbc, folio); - err = 0; - } else { - ntfs_error(vi->i_sb, "Resident attribute write failed with " - "error %i.", err); - folio_set_error(folio); - NVolSetErrors(ni->vol); - } - folio_unlock(folio); - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - return err; -} - -#endif /* NTFS_RW */ - -/** - * ntfs_bmap - map logical file block to physical device block - * @mapping: address space mapping to which the block to be mapped belongs - * @block: logical block to map to its physical device block - * - * For regular, non-resident files (i.e. not compressed and not encrypted), map - * the logical @block belonging to the file described by the address space - * mapping @mapping to its physical device block. - * - * The size of the block is equal to the @s_blocksize field of the super block - * of the mounted file system which is guaranteed to be smaller than or equal - * to the cluster size thus the block is guaranteed to fit entirely inside the - * cluster which means we do not need to care how many contiguous bytes are - * available after the beginning of the block. - * - * Return the physical device block if the mapping succeeded or 0 if the block - * is sparse or there was an error. 
- * - * Note: This is a problem if someone tries to run bmap() on $Boot system file - * as that really is in block zero but there is nothing we can do. bmap() is - * just broken in that respect (just like it cannot distinguish sparse from - * not available or error). - */ -static sector_t ntfs_bmap(struct address_space *mapping, sector_t block) -{ - s64 ofs, size; - loff_t i_size; - LCN lcn; - unsigned long blocksize, flags; - ntfs_inode *ni = NTFS_I(mapping->host); - ntfs_volume *vol = ni->vol; - unsigned delta; - unsigned char blocksize_bits, cluster_size_shift; - - ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.", - ni->mft_no, (unsigned long long)block); - if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) { - ntfs_error(vol->sb, "BMAP does not make sense for %s " - "attributes, returning 0.", - (ni->type != AT_DATA) ? "non-data" : - (!NInoNonResident(ni) ? "resident" : - "encrypted")); - return 0; - } - /* None of these can happen. */ - BUG_ON(NInoCompressed(ni)); - BUG_ON(NInoMstProtected(ni)); - blocksize = vol->sb->s_blocksize; - blocksize_bits = vol->sb->s_blocksize_bits; - ofs = (s64)block << blocksize_bits; - read_lock_irqsave(&ni->size_lock, flags); - size = ni->initialized_size; - i_size = i_size_read(VFS_I(ni)); - read_unlock_irqrestore(&ni->size_lock, flags); - /* - * If the offset is outside the initialized size or the block straddles - * the initialized size then pretend it is a hole unless the - * initialized size equals the file size. - */ - if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size))) - goto hole; - cluster_size_shift = vol->cluster_size_bits; - down_read(&ni->runlist.lock); - lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false); - up_read(&ni->runlist.lock); - if (unlikely(lcn < LCN_HOLE)) { - /* - * Step down to an integer to avoid gcc doing a long long - * comparision in the switch when we know @lcn is between - * LCN_HOLE and LCN_EIO (i.e. -1 to -5). 
- * - * Otherwise older gcc (at least on some architectures) will - * try to use __cmpdi2() which is of course not available in - * the kernel. - */ - switch ((int)lcn) { - case LCN_ENOENT: - /* - * If the offset is out of bounds then pretend it is a - * hole. - */ - goto hole; - case LCN_ENOMEM: - ntfs_error(vol->sb, "Not enough memory to complete " - "mapping for inode 0x%lx. " - "Returning 0.", ni->mft_no); - break; - default: - ntfs_error(vol->sb, "Failed to complete mapping for " - "inode 0x%lx. Run chkdsk. " - "Returning 0.", ni->mft_no); - break; - } - return 0; - } - if (lcn < 0) { - /* It is a hole. */ -hole: - ntfs_debug("Done (returning hole)."); - return 0; - } - /* - * The block is really allocated and fullfils all our criteria. - * Convert the cluster to units of block size and return the result. - */ - delta = ofs & vol->cluster_size_mask; - if (unlikely(sizeof(block) < sizeof(lcn))) { - block = lcn = ((lcn << cluster_size_shift) + delta) >> - blocksize_bits; - /* If the block number was truncated return 0. */ - if (unlikely(block != lcn)) { - ntfs_error(vol->sb, "Physical block 0x%llx is too " - "large to be returned, returning 0.", - (long long)lcn); - return 0; - } - } else - block = ((lcn << cluster_size_shift) + delta) >> - blocksize_bits; - ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn); - return block; -} - -/* - * ntfs_normal_aops - address space operations for normal inodes and attributes - * - * Note these are not used for compressed or mst protected inodes and - * attributes. 
- */ -const struct address_space_operations ntfs_normal_aops = { - .read_folio = ntfs_read_folio, -#ifdef NTFS_RW - .writepage = ntfs_writepage, - .dirty_folio = block_dirty_folio, -#endif /* NTFS_RW */ - .bmap = ntfs_bmap, - .migrate_folio = buffer_migrate_folio, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_folio = generic_error_remove_folio, -}; - -/* - * ntfs_compressed_aops - address space operations for compressed inodes - */ -const struct address_space_operations ntfs_compressed_aops = { - .read_folio = ntfs_read_folio, -#ifdef NTFS_RW - .writepage = ntfs_writepage, - .dirty_folio = block_dirty_folio, -#endif /* NTFS_RW */ - .migrate_folio = buffer_migrate_folio, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_folio = generic_error_remove_folio, -}; - -/* - * ntfs_mst_aops - general address space operations for mst protecteed inodes - * and attributes - */ -const struct address_space_operations ntfs_mst_aops = { - .read_folio = ntfs_read_folio, /* Fill page with data. */ -#ifdef NTFS_RW - .writepage = ntfs_writepage, /* Write dirty page to disk. */ - .dirty_folio = filemap_dirty_folio, -#endif /* NTFS_RW */ - .migrate_folio = buffer_migrate_folio, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_folio = generic_error_remove_folio, -}; - -#ifdef NTFS_RW - -/** - * mark_ntfs_record_dirty - mark an ntfs record dirty - * @page: page containing the ntfs record to mark dirty - * @ofs: byte offset within @page at which the ntfs record begins - * - * Set the buffers and the page in which the ntfs record is located dirty. - * - * The latter also marks the vfs inode the ntfs record belongs to dirty - * (I_DIRTY_PAGES only). - * - * If the page does not have buffers, we create them and set them uptodate. - * The page may not be locked which is why we need to handle the buffers under - * the mapping->i_private_lock. 
Once the buffers are marked dirty we no longer - * need the lock since try_to_free_buffers() does not free dirty buffers. - */ -void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { - struct address_space *mapping = page->mapping; - ntfs_inode *ni = NTFS_I(mapping->host); - struct buffer_head *bh, *head, *buffers_to_free = NULL; - unsigned int end, bh_size, bh_ofs; - - BUG_ON(!PageUptodate(page)); - end = ofs + ni->itype.index.block_size; - bh_size = VFS_I(ni)->i_sb->s_blocksize; - spin_lock(&mapping->i_private_lock); - if (unlikely(!page_has_buffers(page))) { - spin_unlock(&mapping->i_private_lock); - bh = head = alloc_page_buffers(page, bh_size, true); - spin_lock(&mapping->i_private_lock); - if (likely(!page_has_buffers(page))) { - struct buffer_head *tail; - - do { - set_buffer_uptodate(bh); - tail = bh; - bh = bh->b_this_page; - } while (bh); - tail->b_this_page = head; - attach_page_private(page, head); - } else - buffers_to_free = bh; - } - bh = head = page_buffers(page); - BUG_ON(!bh); - do { - bh_ofs = bh_offset(bh); - if (bh_ofs + bh_size <= ofs) - continue; - if (unlikely(bh_ofs >= end)) - break; - set_buffer_dirty(bh); - } while ((bh = bh->b_this_page) != head); - spin_unlock(&mapping->i_private_lock); - filemap_dirty_folio(mapping, page_folio(page)); - if (unlikely(buffers_to_free)) { - do { - bh = buffers_to_free->b_this_page; - free_buffer_head(buffers_to_free); - buffers_to_free = bh; - } while (buffers_to_free); - } -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h deleted file mode 100644 index 8d0958a149cb..000000000000 --- a/fs/ntfs/aops.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * aops.h - Defines for NTFS kernel address space operations and page cache - * handling. Part of the Linux-NTFS project. 
- * - * Copyright (c) 2001-2004 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_AOPS_H -#define _LINUX_NTFS_AOPS_H - -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/fs.h> - -#include "inode.h" - -/** - * ntfs_unmap_page - release a page that was mapped using ntfs_map_page() - * @page: the page to release - * - * Unpin, unmap and release a page that was obtained from ntfs_map_page(). - */ -static inline void ntfs_unmap_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - -/** - * ntfs_map_page - map a page into accessible memory, reading it if necessary - * @mapping: address space for which to obtain the page - * @index: index into the page cache for @mapping of the page to map - * - * Read a page from the page cache of the address space @mapping at position - * @index, where @index is in units of PAGE_SIZE, and not in bytes. - * - * If the page is not in memory it is loaded from disk first using the - * read_folio method defined in the address space operations of @mapping - * and the page is added to the page cache of @mapping in the process. - * - * If the page belongs to an mst protected attribute and it is marked as such - * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no - * error checking is performed. This means the caller has to verify whether - * the ntfs record(s) contained in the page are valid or not using one of the - * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are - * expecting to see. (For details of the macros, see fs/ntfs/layout.h.) - * - * If the page is in high memory it is mapped into memory directly addressible - * by the kernel. - * - * Finally the page count is incremented, thus pinning the page into place. - * - * The above means that page_address(page) can be used on all pages obtained - * with ntfs_map_page() to get the kernel virtual address of the page. 
- * - * When finished with the page, the caller has to call ntfs_unmap_page() to - * unpin, unmap and release the page. - * - * Note this does not grant exclusive access. If such is desired, the caller - * must provide it independently of the ntfs_{un}map_page() calls by using - * a {rw_}semaphore or other means of serialization. A spin lock cannot be - * used as ntfs_map_page() can block. - * - * The unlocked and uptodate page is returned on success or an encoded error - * on failure. Caller has to test for error using the IS_ERR() macro on the - * return value. If that evaluates to 'true', the negative error code can be - * obtained using PTR_ERR() on the return value of ntfs_map_page(). - */ -static inline struct page *ntfs_map_page(struct address_space *mapping, - unsigned long index) -{ - struct page *page = read_mapping_page(mapping, index, NULL); - - if (!IS_ERR(page)) - kmap(page); - return page; -} - -#ifdef NTFS_RW - -extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_AOPS_H */ diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c deleted file mode 100644 index f79408f9127a..000000000000 --- a/fs/ntfs/attrib.c +++ /dev/null @@ -1,2624 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. 
- * Copyright (c) 2002 Richard Russon - */ - -#include <linux/buffer_head.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/swap.h> -#include <linux/writeback.h> - -#include "attrib.h" -#include "debug.h" -#include "layout.h" -#include "lcnalloc.h" -#include "malloc.h" -#include "mft.h" -#include "ntfs.h" -#include "types.h" - -/** - * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode - * @ni: ntfs inode for which to map (part of) a runlist - * @vcn: map runlist part containing this vcn - * @ctx: active attribute search context if present or NULL if not - * - * Map the part of a runlist containing the @vcn of the ntfs inode @ni. - * - * If @ctx is specified, it is an active search context of @ni and its base mft - * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped - * runlist fragments and allows their mapping. If you do not have the mft - * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock() - * will perform the necessary mapping and unmapping. - * - * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and - * restores it before returning. Thus, @ctx will be left pointing to the same - * attribute on return as on entry. However, the actual pointers in @ctx may - * point to different memory locations on return, so you must remember to reset - * any cached pointers from the @ctx, i.e. after the call to - * ntfs_map_runlist_nolock(), you will probably want to do: - * m = ctx->mrec; - * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. - * - * Return 0 on success and -errno on error. There is one special error code - * which is not an error as such. This is -ENOENT. It means that @vcn is out - * of bounds of the runlist. - * - * Note the runlist can be NULL after this function returns if @vcn is zero and - * the attribute has zero allocated size, i.e. 
there simply is no runlist. - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. you need to either call - * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. - * In that case PTR_ERR(@ctx->mrec) will give you the error code for - * why the mapping of the old inode failed. - * - * Locking: - The runlist described by @ni must be locked for writing on entry - * and is locked on return. Note the runlist will be modified. - * - If @ctx is NULL, the base mft record of @ni must not be mapped on - * entry and it will be left unmapped on return. - * - If @ctx is not NULL, the base mft record must be mapped on entry - * and it will be left mapped on return. - */ -int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) -{ - VCN end_vcn; - unsigned long flags; - ntfs_inode *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - runlist_element *rl; - struct page *put_this_page = NULL; - int err = 0; - bool ctx_is_temporary, ctx_needs_reset; - ntfs_attr_search_ctx old_ctx = { NULL, }; - - ntfs_debug("Mapping runlist part containing vcn 0x%llx.", - (unsigned long long)vcn); - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - if (!ctx) { - ctx_is_temporary = ctx_needs_reset = true; - m = map_mft_record(base_ni); - if (IS_ERR(m)) - return PTR_ERR(m); - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - } else { - VCN allocated_size_vcn; - - BUG_ON(IS_ERR(ctx->mrec)); - a = ctx->attr; - BUG_ON(!a->non_resident); - ctx_is_temporary = false; - end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); - read_lock_irqsave(&ni->size_lock, flags); - allocated_size_vcn = ni->allocated_size >> - ni->vol->cluster_size_bits; - read_unlock_irqrestore(&ni->size_lock, flags); - if (!a->data.non_resident.lowest_vcn && end_vcn <= 0) - end_vcn = 
allocated_size_vcn - 1; - /* - * If we already have the attribute extent containing @vcn in - * @ctx, no need to look it up again. We slightly cheat in - * that if vcn exceeds the allocated size, we will refuse to - * map the runlist below, so there is definitely no need to get - * the right attribute extent. - */ - if (vcn >= allocated_size_vcn || (a->type == ni->type && - a->name_length == ni->name_len && - !memcmp((u8*)a + le16_to_cpu(a->name_offset), - ni->name, ni->name_len) && - sle64_to_cpu(a->data.non_resident.lowest_vcn) - <= vcn && end_vcn >= vcn)) - ctx_needs_reset = false; - else { - /* Save the old search context. */ - old_ctx = *ctx; - /* - * If the currently mapped (extent) inode is not the - * base inode we will unmap it when we reinitialize the - * search context which means we need to get a - * reference to the page containing the mapped mft - * record so we do not accidentally drop changes to the - * mft record when it has not been marked dirty yet. - */ - if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino != - old_ctx.base_ntfs_ino) { - put_this_page = old_ctx.ntfs_ino->page; - get_page(put_this_page); - } - /* - * Reinitialize the search context so we can lookup the - * needed attribute extent. - */ - ntfs_attr_reinit_search_ctx(ctx); - ctx_needs_reset = true; - } - } - if (ctx_needs_reset) { - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, vcn, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - BUG_ON(!ctx->attr->non_resident); - } - a = ctx->attr; - /* - * Only decompress the mapping pairs if @vcn is inside it. Otherwise - * we get into problems when we try to map an out of bounds vcn because - * we then try to map the already mapped runlist fragment and - * ntfs_mapping_pairs_decompress() fails. 
- */ - end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1; - if (unlikely(vcn && vcn >= end_vcn)) { - err = -ENOENT; - goto err_out; - } - rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl); - if (IS_ERR(rl)) - err = PTR_ERR(rl); - else - ni->runlist.rl = rl; -err_out: - if (ctx_is_temporary) { - if (likely(ctx)) - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - } else if (ctx_needs_reset) { - /* - * If there is no attribute list, restoring the search context - * is accomplished simply by copying the saved context back over - * the caller supplied context. If there is an attribute list, - * things are more complicated as we need to deal with mapping - * of mft records and resulting potential changes in pointers. - */ - if (NInoAttrList(base_ni)) { - /* - * If the currently mapped (extent) inode is not the - * one we had before, we need to unmap it and map the - * old one. - */ - if (ctx->ntfs_ino != old_ctx.ntfs_ino) { - /* - * If the currently mapped inode is not the - * base inode, unmap it. - */ - if (ctx->base_ntfs_ino && ctx->ntfs_ino != - ctx->base_ntfs_ino) { - unmap_extent_mft_record(ctx->ntfs_ino); - ctx->mrec = ctx->base_mrec; - BUG_ON(!ctx->mrec); - } - /* - * If the old mapped inode is not the base - * inode, map it. - */ - if (old_ctx.base_ntfs_ino && - old_ctx.ntfs_ino != - old_ctx.base_ntfs_ino) { -retry_map: - ctx->mrec = map_mft_record( - old_ctx.ntfs_ino); - /* - * Something bad has happened. If out - * of memory retry till it succeeds. - * Any other errors are fatal and we - * return the error code in ctx->mrec. - * Let the caller deal with it... We - * just need to fudge things so the - * caller can reinit and/or put the - * search context safely. - */ - if (IS_ERR(ctx->mrec)) { - if (PTR_ERR(ctx->mrec) == - -ENOMEM) { - schedule(); - goto retry_map; - } else - old_ctx.ntfs_ino = - old_ctx. - base_ntfs_ino; - } - } - } - /* Update the changed pointers in the saved context. 
*/ - if (ctx->mrec != old_ctx.mrec) { - if (!IS_ERR(ctx->mrec)) - old_ctx.attr = (ATTR_RECORD*)( - (u8*)ctx->mrec + - ((u8*)old_ctx.attr - - (u8*)old_ctx.mrec)); - old_ctx.mrec = ctx->mrec; - } - } - /* Restore the search context to the saved one. */ - *ctx = old_ctx; - /* - * We drop the reference on the page we took earlier. In the - * case that IS_ERR(ctx->mrec) is true this means we might lose - * some changes to the mft record that had been made between - * the last time it was marked dirty/written out and now. This - * at this stage is not a problem as the mapping error is fatal - * enough that the mft record cannot be written out anyway and - * the caller is very likely to shutdown the whole inode - * immediately and mark the volume dirty for chkdsk to pick up - * the pieces anyway. - */ - if (put_this_page) - put_page(put_this_page); - } - return err; -} - -/** - * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode - * @ni: ntfs inode for which to map (part of) a runlist - * @vcn: map runlist part containing this vcn - * - * Map the part of a runlist containing the @vcn of the ntfs inode @ni. - * - * Return 0 on success and -errno on error. There is one special error code - * which is not an error as such. This is -ENOENT. It means that @vcn is out - * of bounds of the runlist. - * - * Locking: - The runlist must be unlocked on entry and is unlocked on return. - * - This function takes the runlist lock for writing and may modify - * the runlist. - */ -int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) -{ - int err = 0; - - down_write(&ni->runlist.lock); - /* Make sure someone else didn't do the work while we were sleeping. 
*/ - if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <= - LCN_RL_NOT_MAPPED)) - err = ntfs_map_runlist_nolock(ni, vcn, NULL); - up_write(&ni->runlist.lock); - return err; -} - -/** - * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode - * @ni: ntfs inode of the attribute whose runlist to search - * @vcn: vcn to convert - * @write_locked: true if the runlist is locked for writing - * - * Find the virtual cluster number @vcn in the runlist of the ntfs attribute - * described by the ntfs inode @ni and return the corresponding logical cluster - * number (lcn). - * - * If the @vcn is not mapped yet, the attempt is made to map the attribute - * extent containing the @vcn and the vcn to lcn conversion is retried. - * - * If @write_locked is true the caller has locked the runlist for writing and - * if false for reading. - * - * Since lcns must be >= 0, we use negative return codes with special meaning: - * - * Return code Meaning / Description - * ========================================== - * LCN_HOLE Hole / not allocated on disk. - * LCN_ENOENT There is no such vcn in the runlist, i.e. @vcn is out of bounds. - * LCN_ENOMEM Not enough memory to map runlist. - * LCN_EIO Critical error (runlist/file is corrupt, i/o error, etc). - * - * Locking: - The runlist must be locked on entry and is left locked on return. - * - If @write_locked is 'false', i.e. the runlist is locked for reading, - * the lock may be dropped inside the function so you cannot rely on - * the runlist still being the same when this function returns. - */ -LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, - const bool write_locked) -{ - LCN lcn; - unsigned long flags; - bool is_retry = false; - - BUG_ON(!ni); - ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", - ni->mft_no, (unsigned long long)vcn, - write_locked ? 
"write" : "read"); - BUG_ON(!NInoNonResident(ni)); - BUG_ON(vcn < 0); - if (!ni->runlist.rl) { - read_lock_irqsave(&ni->size_lock, flags); - if (!ni->allocated_size) { - read_unlock_irqrestore(&ni->size_lock, flags); - return LCN_ENOENT; - } - read_unlock_irqrestore(&ni->size_lock, flags); - } -retry_remap: - /* Convert vcn to lcn. If that fails map the runlist and retry once. */ - lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn); - if (likely(lcn >= LCN_HOLE)) { - ntfs_debug("Done, lcn 0x%llx.", (long long)lcn); - return lcn; - } - if (lcn != LCN_RL_NOT_MAPPED) { - if (lcn != LCN_ENOENT) - lcn = LCN_EIO; - } else if (!is_retry) { - int err; - - if (!write_locked) { - up_read(&ni->runlist.lock); - down_write(&ni->runlist.lock); - if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) != - LCN_RL_NOT_MAPPED)) { - up_write(&ni->runlist.lock); - down_read(&ni->runlist.lock); - goto retry_remap; - } - } - err = ntfs_map_runlist_nolock(ni, vcn, NULL); - if (!write_locked) { - up_write(&ni->runlist.lock); - down_read(&ni->runlist.lock); - } - if (likely(!err)) { - is_retry = true; - goto retry_remap; - } - if (err == -ENOENT) - lcn = LCN_ENOENT; - else if (err == -ENOMEM) - lcn = LCN_ENOMEM; - else - lcn = LCN_EIO; - } - if (lcn != LCN_ENOENT) - ntfs_error(ni->vol->sb, "Failed with error code %lli.", - (long long)lcn); - return lcn; -} - -/** - * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode - * @ni: ntfs inode describing the runlist to search - * @vcn: vcn to find - * @ctx: active attribute search context if present or NULL if not - * - * Find the virtual cluster number @vcn in the runlist described by the ntfs - * inode @ni and return the address of the runlist element containing the @vcn. - * - * If the @vcn is not mapped yet, the attempt is made to map the attribute - * extent containing the @vcn and the vcn to lcn conversion is retried. - * - * If @ctx is specified, it is an active search context of @ni and its base mft - * record. 
This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped - * runlist fragments and allows their mapping. If you do not have the mft - * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock() - * will perform the necessary mapping and unmapping. - * - * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and - * restores it before returning. Thus, @ctx will be left pointing to the same - * attribute on return as on entry. However, the actual pointers in @ctx may - * point to different memory locations on return, so you must remember to reset - * any cached pointers from the @ctx, i.e. after the call to - * ntfs_attr_find_vcn_nolock(), you will probably want to do: - * m = ctx->mrec; - * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. - * Note you need to distinguish between the lcn of the returned runlist element - * being >= 0 and LCN_HOLE. In the later case you have to return zeroes on - * read and allocate clusters on write. - * - * Return the runlist element containing the @vcn on success and - * ERR_PTR(-errno) on error. You need to test the return value with IS_ERR() - * to decide if the return is success or failure and PTR_ERR() to get to the - * error code if IS_ERR() is true. - * - * The possible error return codes are: - * -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds. - * -ENOMEM - Not enough memory to map runlist. - * -EIO - Critical error (runlist/file is corrupt, i/o error, etc). - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. you need to either call - * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. - * In that case PTR_ERR(@ctx->mrec) will give you the error code for - * why the mapping of the old inode failed. 
- * - * Locking: - The runlist described by @ni must be locked for writing on entry - * and is locked on return. Note the runlist may be modified when - * needed runlist fragments need to be mapped. - * - If @ctx is NULL, the base mft record of @ni must not be mapped on - * entry and it will be left unmapped on return. - * - If @ctx is not NULL, the base mft record must be mapped on entry - * and it will be left mapped on return. - */ -runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, - ntfs_attr_search_ctx *ctx) -{ - unsigned long flags; - runlist_element *rl; - int err = 0; - bool is_retry = false; - - BUG_ON(!ni); - ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.", - ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out"); - BUG_ON(!NInoNonResident(ni)); - BUG_ON(vcn < 0); - if (!ni->runlist.rl) { - read_lock_irqsave(&ni->size_lock, flags); - if (!ni->allocated_size) { - read_unlock_irqrestore(&ni->size_lock, flags); - return ERR_PTR(-ENOENT); - } - read_unlock_irqrestore(&ni->size_lock, flags); - } -retry_remap: - rl = ni->runlist.rl; - if (likely(rl && vcn >= rl[0].vcn)) { - while (likely(rl->length)) { - if (unlikely(vcn < rl[1].vcn)) { - if (likely(rl->lcn >= LCN_HOLE)) { - ntfs_debug("Done."); - return rl; - } - break; - } - rl++; - } - if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) { - if (likely(rl->lcn == LCN_ENOENT)) - err = -ENOENT; - else - err = -EIO; - } - } - if (!err && !is_retry) { - /* - * If the search context is invalid we cannot map the unmapped - * region. - */ - if (IS_ERR(ctx->mrec)) - err = PTR_ERR(ctx->mrec); - else { - /* - * The @vcn is in an unmapped region, map the runlist - * and retry. 
- */ - err = ntfs_map_runlist_nolock(ni, vcn, ctx); - if (likely(!err)) { - is_retry = true; - goto retry_remap; - } - } - if (err == -EINVAL) - err = -EIO; - } else if (!err) - err = -EIO; - if (err != -ENOENT) - ntfs_error(ni->vol->sb, "Failed with error code %i.", err); - return ERR_PTR(err); -} - -/** - * ntfs_attr_find - find (next) attribute in mft record - * @type: attribute type to find - * @name: attribute name to find (optional, i.e. NULL means don't care) - * @name_len: attribute name length (only needed if @name present) - * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) - * @val: attribute value to find (optional, resident attributes only) - * @val_len: attribute value length - * @ctx: search context with mft record and attribute to search from - * - * You should not need to call this function directly. Use ntfs_attr_lookup() - * instead. - * - * ntfs_attr_find() takes a search context @ctx as parameter and searches the - * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an - * attribute of @type, optionally @name and @val. - * - * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will - * point to the found attribute. - * - * If the attribute is not found, ntfs_attr_find() returns -ENOENT and - * @ctx->attr will point to the attribute before which the attribute being - * searched for would need to be inserted if such an action were to be desired. - * - * On actual error, ntfs_attr_find() returns -EIO. In this case @ctx->attr is - * undefined and in particular do not rely on it not changing. - * - * If @ctx->is_first is 'true', the search begins with @ctx->attr itself. If it - * is 'false', the search begins after @ctx->attr. - * - * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and - * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record - * @ctx->mrec belongs. This is so we can get at the ntfs volume and hence at - * the upcase table. 
If @ic is CASE_SENSITIVE, the comparison is case - * sensitive. When @name is present, @name_len is the @name length in Unicode - * characters. - * - * If @name is not present (NULL), we assume that the unnamed attribute is - * being searched for. - * - * Finally, the resident attribute value @val is looked for, if present. If - * @val is not present (NULL), @val_len is ignored. - * - * ntfs_attr_find() only searches the specified mft record and it ignores the - * presence of an attribute list attribute (unless it is the one being searched - * for, obviously). If you need to take attribute lists into consideration, - * use ntfs_attr_lookup() instead (see below). This also means that you cannot - * use ntfs_attr_find() to search for extent records of non-resident - * attributes, as extents with lowest_vcn != 0 are usually described by the - * attribute list attribute only. - Note that it is possible that the first - * extent is only in the attribute list while the last extent is in the base - * mft record, so do not rely on being able to find the first extent in the - * base mft record. - * - * Warning: Never use @val when looking for attribute types which can be - * non-resident as this most likely will result in a crash! - */ -static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, - const u32 name_len, const IGNORE_CASE_BOOL ic, - const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) -{ - ATTR_RECORD *a; - ntfs_volume *vol = ctx->ntfs_ino->vol; - ntfschar *upcase = vol->upcase; - u32 upcase_len = vol->upcase_len; - - /* - * Iterate over attributes in mft record starting at @ctx->attr, or the - * attribute following that, if @ctx->is_first is 'true'. 
- */ - if (ctx->is_first) { - a = ctx->attr; - ctx->is_first = false; - } else - a = (ATTR_RECORD*)((u8*)ctx->attr + - le32_to_cpu(ctx->attr->length)); - for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { - u8 *mrec_end = (u8 *)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_allocated); - u8 *name_end; - - /* check whether ATTR_RECORD wrap */ - if ((u8 *)a < (u8 *)ctx->mrec) - break; - - /* check whether Attribute Record Header is within bounds */ - if ((u8 *)a > mrec_end || - (u8 *)a + sizeof(ATTR_RECORD) > mrec_end) - break; - - /* check whether ATTR_RECORD's name is within bounds */ - name_end = (u8 *)a + le16_to_cpu(a->name_offset) + - a->name_length * sizeof(ntfschar); - if (name_end > mrec_end) - break; - - ctx->attr = a; - if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || - a->type == AT_END)) - return -ENOENT; - if (unlikely(!a->length)) - break; - - /* check whether ATTR_RECORD's length wrap */ - if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a) - break; - /* check whether ATTR_RECORD's length is within bounds */ - if ((u8 *)a + le32_to_cpu(a->length) > mrec_end) - break; - - if (a->type != type) - continue; - /* - * If @name is present, compare the two names. If @name is - * missing, assume we want an unnamed attribute. - */ - if (!name) { - /* The search failed if the found attribute is named. */ - if (a->name_length) - return -ENOENT; - } else if (!ntfs_are_names_equal(name, name_len, - (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)), - a->name_length, ic, upcase, upcase_len)) { - register int rc; - - rc = ntfs_collate_names(name, name_len, - (ntfschar*)((u8*)a + - le16_to_cpu(a->name_offset)), - a->name_length, 1, IGNORE_CASE, - upcase, upcase_len); - /* - * If @name collates before a->name, there is no - * matching attribute. - */ - if (rc == -1) - return -ENOENT; - /* If the strings are not equal, continue search. 
*/ - if (rc) - continue; - rc = ntfs_collate_names(name, name_len, - (ntfschar*)((u8*)a + - le16_to_cpu(a->name_offset)), - a->name_length, 1, CASE_SENSITIVE, - upcase, upcase_len); - if (rc == -1) - return -ENOENT; - if (rc) - continue; - } - /* - * The names match or @name not present and attribute is - * unnamed. If no @val specified, we have found the attribute - * and are done. - */ - if (!val) - return 0; - /* @val is present; compare values. */ - else { - register int rc; - - rc = memcmp(val, (u8*)a + le16_to_cpu( - a->data.resident.value_offset), - min_t(u32, val_len, le32_to_cpu( - a->data.resident.value_length))); - /* - * If @val collates before the current attribute's - * value, there is no matching attribute. - */ - if (!rc) { - register u32 avl; - - avl = le32_to_cpu( - a->data.resident.value_length); - if (val_len == avl) - return 0; - if (val_len < avl) - return -ENOENT; - } else if (rc < 0) - return -ENOENT; - } - } - ntfs_error(vol->sb, "Inode is corrupt. Run chkdsk."); - NVolSetErrors(vol); - return -EIO; -} - -/** - * load_attribute_list - load an attribute list into memory - * @vol: ntfs volume from which to read - * @runlist: runlist of the attribute list - * @al_start: destination buffer - * @size: size of the destination buffer in bytes - * @initialized_size: initialized size of the attribute list - * - * Walk the runlist @runlist and load all clusters from it copying them into - * the linear buffer @al. The maximum number of bytes copied to @al is @size - * bytes. Note, @size does not need to be a multiple of the cluster size. If - * @initialized_size is less than @size, the region in @al between - * @initialized_size and @size will be zeroed and not read from disk. - * - * Return 0 on success or -errno on error. 
- */ -int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start, - const s64 size, const s64 initialized_size) -{ - LCN lcn; - u8 *al = al_start; - u8 *al_end = al + initialized_size; - runlist_element *rl; - struct buffer_head *bh; - struct super_block *sb; - unsigned long block_size; - unsigned long block, max_block; - int err = 0; - unsigned char block_size_bits; - - ntfs_debug("Entering."); - if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 || - initialized_size > size) - return -EINVAL; - if (!initialized_size) { - memset(al, 0, size); - return 0; - } - sb = vol->sb; - block_size = sb->s_blocksize; - block_size_bits = sb->s_blocksize_bits; - down_read(&runlist->lock); - rl = runlist->rl; - if (!rl) { - ntfs_error(sb, "Cannot read attribute list since runlist is " - "missing."); - goto err_out; - } - /* Read all clusters specified by the runlist one run at a time. */ - while (rl->length) { - lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn); - ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", - (unsigned long long)rl->vcn, - (unsigned long long)lcn); - /* The attribute list cannot be sparse. */ - if (lcn < 0) { - ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. Cannot " - "read attribute list."); - goto err_out; - } - block = lcn << vol->cluster_size_bits >> block_size_bits; - /* Read the run from device in chunks of block_size bytes. */ - max_block = block + (rl->length << vol->cluster_size_bits >> - block_size_bits); - ntfs_debug("max_block = 0x%lx.", max_block); - do { - ntfs_debug("Reading block = 0x%lx.", block); - bh = sb_bread(sb, block); - if (!bh) { - ntfs_error(sb, "sb_bread() failed. 
Cannot " - "read attribute list."); - goto err_out; - } - if (al + block_size >= al_end) - goto do_final; - memcpy(al, bh->b_data, block_size); - brelse(bh); - al += block_size; - } while (++block < max_block); - rl++; - } - if (initialized_size < size) { -initialize: - memset(al_start + initialized_size, 0, size - initialized_size); - } -done: - up_read(&runlist->lock); - return err; -do_final: - if (al < al_end) { - /* - * Partial block. - * - * Note: The attribute list can be smaller than its allocation - * by multiple clusters. This has been encountered by at least - * two people running Windows XP, thus we cannot do any - * truncation sanity checking here. (AIA) - */ - memcpy(al, bh->b_data, al_end - al); - brelse(bh); - if (initialized_size < size) - goto initialize; - goto done; - } - brelse(bh); - /* Real overflow! */ - ntfs_error(sb, "Attribute list buffer overflow. Read attribute list " - "is truncated."); -err_out: - err = -EIO; - goto done; -} - -/** - * ntfs_external_attr_find - find an attribute in the attribute list of an inode - * @type: attribute type to find - * @name: attribute name to find (optional, i.e. NULL means don't care) - * @name_len: attribute name length (only needed if @name present) - * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) - * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) - * @val: attribute value to find (optional, resident attributes only) - * @val_len: attribute value length - * @ctx: search context with mft record and attribute to search from - * - * You should not need to call this function directly. Use ntfs_attr_lookup() - * instead. - * - * Find an attribute by searching the attribute list for the corresponding - * attribute list entry. Having found the entry, map the mft record if the - * attribute is in a different mft record/inode, ntfs_attr_find() the attribute - * in there and return it. 
- * - * On first search @ctx->ntfs_ino must be the base mft record and @ctx must - * have been obtained from a call to ntfs_attr_get_search_ctx(). On subsequent - * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is - * then the base inode). - * - * After finishing with the attribute/mft record you need to call - * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any - * mapped inodes, etc). - * - * If the attribute is found, ntfs_external_attr_find() returns 0 and - * @ctx->attr will point to the found attribute. @ctx->mrec will point to the - * mft record in which @ctx->attr is located and @ctx->al_entry will point to - * the attribute list entry for the attribute. - * - * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and - * @ctx->attr will point to the attribute in the base mft record before which - * the attribute being searched for would need to be inserted if such an action - * were to be desired. @ctx->mrec will point to the mft record in which - * @ctx->attr is located and @ctx->al_entry will point to the attribute list - * entry of the attribute before which the attribute being searched for would - * need to be inserted if such an action were to be desired. - * - * Thus to insert the not found attribute, one wants to add the attribute to - * @ctx->mrec (the base mft record) and if there is not enough space, the - * attribute should be placed in a newly allocated extent mft record. The - * attribute list entry for the inserted attribute should be inserted in the - * attribute list attribute at @ctx->al_entry. - * - * On actual error, ntfs_external_attr_find() returns -EIO. In this case - * @ctx->attr is undefined and in particular do not rely on it not changing. 
- */ -static int ntfs_external_attr_find(const ATTR_TYPE type, - const ntfschar *name, const u32 name_len, - const IGNORE_CASE_BOOL ic, const VCN lowest_vcn, - const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) -{ - ntfs_inode *base_ni, *ni; - ntfs_volume *vol; - ATTR_LIST_ENTRY *al_entry, *next_al_entry; - u8 *al_start, *al_end; - ATTR_RECORD *a; - ntfschar *al_name; - u32 al_name_len; - int err = 0; - static const char *es = " Unmount and run chkdsk."; - - ni = ctx->ntfs_ino; - base_ni = ctx->base_ntfs_ino; - ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type); - if (!base_ni) { - /* First call happens with the base mft record. */ - base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino; - ctx->base_mrec = ctx->mrec; - } - if (ni == base_ni) - ctx->base_attr = ctx->attr; - if (type == AT_END) - goto not_found; - vol = base_ni->vol; - al_start = base_ni->attr_list; - al_end = al_start + base_ni->attr_list_size; - if (!ctx->al_entry) - ctx->al_entry = (ATTR_LIST_ENTRY*)al_start; - /* - * Iterate over entries in attribute list starting at @ctx->al_entry, - * or the entry following that, if @ctx->is_first is 'true'. - */ - if (ctx->is_first) { - al_entry = ctx->al_entry; - ctx->is_first = false; - } else - al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry + - le16_to_cpu(ctx->al_entry->length)); - for (;; al_entry = next_al_entry) { - /* Out of bounds check. */ - if ((u8*)al_entry < base_ni->attr_list || - (u8*)al_entry > al_end) - break; /* Inode is corrupt. */ - ctx->al_entry = al_entry; - /* Catch the end of the attribute list. 
*/ - if ((u8*)al_entry == al_end) - goto not_found; - if (!al_entry->length) - break; - if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + - le16_to_cpu(al_entry->length) > al_end) - break; - next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + - le16_to_cpu(al_entry->length)); - if (le32_to_cpu(al_entry->type) > le32_to_cpu(type)) - goto not_found; - if (type != al_entry->type) - continue; - /* - * If @name is present, compare the two names. If @name is - * missing, assume we want an unnamed attribute. - */ - al_name_len = al_entry->name_length; - al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset); - if (!name) { - if (al_name_len) - goto not_found; - } else if (!ntfs_are_names_equal(al_name, al_name_len, name, - name_len, ic, vol->upcase, vol->upcase_len)) { - register int rc; - - rc = ntfs_collate_names(name, name_len, al_name, - al_name_len, 1, IGNORE_CASE, - vol->upcase, vol->upcase_len); - /* - * If @name collates before al_name, there is no - * matching attribute. - */ - if (rc == -1) - goto not_found; - /* If the strings are not equal, continue search. */ - if (rc) - continue; - /* - * FIXME: Reverse engineering showed 0, IGNORE_CASE but - * that is inconsistent with ntfs_attr_find(). The - * subsequent rc checks were also different. Perhaps I - * made a mistake in one of the two. Need to recheck - * which is correct or at least see what is going on... - * (AIA) - */ - rc = ntfs_collate_names(name, name_len, al_name, - al_name_len, 1, CASE_SENSITIVE, - vol->upcase, vol->upcase_len); - if (rc == -1) - goto not_found; - if (rc) - continue; - } - /* - * The names match or @name not present and attribute is - * unnamed. Now check @lowest_vcn. Continue search if the - * next attribute list entry still fits @lowest_vcn. Otherwise - * we have reached the right one or the search has failed. 
- */ - if (lowest_vcn && (u8*)next_al_entry >= al_start && - (u8*)next_al_entry + 6 < al_end && - (u8*)next_al_entry + le16_to_cpu( - next_al_entry->length) <= al_end && - sle64_to_cpu(next_al_entry->lowest_vcn) <= - lowest_vcn && - next_al_entry->type == al_entry->type && - next_al_entry->name_length == al_name_len && - ntfs_are_names_equal((ntfschar*)((u8*) - next_al_entry + - next_al_entry->name_offset), - next_al_entry->name_length, - al_name, al_name_len, CASE_SENSITIVE, - vol->upcase, vol->upcase_len)) - continue; - if (MREF_LE(al_entry->mft_reference) == ni->mft_no) { - if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) { - ntfs_error(vol->sb, "Found stale mft " - "reference in attribute list " - "of base inode 0x%lx.%s", - base_ni->mft_no, es); - err = -EIO; - break; - } - } else { /* Mft references do not match. */ - /* If there is a mapped record unmap it first. */ - if (ni != base_ni) - unmap_extent_mft_record(ni); - /* Do we want the base record back? */ - if (MREF_LE(al_entry->mft_reference) == - base_ni->mft_no) { - ni = ctx->ntfs_ino = base_ni; - ctx->mrec = ctx->base_mrec; - } else { - /* We want an extent record. */ - ctx->mrec = map_extent_mft_record(base_ni, - le64_to_cpu( - al_entry->mft_reference), &ni); - if (IS_ERR(ctx->mrec)) { - ntfs_error(vol->sb, "Failed to map " - "extent mft record " - "0x%lx of base inode " - "0x%lx.%s", - MREF_LE(al_entry-> - mft_reference), - base_ni->mft_no, es); - err = PTR_ERR(ctx->mrec); - if (err == -ENOENT) - err = -EIO; - /* Cause @ctx to be sanitized below. */ - ni = NULL; - break; - } - ctx->ntfs_ino = ni; - } - ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + - le16_to_cpu(ctx->mrec->attrs_offset)); - } - /* - * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the - * mft record containing the attribute represented by the - * current al_entry. 
- */ - /* - * We could call into ntfs_attr_find() to find the right - * attribute in this mft record but this would be less - * efficient and not quite accurate as ntfs_attr_find() ignores - * the attribute instance numbers for example which become - * important when one plays with attribute lists. Also, - * because a proper match has been found in the attribute list - * entry above, the comparison can now be optimized. So it is - * worth re-implementing a simplified ntfs_attr_find() here. - */ - a = ctx->attr; - /* - * Use a manual loop so we can still use break and continue - * with the same meanings as above. - */ -do_next_attr_loop: - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_allocated)) - break; - if (a->type == AT_END) - break; - if (!a->length) - break; - if (al_entry->instance != a->instance) - goto do_next_attr; - /* - * If the type and/or the name are mismatched between the - * attribute list entry and the attribute record, there is - * corruption so we break and return error EIO. - */ - if (al_entry->type != a->type) - break; - if (!ntfs_are_names_equal((ntfschar*)((u8*)a + - le16_to_cpu(a->name_offset)), a->name_length, - al_name, al_name_len, CASE_SENSITIVE, - vol->upcase, vol->upcase_len)) - break; - ctx->attr = a; - /* - * If no @val specified or @val specified and it matches, we - * have found it! - */ - if (!val || (!a->non_resident && le32_to_cpu( - a->data.resident.value_length) == val_len && - !memcmp((u8*)a + - le16_to_cpu(a->data.resident.value_offset), - val, val_len))) { - ntfs_debug("Done, found."); - return 0; - } -do_next_attr: - /* Proceed to the next attribute in the current mft record. 
*/ - a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length)); - goto do_next_attr_loop; - } - if (!err) { - ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt " - "attribute list attribute.%s", base_ni->mft_no, - es); - err = -EIO; - } - if (ni != base_ni) { - if (ni) - unmap_extent_mft_record(ni); - ctx->ntfs_ino = base_ni; - ctx->mrec = ctx->base_mrec; - ctx->attr = ctx->base_attr; - } - if (err != -ENOMEM) - NVolSetErrors(vol); - return err; -not_found: - /* - * If we were looking for AT_END, we reset the search context @ctx and - * use ntfs_attr_find() to seek to the end of the base mft record. - */ - if (type == AT_END) { - ntfs_attr_reinit_search_ctx(ctx); - return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len, - ctx); - } - /* - * The attribute was not found. Before we return, we want to ensure - * @ctx->mrec and @ctx->attr indicate the position at which the - * attribute should be inserted in the base mft record. Since we also - * want to preserve @ctx->al_entry we cannot reinitialize the search - * context using ntfs_attr_reinit_search_ctx() as this would set - * @ctx->al_entry to NULL. Thus we do the necessary bits manually (see - * ntfs_attr_init_search_ctx() below). Note, we _only_ preserve - * @ctx->al_entry as the remaining fields (base_*) are identical to - * their non base_ counterparts and we cannot set @ctx->base_attr - * correctly yet as we do not know what @ctx->attr will be set to by - * the call to ntfs_attr_find() below. 
- */ - if (ni != base_ni) - unmap_extent_mft_record(ni); - ctx->mrec = ctx->base_mrec; - ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + - le16_to_cpu(ctx->mrec->attrs_offset)); - ctx->is_first = true; - ctx->ntfs_ino = base_ni; - ctx->base_ntfs_ino = NULL; - ctx->base_mrec = NULL; - ctx->base_attr = NULL; - /* - * In case there are multiple matches in the base mft record, need to - * keep enumerating until we get an attribute not found response (or - * another error), otherwise we would keep returning the same attribute - * over and over again and all programs using us for enumeration would - * lock up in a tight loop. - */ - do { - err = ntfs_attr_find(type, name, name_len, ic, val, val_len, - ctx); - } while (!err); - ntfs_debug("Done, not found."); - return err; -} - -/** - * ntfs_attr_lookup - find an attribute in an ntfs inode - * @type: attribute type to find - * @name: attribute name to find (optional, i.e. NULL means don't care) - * @name_len: attribute name length (only needed if @name present) - * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) - * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) - * @val: attribute value to find (optional, resident attributes only) - * @val_len: attribute value length - * @ctx: search context with mft record and attribute to search from - * - * Find an attribute in an ntfs inode. On first search @ctx->ntfs_ino must - * be the base mft record and @ctx must have been obtained from a call to - * ntfs_attr_get_search_ctx(). - * - * This function transparently handles attribute lists and @ctx is used to - * continue searches where they were left off at. - * - * After finishing with the attribute/mft record you need to call - * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any - * mapped inodes, etc). - * - * Return 0 if the search was successful and -errno if not. - * - * When 0, @ctx->attr is the found attribute and it is in mft record - * @ctx->mrec. 
If an attribute list attribute is present, @ctx->al_entry is - * the attribute list entry of the found attribute. - * - * When -ENOENT, @ctx->attr is the attribute which collates just after the - * attribute being searched for, i.e. if one wants to add the attribute to the - * mft record this is the correct place to insert it into. If an attribute - * list attribute is present, @ctx->al_entry is the attribute list entry which - * collates just after the attribute list entry of the attribute being searched - * for, i.e. if one wants to add the attribute to the mft record this is the - * correct place to insert its attribute list entry into. - * - * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is - * then undefined and in particular you should not rely on it not changing. - */ -int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, - const u32 name_len, const IGNORE_CASE_BOOL ic, - const VCN lowest_vcn, const u8 *val, const u32 val_len, - ntfs_attr_search_ctx *ctx) -{ - ntfs_inode *base_ni; - - ntfs_debug("Entering."); - BUG_ON(IS_ERR(ctx->mrec)); - if (ctx->base_ntfs_ino) - base_ni = ctx->base_ntfs_ino; - else - base_ni = ctx->ntfs_ino; - /* Sanity check, just for debugging really. */ - BUG_ON(!base_ni); - if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST) - return ntfs_attr_find(type, name, name_len, ic, val, val_len, - ctx); - return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn, - val, val_len, ctx); -} - -/** - * ntfs_attr_init_search_ctx - initialize an attribute search context - * @ctx: attribute search context to initialize - * @ni: ntfs inode with which to initialize the search context - * @mrec: mft record with which to initialize the search context - * - * Initialize the attribute search context @ctx with @ni and @mrec. 
- */ -static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx, - ntfs_inode *ni, MFT_RECORD *mrec) -{ - *ctx = (ntfs_attr_search_ctx) { - .mrec = mrec, - /* Sanity checks are performed elsewhere. */ - .attr = (ATTR_RECORD*)((u8*)mrec + - le16_to_cpu(mrec->attrs_offset)), - .is_first = true, - .ntfs_ino = ni, - }; -} - -/** - * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context - * @ctx: attribute search context to reinitialize - * - * Reinitialize the attribute search context @ctx, unmapping an associated - * extent mft record if present, and initialize the search context again. - * - * This is used when a search for a new attribute is being started to reset - * the search context to the beginning. - */ -void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx) -{ - if (likely(!ctx->base_ntfs_ino)) { - /* No attribute list. */ - ctx->is_first = true; - /* Sanity checks are performed elsewhere. */ - ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + - le16_to_cpu(ctx->mrec->attrs_offset)); - /* - * This needs resetting due to ntfs_external_attr_find() which - * can leave it set despite having zeroed ctx->base_ntfs_ino. - */ - ctx->al_entry = NULL; - return; - } /* Attribute list. */ - if (ctx->ntfs_ino != ctx->base_ntfs_ino) - unmap_extent_mft_record(ctx->ntfs_ino); - ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec); - return; -} - -/** - * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context - * @ni: ntfs inode with which to initialize the search context - * @mrec: mft record with which to initialize the search context - * - * Allocate a new attribute search context, initialize it with @ni and @mrec, - * and return it. Return NULL if allocation failed. 
- */ -ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec) -{ - ntfs_attr_search_ctx *ctx; - - ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS); - if (ctx) - ntfs_attr_init_search_ctx(ctx, ni, mrec); - return ctx; -} - -/** - * ntfs_attr_put_search_ctx - release an attribute search context - * @ctx: attribute search context to free - * - * Release the attribute search context @ctx, unmapping an associated extent - * mft record if present. - */ -void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx) -{ - if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino) - unmap_extent_mft_record(ctx->ntfs_ino); - kmem_cache_free(ntfs_attr_ctx_cache, ctx); - return; -} - -#ifdef NTFS_RW - -/** - * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file - * @vol: ntfs volume to which the attribute belongs - * @type: attribute type which to find - * - * Search for the attribute definition record corresponding to the attribute - * @type in the $AttrDef system file. - * - * Return the attribute type definition record if found and NULL if not found. - */ -static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol, - const ATTR_TYPE type) -{ - ATTR_DEF *ad; - - BUG_ON(!vol->attrdef); - BUG_ON(!type); - for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef < - vol->attrdef_size && ad->type; ++ad) { - /* We have not found it yet, carry on searching. */ - if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type))) - continue; - /* We found the attribute; return it. */ - if (likely(ad->type == type)) - return ad; - /* We have gone too far already. No point in continuing. */ - break; - } - /* Attribute not found. 
*/ - ntfs_debug("Attribute type 0x%x not found in $AttrDef.", - le32_to_cpu(type)); - return NULL; -} - -/** - * ntfs_attr_size_bounds_check - check a size of an attribute type for validity - * @vol: ntfs volume to which the attribute belongs - * @type: attribute type which to check - * @size: size which to check - * - * Check whether the @size in bytes is valid for an attribute of @type on the - * ntfs volume @vol. This information is obtained from $AttrDef system file. - * - * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not - * listed in $AttrDef. - */ -int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type, - const s64 size) -{ - ATTR_DEF *ad; - - BUG_ON(size < 0); - /* - * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not - * listed in $AttrDef. - */ - if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024)) - return -ERANGE; - /* Get the $AttrDef entry for the attribute @type. */ - ad = ntfs_attr_find_in_attrdef(vol, type); - if (unlikely(!ad)) - return -ENOENT; - /* Do the bounds check. */ - if (((sle64_to_cpu(ad->min_size) > 0) && - size < sle64_to_cpu(ad->min_size)) || - ((sle64_to_cpu(ad->max_size) > 0) && size > - sle64_to_cpu(ad->max_size))) - return -ERANGE; - return 0; -} - -/** - * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident - * @vol: ntfs volume to which the attribute belongs - * @type: attribute type which to check - * - * Check whether the attribute of @type on the ntfs volume @vol is allowed to - * be non-resident. This information is obtained from $AttrDef system file. - * - * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and - * -ENOENT if the attribute is not listed in $AttrDef. - */ -int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type) -{ - ATTR_DEF *ad; - - /* Find the attribute definition record in $AttrDef. 
*/ - ad = ntfs_attr_find_in_attrdef(vol, type); - if (unlikely(!ad)) - return -ENOENT; - /* Check the flags and return the result. */ - if (ad->flags & ATTR_DEF_RESIDENT) - return -EPERM; - return 0; -} - -/** - * ntfs_attr_can_be_resident - check if an attribute can be resident - * @vol: ntfs volume to which the attribute belongs - * @type: attribute type which to check - * - * Check whether the attribute of @type on the ntfs volume @vol is allowed to - * be resident. This information is derived from our ntfs knowledge and may - * not be completely accurate, especially when user defined attributes are - * present. Basically we allow everything to be resident except for index - * allocation and $EA attributes. - * - * Return 0 if the attribute is allowed to be non-resident and -EPERM if not. - * - * Warning: In the system file $MFT the attribute $Bitmap must be non-resident - * otherwise windows will not boot (blue screen of death)! We cannot - * check for this here as we do not know which inode's $Bitmap is - * being asked about so the caller needs to special case this. - */ -int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type) -{ - if (type == AT_INDEX_ALLOCATION) - return -EPERM; - return 0; -} - -/** - * ntfs_attr_record_resize - resize an attribute record - * @m: mft record containing attribute record - * @a: attribute record to resize - * @new_size: new size in bytes to which to resize the attribute record @a - * - * Resize the attribute record @a, i.e. the resident part of the attribute, in - * the mft record @m to @new_size bytes. - * - * Return 0 on success and -errno on error. The following error codes are - * defined: - * -ENOSPC - Not enough space in the mft record @m to perform the resize. - * - * Note: On error, no modifications have been performed whatsoever. - * - * Warning: If you make a record smaller without having copied all the data you - * are interested in the data may be overwritten. 
- */ -int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size) -{ - ntfs_debug("Entering for new_size %u.", new_size); - /* Align to 8 bytes if it is not already done. */ - if (new_size & 7) - new_size = (new_size + 7) & ~7; - /* If the actual attribute length has changed, move things around. */ - if (new_size != le32_to_cpu(a->length)) { - u32 new_muse = le32_to_cpu(m->bytes_in_use) - - le32_to_cpu(a->length) + new_size; - /* Not enough space in this mft record. */ - if (new_muse > le32_to_cpu(m->bytes_allocated)) - return -ENOSPC; - /* Move attributes following @a to their new location. */ - memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length), - le32_to_cpu(m->bytes_in_use) - ((u8*)a - - (u8*)m) - le32_to_cpu(a->length)); - /* Adjust @m to reflect the change in used space. */ - m->bytes_in_use = cpu_to_le32(new_muse); - /* Adjust @a to reflect the new size. */ - if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length)) - a->length = cpu_to_le32(new_size); - } - return 0; -} - -/** - * ntfs_resident_attr_value_resize - resize the value of a resident attribute - * @m: mft record containing attribute record - * @a: attribute record whose value to resize - * @new_size: new size in bytes to which to resize the attribute value of @a - * - * Resize the value of the attribute @a in the mft record @m to @new_size bytes. - * If the value is made bigger, the newly allocated space is cleared. - * - * Return 0 on success and -errno on error. The following error codes are - * defined: - * -ENOSPC - Not enough space in the mft record @m to perform the resize. - * - * Note: On error, no modifications have been performed whatsoever. - * - * Warning: If you make a record smaller without having copied all the data you - * are interested in the data may be overwritten. - */ -int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, - const u32 new_size) -{ - u32 old_size; - - /* Resize the resident part of the attribute record. 
*/ - if (ntfs_attr_record_resize(m, a, - le16_to_cpu(a->data.resident.value_offset) + new_size)) - return -ENOSPC; - /* - * The resize succeeded! If we made the attribute value bigger, clear - * the area between the old size and @new_size. - */ - old_size = le32_to_cpu(a->data.resident.value_length); - if (new_size > old_size) - memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) + - old_size, 0, new_size - old_size); - /* Finally update the length of the attribute value. */ - a->data.resident.value_length = cpu_to_le32(new_size); - return 0; -} - -/** - * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute - * @ni: ntfs inode describing the attribute to convert - * @data_size: size of the resident data to copy to the non-resident attribute - * - * Convert the resident ntfs attribute described by the ntfs inode @ni to a - * non-resident one. - * - * @data_size must be equal to the attribute value size. This is needed since - * we need to know the size before we can map the mft record and our callers - * always know it. The reason we cannot simply read the size from the vfs - * inode i_size is that this is not necessarily uptodate. This happens when - * ntfs_attr_make_non_resident() is called in the ->truncate call path(s). - * - * Return 0 on success and -errno on error. The following error return codes - * are defined: - * -EPERM - The attribute is not allowed to be non-resident. - * -ENOMEM - Not enough memory. - * -ENOSPC - Not enough disk space. - * -EINVAL - Attribute not defined on the volume. - * -EIO - I/o error or other error. - * Note that -ENOSPC is also returned in the case that there is not enough - * space in the mft record to do the conversion. This can happen when the mft - * record is already very full. The caller is responsible for trying to make - * space in the mft record and trying again. 
FIXME: Do we need a separate - * error return code for this kind of -ENOSPC or is it always worth trying - * again in case the attribute may then fit in a resident state so no need to - * make it non-resident at all? Ho-hum... (AIA) - * - * NOTE to self: No changes in the attribute list are required to move from - * a resident to a non-resident attribute. - * - * Locking: - The caller must hold i_mutex on the inode. - */ -int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) -{ - s64 new_size; - struct inode *vi = VFS_I(ni); - ntfs_volume *vol = ni->vol; - ntfs_inode *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - struct page *page; - runlist_element *rl; - u8 *kaddr; - unsigned long flags; - int mp_size, mp_ofs, name_ofs, arec_size, err, err2; - u32 attr_size; - u8 old_res_attr_flags; - - /* Check that the attribute is allowed to be non-resident. */ - err = ntfs_attr_can_be_non_resident(vol, ni->type); - if (unlikely(err)) { - if (err == -EPERM) - ntfs_debug("Attribute is not allowed to be " - "non-resident."); - else - ntfs_debug("Attribute not defined on the NTFS " - "volume!"); - return err; - } - /* - * FIXME: Compressed and encrypted attributes are not supported when - * writing and we should never have gotten here for them. - */ - BUG_ON(NInoCompressed(ni)); - BUG_ON(NInoEncrypted(ni)); - /* - * The size needs to be aligned to a cluster boundary for allocation - * purposes. - */ - new_size = (data_size + vol->cluster_size - 1) & - ~(vol->cluster_size - 1); - if (new_size > 0) { - /* - * Will need the page later and since the page lock nests - * outside all ntfs locks, we need to get the page now. - */ - page = find_or_create_page(vi->i_mapping, 0, - mapping_gfp_mask(vi->i_mapping)); - if (unlikely(!page)) - return -ENOMEM; - /* Start by allocating clusters to hold the attribute value. 
*/ - rl = ntfs_cluster_alloc(vol, 0, new_size >> - vol->cluster_size_bits, -1, DATA_ZONE, true); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - ntfs_debug("Failed to allocate cluster%s, error code " - "%i.", (new_size >> - vol->cluster_size_bits) > 1 ? "s" : "", - err); - goto page_err_out; - } - } else { - rl = NULL; - page = NULL; - } - /* Determine the size of the mapping pairs array. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1); - if (unlikely(mp_size < 0)) { - err = mp_size; - ntfs_debug("Failed to get size for mapping pairs array, error " - "code %i.", err); - goto rl_err_out; - } - down_write(&ni->runlist.lock); - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(NInoNonResident(ni)); - BUG_ON(a->non_resident); - /* - * Calculate new offsets for the name and the mapping pairs array. - */ - if (NInoSparse(ni) || NInoCompressed(ni)) - name_ofs = (offsetof(ATTR_REC, - data.non_resident.compressed_size) + - sizeof(a->data.non_resident.compressed_size) + - 7) & ~7; - else - name_ofs = (offsetof(ATTR_REC, - data.non_resident.compressed_size) + 7) & ~7; - mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; - /* - * Determine the size of the resident part of the now non-resident - * attribute record. - */ - arec_size = (mp_ofs + mp_size + 7) & ~7; - /* - * If the page is not uptodate bring it uptodate by copying from the - * attribute value. 
- */ - attr_size = le32_to_cpu(a->data.resident.value_length); - BUG_ON(attr_size != data_size); - if (page && !PageUptodate(page)) { - kaddr = kmap_atomic(page); - memcpy(kaddr, (u8*)a + - le16_to_cpu(a->data.resident.value_offset), - attr_size); - memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size); - kunmap_atomic(kaddr); - flush_dcache_page(page); - SetPageUptodate(page); - } - /* Backup the attribute flag. */ - old_res_attr_flags = a->data.resident.flags; - /* Resize the resident part of the attribute record. */ - err = ntfs_attr_record_resize(m, a, arec_size); - if (unlikely(err)) - goto err_out; - /* - * Convert the resident part of the attribute record to describe a - * non-resident attribute. - */ - a->non_resident = 1; - /* Move the attribute name if it exists and update the offset. */ - if (a->name_length) - memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), - a->name_length * sizeof(ntfschar)); - a->name_offset = cpu_to_le16(name_ofs); - /* Setup the fields specific to non-resident attributes. */ - a->data.non_resident.lowest_vcn = 0; - a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >> - vol->cluster_size_bits); - a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs); - memset(&a->data.non_resident.reserved, 0, - sizeof(a->data.non_resident.reserved)); - a->data.non_resident.allocated_size = cpu_to_sle64(new_size); - a->data.non_resident.data_size = - a->data.non_resident.initialized_size = - cpu_to_sle64(attr_size); - if (NInoSparse(ni) || NInoCompressed(ni)) { - a->data.non_resident.compression_unit = 0; - if (NInoCompressed(ni) || vol->major_ver < 3) - a->data.non_resident.compression_unit = 4; - a->data.non_resident.compressed_size = - a->data.non_resident.allocated_size; - } else - a->data.non_resident.compression_unit = 0; - /* Generate the mapping pairs array into the attribute record. 
*/ - err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs, - arec_size - mp_ofs, rl, 0, -1, NULL); - if (unlikely(err)) { - ntfs_debug("Failed to build mapping pairs, error code %i.", - err); - goto undo_err_out; - } - /* Setup the in-memory attribute structure to be non-resident. */ - ni->runlist.rl = rl; - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = new_size; - if (NInoSparse(ni) || NInoCompressed(ni)) { - ni->itype.compressed.size = ni->allocated_size; - if (a->data.non_resident.compression_unit) { - ni->itype.compressed.block_size = 1U << (a->data. - non_resident.compression_unit + - vol->cluster_size_bits); - ni->itype.compressed.block_size_bits = - ffs(ni->itype.compressed.block_size) - - 1; - ni->itype.compressed.block_clusters = 1U << - a->data.non_resident.compression_unit; - } else { - ni->itype.compressed.block_size = 0; - ni->itype.compressed.block_size_bits = 0; - ni->itype.compressed.block_clusters = 0; - } - vi->i_blocks = ni->itype.compressed.size >> 9; - } else - vi->i_blocks = ni->allocated_size >> 9; - write_unlock_irqrestore(&ni->size_lock, flags); - /* - * This needs to be last since the address space operations ->read_folio - * and ->writepage can run concurrently with us as they are not - * serialized on i_mutex. Note, we are not allowed to fail once we flip - * this switch, which is another reason to do this last. - */ - NInoSetNonResident(ni); - /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - if (page) { - set_page_dirty(page); - unlock_page(page); - put_page(page); - } - ntfs_debug("Done."); - return 0; -undo_err_out: - /* Convert the attribute back into a resident attribute. */ - a->non_resident = 0; - /* Move the attribute name if it exists and update the offset. 
*/ - name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) + - sizeof(a->data.resident.reserved) + 7) & ~7; - if (a->name_length) - memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), - a->name_length * sizeof(ntfschar)); - mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; - a->name_offset = cpu_to_le16(name_ofs); - arec_size = (mp_ofs + attr_size + 7) & ~7; - /* Resize the resident part of the attribute record. */ - err2 = ntfs_attr_record_resize(m, a, arec_size); - if (unlikely(err2)) { - /* - * This cannot happen (well if memory corruption is at work it - * could happen in theory), but deal with it as well as we can. - * If the old size is too small, truncate the attribute, - * otherwise simply give it a larger allocated size. - * FIXME: Should check whether chkdsk complains when the - * allocated size is much bigger than the resident value size. - */ - arec_size = le32_to_cpu(a->length); - if ((mp_ofs + attr_size) > arec_size) { - err2 = attr_size; - attr_size = arec_size - mp_ofs; - ntfs_error(vol->sb, "Failed to undo partial resident " - "to non-resident attribute " - "conversion. Truncating inode 0x%lx, " - "attribute type 0x%x from %i bytes to " - "%i bytes to maintain metadata " - "consistency. THIS MEANS YOU ARE " - "LOSING %i BYTES DATA FROM THIS %s.", - vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - err2, attr_size, err2 - attr_size, - ((ni->type == AT_DATA) && - !ni->name_len) ? "FILE": "ATTRIBUTE"); - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = attr_size; - i_size_write(vi, attr_size); - write_unlock_irqrestore(&ni->size_lock, flags); - } - } - /* Setup the fields specific to resident attributes. */ - a->data.resident.value_length = cpu_to_le32(attr_size); - a->data.resident.value_offset = cpu_to_le16(mp_ofs); - a->data.resident.flags = old_res_attr_flags; - memset(&a->data.resident.reserved, 0, - sizeof(a->data.resident.reserved)); - /* Copy the data from the page back to the attribute value. 
*/ - if (page) { - kaddr = kmap_atomic(page); - memcpy((u8*)a + mp_ofs, kaddr, attr_size); - kunmap_atomic(kaddr); - } - /* Setup the allocated size in the ntfs inode in case it changed. */ - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = arec_size - mp_ofs; - write_unlock_irqrestore(&ni->size_lock, flags); - /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ni->runlist.rl = NULL; - up_write(&ni->runlist.lock); -rl_err_out: - if (rl) { - if (ntfs_cluster_free_from_rl(vol, rl) < 0) { - ntfs_error(vol->sb, "Failed to release allocated " - "cluster(s) in error code path. Run " - "chkdsk to recover the lost " - "cluster(s)."); - NVolSetErrors(vol); - } - ntfs_free(rl); -page_err_out: - unlock_page(page); - put_page(page); - } - if (err == -EINVAL) - err = -EIO; - return err; -} - -/** - * ntfs_attr_extend_allocation - extend the allocated space of an attribute - * @ni: ntfs inode of the attribute whose allocation to extend - * @new_alloc_size: new size in bytes to which to extend the allocation to - * @new_data_size: new size in bytes to which to extend the data to - * @data_start: beginning of region which is required to be non-sparse - * - * Extend the allocated space of an attribute described by the ntfs inode @ni - * to @new_alloc_size bytes. If @data_start is -1, the whole extension may be - * implemented as a hole in the file (as long as both the volume and the ntfs - * inode @ni have sparse support enabled). If @data_start is >= 0, then the - * region between the old allocated size and @data_start - 1 may be made sparse - * but the regions between @data_start and @new_alloc_size must be backed by - * actual clusters. - * - * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size - * of the attribute is extended to @new_data_size. 
Note that the i_size of the - * vfs inode is not updated. Only the data size in the base attribute record - * is updated. The caller has to update i_size separately if this is required. - * WARNING: It is a BUG() for @new_data_size to be smaller than the old data - * size as well as for @new_data_size to be greater than @new_alloc_size. - * - * For resident attributes this involves resizing the attribute record and if - * necessary moving it and/or other attributes into extent mft records and/or - * converting the attribute to a non-resident attribute which in turn involves - * extending the allocation of a non-resident attribute as described below. - * - * For non-resident attributes this involves allocating clusters in the data - * zone on the volume (except for regions that are being made sparse) and - * extending the run list to describe the allocated clusters as well as - * updating the mapping pairs array of the attribute. This in turn involves - * resizing the attribute record and if necessary moving it and/or other - * attributes into extent mft records and/or splitting the attribute record - * into multiple extent attribute records. - * - * Also, the attribute list attribute is updated if present and in some of the - * above cases (the ones where extent mft records/attributes come into play), - * an attribute list attribute is created if not already present. - * - * Return the new allocated size on success and -errno on error. In the case - * that an error is encountered but a partial extension at least up to - * @data_start (if present) is possible, the allocation is partially extended - * and this is returned. This means the caller must check the returned size to - * determine if the extension was partial. If @data_start is -1 then partial - * allocations are not performed. - * - * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA. 
- * - * Locking: This function takes the runlist lock of @ni for writing as well as - * locking the mft record of the base ntfs inode. These locks are maintained - * throughout execution of the function. These locks are required so that the - * attribute can be resized safely and so that it can for example be converted - * from resident to non-resident safely. - * - * TODO: At present attribute list attribute handling is not implemented. - * - * TODO: At present it is not safe to call this function for anything other - * than the $DATA attribute(s) of an uncompressed and unencrypted file. - */ -s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, - const s64 new_data_size, const s64 data_start) -{ - VCN vcn; - s64 ll, allocated_size, start = data_start; - struct inode *vi = VFS_I(ni); - ntfs_volume *vol = ni->vol; - ntfs_inode *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - runlist_element *rl, *rl2; - unsigned long flags; - int err, mp_size; - u32 attr_len = 0; /* Silence stupid gcc warning. */ - bool mp_rebuilt; - -#ifdef DEBUG - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " - "old_allocated_size 0x%llx, " - "new_allocated_size 0x%llx, new_data_size 0x%llx, " - "data_start 0x%llx.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (unsigned long long)allocated_size, - (unsigned long long)new_alloc_size, - (unsigned long long)new_data_size, - (unsigned long long)start); -#endif -retry_extend: - /* - * For non-resident attributes, @start and @new_size need to be aligned - * to cluster boundaries for allocation purposes. 
- */ - if (NInoNonResident(ni)) { - if (start > 0) - start &= ~(s64)vol->cluster_size_mask; - new_alloc_size = (new_alloc_size + vol->cluster_size - 1) & - ~(s64)vol->cluster_size_mask; - } - BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size); - /* Check if new size is allowed in $AttrDef. */ - err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size); - if (unlikely(err)) { - /* Only emit errors when the write will fail completely. */ - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (start < 0 || start >= allocated_size) { - if (err == -ERANGE) { - ntfs_error(vol->sb, "Cannot extend allocation " - "of inode 0x%lx, attribute " - "type 0x%x, because the new " - "allocation would exceed the " - "maximum allowed size for " - "this attribute type.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - } else { - ntfs_error(vol->sb, "Cannot extend allocation " - "of inode 0x%lx, attribute " - "type 0x%x, because this " - "attribute type is not " - "defined on the NTFS volume. " - "Possible corruption! You " - "should run chkdsk!", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - } - } - /* Translate error code to be POSIX conformant for write(2). */ - if (err == -ERANGE) - err = -EFBIG; - else - err = -EIO; - return err; - } - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* - * We will be modifying both the runlist (if non-resident) and the mft - * record so lock them both down. - */ - down_write(&ni->runlist.lock); - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - /* - * If non-resident, seek to the last extent. 
If resident, there is - * only one extent, so seek to that. - */ - vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits : - 0; - /* - * Abort if someone did the work whilst we waited for the locks. If we - * just converted the attribute from resident to non-resident it is - * likely that exactly this has happened already. We cannot quite - * abort if we need to update the data size. - */ - if (unlikely(new_alloc_size <= allocated_size)) { - ntfs_debug("Allocated size already exceeds requested size."); - new_alloc_size = allocated_size; - if (new_data_size < 0) - goto done; - /* - * We want the first attribute extent so that we can update the - * data size. - */ - vcn = 0; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, vcn, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - m = ctx->mrec; - a = ctx->attr; - /* Use goto to reduce indentation. */ - if (a->non_resident) - goto do_non_resident_extend; - BUG_ON(NInoNonResident(ni)); - /* The total length of the attribute value. */ - attr_len = le32_to_cpu(a->data.resident.value_length); - /* - * Extend the attribute record to be able to store the new attribute - * size. ntfs_attr_record_resize() will not do anything if the size is - * not changing. - */ - if (new_alloc_size < vol->mft_record_size && - !ntfs_attr_record_resize(m, a, - le16_to_cpu(a->data.resident.value_offset) + - new_alloc_size)) { - /* The resize succeeded! */ - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset); - write_unlock_irqrestore(&ni->size_lock, flags); - if (new_data_size >= 0) { - BUG_ON(new_data_size < attr_len); - a->data.resident.value_length = - cpu_to_le32((u32)new_data_size); - } - goto flush_done; - } - /* - * We have to drop all the locks so we can call - * ntfs_attr_make_non_resident(). 
This could be optimised by try- - * locking the first page cache page and only if that fails dropping - * the locks, locking the page, and redoing all the locking and - * lookups. While this would be a huge optimisation, it is not worth - * it as this is definitely a slow code path. - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - /* - * Not enough space in the mft record, try to make the attribute - * non-resident and if successful restart the extension process. - */ - err = ntfs_attr_make_non_resident(ni, attr_len); - if (likely(!err)) - goto retry_extend; - /* - * Could not make non-resident. If this is due to this not being - * permitted for this attribute type or there not being enough space, - * try to make other attributes non-resident. Otherwise fail. - */ - if (unlikely(err != -EPERM && err != -ENOSPC)) { - /* Only emit errors when the write will fail completely. */ - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because the conversion from resident " - "to non-resident attribute failed " - "with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err != -ENOMEM) - err = -EIO; - goto conv_err_out; - } - /* TODO: Not implemented from here, abort. */ - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (start < 0 || start >= allocated_size) { - if (err == -ENOSPC) - ntfs_error(vol->sb, "Not enough space in the mft " - "record/on disk for the non-resident " - "attribute value. This case is not " - "implemented yet."); - else /* if (err == -EPERM) */ - ntfs_error(vol->sb, "This attribute type may not be " - "non-resident. 
This case is not " - "implemented yet."); - } - err = -EOPNOTSUPP; - goto conv_err_out; -#if 0 - // TODO: Attempt to make other attributes non-resident. - if (!err) - goto do_resident_extend; - /* - * Both the attribute list attribute and the standard information - * attribute must remain in the base inode. Thus, if this is one of - * these attributes, we have to try to move other attributes out into - * extent mft records instead. - */ - if (ni->type == AT_ATTRIBUTE_LIST || - ni->type == AT_STANDARD_INFORMATION) { - // TODO: Attempt to move other attributes into extent mft - // records. - err = -EOPNOTSUPP; - if (!err) - goto do_resident_extend; - goto err_out; - } - // TODO: Attempt to move this attribute to an extent mft record, but - // only if it is not already the only attribute in an mft record in - // which case there would be nothing to gain. - err = -EOPNOTSUPP; - if (!err) - goto do_resident_extend; - /* There is nothing we can do to make enough space. )-: */ - goto err_out; -#endif -do_non_resident_extend: - BUG_ON(!NInoNonResident(ni)); - if (new_alloc_size == allocated_size) { - BUG_ON(vcn); - goto alloc_done; - } - /* - * If the data starts after the end of the old allocation, this is a - * $DATA attribute and sparse attributes are enabled on the volume and - * for this inode, then create a sparse region between the old - * allocated size and the start of the data. Otherwise simply proceed - * with filling the whole space between the old allocated size and the - * new allocated size with clusters. - */ - if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA || - !NVolSparseEnabled(vol) || NInoSparseDisabled(ni)) - goto skip_sparse; - // TODO: This is not implemented yet. We just fill in with real - // clusters for now... - ntfs_debug("Inserting holes is not-implemented yet. Falling back to " - "allocating real clusters instead."); -skip_sparse: - rl = ni->runlist.rl; - if (likely(rl)) { - /* Seek to the end of the runlist. 
*/ - while (rl->length) - rl++; - } - /* If this attribute extent is not mapped, map it now. */ - if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED || - (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl && - (rl-1)->lcn == LCN_RL_NOT_MAPPED))) { - if (!rl && !allocated_size) - goto first_alloc; - rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation " - "of inode 0x%lx, attribute " - "type 0x%x, because the " - "mapping of a runlist " - "fragment failed with error " - "code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - err); - if (err != -ENOMEM) - err = -EIO; - goto err_out; - } - ni->runlist.rl = rl; - /* Seek to the end of the runlist. */ - while (rl->length) - rl++; - } - /* - * We now know the runlist of the last extent is mapped and @rl is at - * the end of the runlist. We want to begin allocating clusters - * starting at the last allocated cluster to reduce fragmentation. If - * there are no valid LCNs in the attribute we let the cluster - * allocator choose the starting cluster. - */ - /* If the last LCN is a hole or simillar seek back to last real LCN. */ - while (rl->lcn < 0 && rl > ni->runlist.rl) - rl--; -first_alloc: - // FIXME: Need to implement partial allocations so at least part of the - // write can be performed when start >= 0. (Needed for POSIX write(2) - // conformance.) - rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits, - (new_alloc_size - allocated_size) >> - vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ? 
- rl->lcn + rl->length : -1, DATA_ZONE, true); - if (IS_ERR(rl2)) { - err = PTR_ERR(rl2); - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because the allocation of clusters " - "failed with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err != -ENOMEM && err != -ENOSPC) - err = -EIO; - goto err_out; - } - rl = ntfs_runlists_merge(ni->runlist.rl, rl2); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because the runlist merge failed " - "with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err != -ENOMEM) - err = -EIO; - if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to release allocated " - "cluster(s) in error code path. Run " - "chkdsk to recover the lost " - "cluster(s)."); - NVolSetErrors(vol); - } - ntfs_free(rl2); - goto err_out; - } - ni->runlist.rl = rl; - ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size - - allocated_size) >> vol->cluster_size_bits); - /* Find the runlist element with which the attribute extent starts. */ - ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); - rl2 = ntfs_rl_find_vcn_nolock(rl, ll); - BUG_ON(!rl2); - BUG_ON(!rl2->length); - BUG_ON(rl2->lcn < LCN_HOLE); - mp_rebuilt = false; - /* Get the size for the new mapping pairs array for this extent. 
*/ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); - if (unlikely(mp_size <= 0)) { - err = mp_size; - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because determining the size for the " - "mapping pairs failed with error code " - "%i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - err = -EIO; - goto undo_alloc; - } - /* Extend the attribute record to fit the bigger mapping pairs array. */ - attr_len = le32_to_cpu(a->length); - err = ntfs_attr_record_resize(m, a, mp_size + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); - if (unlikely(err)) { - BUG_ON(err != -ENOSPC); - // TODO: Deal with this by moving this extent to a new mft - // record or by starting a new extent in a new mft record, - // possibly by extending this extent partially and filling it - // and creating a new extent for the remainder, or by making - // other attributes non-resident and/or by moving other - // attributes out of this mft record. - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Not enough space in the mft " - "record for the extended attribute " - "record. This case is not " - "implemented yet."); - err = -EOPNOTSUPP; - goto undo_alloc; - } - mp_rebuilt = true; - /* Generate the mapping pairs array directly into the attr record. */ - err = ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, ll, -1, NULL); - if (unlikely(err)) { - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because building the mapping pairs " - "failed with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - err = -EIO; - goto undo_alloc; - } - /* Update the highest_vcn. 
*/ - a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> - vol->cluster_size_bits) - 1); - /* - * We now have extended the allocated size of the attribute. Reflect - * this in the ntfs_inode structure and the attribute record. - */ - if (a->data.non_resident.lowest_vcn) { - /* - * We are not in the first attribute extent, switch to it, but - * first ensure the changes will make it to disk later. - */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) - goto restore_undo_alloc; - /* @m is not used any more so no need to set it. */ - a = ctx->attr; - } - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = new_alloc_size; - a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); - /* - * FIXME: This would fail if @ni is a directory, $MFT, or an index, - * since those can have sparse/compressed set. For example can be - * set compressed even though it is not compressed itself and in that - * case the bit means that files are to be created compressed in the - * directory... At present this is ok as this code is only called for - * regular files, and only for their $DATA attribute(s). - * FIXME: The calculation is wrong if we created a hole above. For now - * it does not matter as we never create holes. 
- */ - if (NInoSparse(ni) || NInoCompressed(ni)) { - ni->itype.compressed.size += new_alloc_size - allocated_size; - a->data.non_resident.compressed_size = - cpu_to_sle64(ni->itype.compressed.size); - vi->i_blocks = ni->itype.compressed.size >> 9; - } else - vi->i_blocks = new_alloc_size >> 9; - write_unlock_irqrestore(&ni->size_lock, flags); -alloc_done: - if (new_data_size >= 0) { - BUG_ON(new_data_size < - sle64_to_cpu(a->data.non_resident.data_size)); - a->data.non_resident.data_size = cpu_to_sle64(new_data_size); - } -flush_done: - /* Ensure the changes make it to disk. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); -done: - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - ntfs_debug("Done, new_allocated_size 0x%llx.", - (unsigned long long)new_alloc_size); - return new_alloc_size; -restore_undo_alloc: - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot complete extension of allocation " - "of inode 0x%lx, attribute type 0x%x, because " - "lookup of first attribute extent failed with " - "error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err == -ENOENT) - err = -EIO; - ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, - allocated_size >> vol->cluster_size_bits, NULL, 0, - ctx)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "attribute in error code path. Run chkdsk to " - "recover."); - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = new_alloc_size; - /* - * FIXME: This would fail if @ni is a directory... See above. - * FIXME: The calculation is wrong if we created a hole above. - * For now it does not matter as we never create holes. 
- */ - if (NInoSparse(ni) || NInoCompressed(ni)) { - ni->itype.compressed.size += new_alloc_size - - allocated_size; - vi->i_blocks = ni->itype.compressed.size >> 9; - } else - vi->i_blocks = new_alloc_size >> 9; - write_unlock_irqrestore(&ni->size_lock, flags); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - /* - * The only thing that is now wrong is the allocated size of the - * base attribute extent which chkdsk should be able to fix. - */ - NVolSetErrors(vol); - return err; - } - ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64( - (allocated_size >> vol->cluster_size_bits) - 1); -undo_alloc: - ll = allocated_size >> vol->cluster_size_bits; - if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) { - ntfs_error(vol->sb, "Failed to release allocated cluster(s) " - "in error code path. Run chkdsk to recover " - "the lost cluster(s)."); - NVolSetErrors(vol); - } - m = ctx->mrec; - a = ctx->attr; - /* - * If the runlist truncation fails and/or the search context is no - * longer valid, we cannot resize the attribute record or build the - * mapping pairs array thus we mark the inode bad so that no access to - * the freed clusters can happen. - */ - if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) { - ntfs_error(vol->sb, "Failed to %s in error code path. Run " - "chkdsk to recover.", IS_ERR(m) ? - "restore attribute search context" : - "truncate attribute runlist"); - NVolSetErrors(vol); - } else if (mp_rebuilt) { - if (ntfs_attr_record_resize(m, a, attr_len)) { - ntfs_error(vol->sb, "Failed to restore attribute " - "record in error code path. Run " - "chkdsk to recover."); - NVolSetErrors(vol); - } else /* if (success) */ { - if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( - a->data.non_resident. - mapping_pairs_offset), attr_len - - le16_to_cpu(a->data.non_resident. 
- mapping_pairs_offset), rl2, ll, -1, - NULL)) { - ntfs_error(vol->sb, "Failed to restore " - "mapping pairs array in error " - "code path. Run chkdsk to " - "recover."); - NVolSetErrors(vol); - } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } - } -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); -conv_err_out: - ntfs_debug("Failed. Returning error code %i.", err); - return err; -} - -/** - * ntfs_attr_set - fill (a part of) an attribute with a byte - * @ni: ntfs inode describing the attribute to fill - * @ofs: offset inside the attribute at which to start to fill - * @cnt: number of bytes to fill - * @val: the unsigned 8-bit value with which to fill the attribute - * - * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at - * byte offset @ofs inside the attribute with the constant byte @val. - * - * This function is effectively like memset() applied to an ntfs attribute. - * Note this function actually only operates on the page cache pages belonging - * to the ntfs attribute and it marks them dirty after doing the memset(). - * Thus it relies on the vm dirty page write code paths to cause the modified - * pages to be written to the mft record/disk. - * - * Return 0 on success and -errno on error. An error code of -ESPIPE means - * that @ofs + @cnt were outside the end of the attribute and no write was - * performed. 
- */ -int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) -{ - ntfs_volume *vol = ni->vol; - struct address_space *mapping; - struct page *page; - u8 *kaddr; - pgoff_t idx, end; - unsigned start_ofs, end_ofs, size; - - ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.", - (long long)ofs, (long long)cnt, val); - BUG_ON(ofs < 0); - BUG_ON(cnt < 0); - if (!cnt) - goto done; - /* - * FIXME: Compressed and encrypted attributes are not supported when - * writing and we should never have gotten here for them. - */ - BUG_ON(NInoCompressed(ni)); - BUG_ON(NInoEncrypted(ni)); - mapping = VFS_I(ni)->i_mapping; - /* Work out the starting index and page offset. */ - idx = ofs >> PAGE_SHIFT; - start_ofs = ofs & ~PAGE_MASK; - /* Work out the ending index and page offset. */ - end = ofs + cnt; - end_ofs = end & ~PAGE_MASK; - /* If the end is outside the inode size return -ESPIPE. */ - if (unlikely(end > i_size_read(VFS_I(ni)))) { - ntfs_error(vol->sb, "Request exceeds end of attribute."); - return -ESPIPE; - } - end >>= PAGE_SHIFT; - /* If there is a first partial page, need to do it the slow way. */ - if (start_ofs) { - page = read_mapping_page(mapping, idx, NULL); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read first partial " - "page (error, index 0x%lx).", idx); - return PTR_ERR(page); - } - /* - * If the last page is the same as the first page, need to - * limit the write to the end offset. - */ - size = PAGE_SIZE; - if (idx == end) - size = end_ofs; - kaddr = kmap_atomic(page); - memset(kaddr + start_ofs, val, size - start_ofs); - flush_dcache_page(page); - kunmap_atomic(kaddr); - set_page_dirty(page); - put_page(page); - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - if (idx == end) - goto done; - idx++; - } - /* Do the whole pages the fast way. */ - for (; idx < end; idx++) { - /* Find or create the current page. (The page is locked.) 
*/ - page = grab_cache_page(mapping, idx); - if (unlikely(!page)) { - ntfs_error(vol->sb, "Insufficient memory to grab " - "page (index 0x%lx).", idx); - return -ENOMEM; - } - kaddr = kmap_atomic(page); - memset(kaddr, val, PAGE_SIZE); - flush_dcache_page(page); - kunmap_atomic(kaddr); - /* - * If the page has buffers, mark them uptodate since buffer - * state and not page state is definitive in 2.6 kernels. - */ - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; - - bh = head = page_buffers(page); - do { - set_buffer_uptodate(bh); - } while ((bh = bh->b_this_page) != head); - } - /* Now that buffers are uptodate, set the page uptodate, too. */ - SetPageUptodate(page); - /* - * Set the page and all its buffers dirty and mark the inode - * dirty, too. The VM will write the page later on. - */ - set_page_dirty(page); - /* Finally unlock and release the page. */ - unlock_page(page); - put_page(page); - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - } - /* If there is a last partial page, need to do it the slow way. */ - if (end_ofs) { - page = read_mapping_page(mapping, idx, NULL); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read last partial page " - "(error, index 0x%lx).", idx); - return PTR_ERR(page); - } - kaddr = kmap_atomic(page); - memset(kaddr, val, end_ofs); - flush_dcache_page(page); - kunmap_atomic(kaddr); - set_page_dirty(page); - put_page(page); - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - } -done: - ntfs_debug("Done."); - return 0; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h deleted file mode 100644 index fe0890d3d072..000000000000 --- a/fs/ntfs/attrib.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * attrib.h - Defines for attribute handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. 
- * - * Copyright (c) 2001-2005 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_ATTRIB_H -#define _LINUX_NTFS_ATTRIB_H - -#include "endian.h" -#include "types.h" -#include "layout.h" -#include "inode.h" -#include "runlist.h" -#include "volume.h" - -/** - * ntfs_attr_search_ctx - used in attribute search functions - * @mrec: buffer containing mft record to search - * @attr: attribute record in @mrec where to begin/continue search - * @is_first: if true ntfs_attr_lookup() begins search with @attr, else after - * - * Structure must be initialized to zero before the first call to one of the - * attribute search functions. Initialize @mrec to point to the mft record to - * search, and @attr to point to the first attribute within @mrec (not necessary - * if calling the _first() functions), and set @is_first to 'true' (not necessary - * if calling the _first() functions). - * - * If @is_first is 'true', the search begins with @attr. If @is_first is 'false', - * the search begins after @attr. This is so that, after the first call to one - * of the search attribute functions, we can call the function again, without - * any modification of the search context, to automagically get the next - * matching attribute. 
- */ -typedef struct { - MFT_RECORD *mrec; - ATTR_RECORD *attr; - bool is_first; - ntfs_inode *ntfs_ino; - ATTR_LIST_ENTRY *al_entry; - ntfs_inode *base_ntfs_ino; - MFT_RECORD *base_mrec; - ATTR_RECORD *base_attr; -} ntfs_attr_search_ctx; - -extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, - ntfs_attr_search_ctx *ctx); -extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn); - -extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, - const bool write_locked); - -extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, - const VCN vcn, ntfs_attr_search_ctx *ctx); - -int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, - const u32 name_len, const IGNORE_CASE_BOOL ic, - const VCN lowest_vcn, const u8 *val, const u32 val_len, - ntfs_attr_search_ctx *ctx); - -extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start, - const s64 size, const s64 initialized_size); - -static inline s64 ntfs_attr_size(const ATTR_RECORD *a) -{ - if (!a->non_resident) - return (s64)le32_to_cpu(a->data.resident.value_length); - return sle64_to_cpu(a->data.non_resident.data_size); -} - -extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx); -extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, - MFT_RECORD *mrec); -extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx); - -#ifdef NTFS_RW - -extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol, - const ATTR_TYPE type, const s64 size); -extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, - const ATTR_TYPE type); -extern int ntfs_attr_can_be_resident(const ntfs_volume *vol, - const ATTR_TYPE type); - -extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size); -extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, - const u32 new_size); - -extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size); - -extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 
new_alloc_size, - const s64 new_data_size, const s64 data_start); - -extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, - const u8 val); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_ATTRIB_H */ diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c deleted file mode 100644 index 0675b2400873..000000000000 --- a/fs/ntfs/bitmap.c +++ /dev/null @@ -1,179 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * bitmap.c - NTFS kernel bitmap handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2004-2005 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include <linux/pagemap.h> - -#include "bitmap.h" -#include "debug.h" -#include "aops.h" -#include "ntfs.h" - -/** - * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value - * @vi: vfs inode describing the bitmap - * @start_bit: first bit to set - * @count: number of bits to set - * @value: value to set the bits to (i.e. 0 or 1) - * @is_rollback: if 'true' this is a rollback operation - * - * Set @count bits starting at bit @start_bit in the bitmap described by the - * vfs inode @vi to @value, where @value is either 0 or 1. - * - * @is_rollback should always be 'false', it is for internal use to rollback - * errors. You probably want to use ntfs_bitmap_set_bits_in_run() instead. - * - * Return 0 on success and -errno on error. - */ -int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, - const s64 count, const u8 value, const bool is_rollback) -{ - s64 cnt = count; - pgoff_t index, end_index; - struct address_space *mapping; - struct page *page; - u8 *kaddr; - int pos, len; - u8 bit; - - BUG_ON(!vi); - ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, " - "value %u.%s", vi->i_ino, (unsigned long long)start_bit, - (unsigned long long)cnt, (unsigned int)value, - is_rollback ? 
" (rollback)" : ""); - BUG_ON(start_bit < 0); - BUG_ON(cnt < 0); - BUG_ON(value > 1); - /* - * Calculate the indices for the pages containing the first and last - * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively. - */ - index = start_bit >> (3 + PAGE_SHIFT); - end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT); - - /* Get the page containing the first bit (@start_bit). */ - mapping = vi->i_mapping; - page = ntfs_map_page(mapping, index); - if (IS_ERR(page)) { - if (!is_rollback) - ntfs_error(vi->i_sb, "Failed to map first page (error " - "%li), aborting.", PTR_ERR(page)); - return PTR_ERR(page); - } - kaddr = page_address(page); - - /* Set @pos to the position of the byte containing @start_bit. */ - pos = (start_bit >> 3) & ~PAGE_MASK; - - /* Calculate the position of @start_bit in the first byte. */ - bit = start_bit & 7; - - /* If the first byte is partial, modify the appropriate bits in it. */ - if (bit) { - u8 *byte = kaddr + pos; - while ((bit & 7) && cnt) { - cnt--; - if (value) - *byte |= 1 << bit++; - else - *byte &= ~(1 << bit++); - } - /* If we are done, unmap the page and return success. */ - if (!cnt) - goto done; - - /* Update @pos to the new position. */ - pos++; - } - /* - * Depending on @value, modify all remaining whole bytes in the page up - * to @cnt. - */ - len = min_t(s64, cnt >> 3, PAGE_SIZE - pos); - memset(kaddr + pos, value ? 0xff : 0, len); - cnt -= len << 3; - - /* Update @len to point to the first not-done byte in the page. */ - if (cnt < 8) - len += pos; - - /* If we are not in the last page, deal with all subsequent pages. */ - while (index < end_index) { - BUG_ON(cnt <= 0); - - /* Update @index and get the next page. */ - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); - page = ntfs_map_page(mapping, ++index); - if (IS_ERR(page)) - goto rollback; - kaddr = page_address(page); - /* - * Depending on @value, modify all remaining whole bytes in the - * page up to @cnt. 
- */ - len = min_t(s64, cnt >> 3, PAGE_SIZE); - memset(kaddr, value ? 0xff : 0, len); - cnt -= len << 3; - } - /* - * The currently mapped page is the last one. If the last byte is - * partial, modify the appropriate bits in it. Note, @len is the - * position of the last byte inside the page. - */ - if (cnt) { - u8 *byte; - - BUG_ON(cnt > 7); - - bit = cnt; - byte = kaddr + len; - while (bit--) { - if (value) - *byte |= 1 << bit; - else - *byte &= ~(1 << bit); - } - } -done: - /* We are done. Unmap the page and return success. */ - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); - ntfs_debug("Done."); - return 0; -rollback: - /* - * Current state: - * - no pages are mapped - * - @count - @cnt is the number of bits that have been modified - */ - if (is_rollback) - return PTR_ERR(page); - if (count != cnt) - pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt, - value ? 0 : 1, true); - else - pos = 0; - if (!pos) { - /* Rollback was successful. */ - ntfs_error(vi->i_sb, "Failed to map subsequent page (error " - "%li), aborting.", PTR_ERR(page)); - } else { - /* Rollback failed. */ - ntfs_error(vi->i_sb, "Failed to map subsequent page (error " - "%li) and rollback failed (error %i). " - "Aborting and leaving inconsistent metadata. " - "Unmount and run chkdsk.", PTR_ERR(page), pos); - NVolSetErrors(NTFS_SB(vi->i_sb)); - } - return PTR_ERR(page); -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h deleted file mode 100644 index 9dd2224ca9c4..000000000000 --- a/fs/ntfs/bitmap.h +++ /dev/null @@ -1,104 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * bitmap.h - Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS - * project. 
- * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_BITMAP_H -#define _LINUX_NTFS_BITMAP_H - -#ifdef NTFS_RW - -#include <linux/fs.h> - -#include "types.h" - -extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, - const s64 count, const u8 value, const bool is_rollback); - -/** - * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value - * @vi: vfs inode describing the bitmap - * @start_bit: first bit to set - * @count: number of bits to set - * @value: value to set the bits to (i.e. 0 or 1) - * - * Set @count bits starting at bit @start_bit in the bitmap described by the - * vfs inode @vi to @value, where @value is either 0 or 1. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi, - const s64 start_bit, const s64 count, const u8 value) -{ - return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value, - false); -} - -/** - * ntfs_bitmap_set_run - set a run of bits in a bitmap - * @vi: vfs inode describing the bitmap - * @start_bit: first bit to set - * @count: number of bits to set - * - * Set @count bits starting at bit @start_bit in the bitmap described by the - * vfs inode @vi. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit, - const s64 count) -{ - return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1); -} - -/** - * ntfs_bitmap_clear_run - clear a run of bits in a bitmap - * @vi: vfs inode describing the bitmap - * @start_bit: first bit to clear - * @count: number of bits to clear - * - * Clear @count bits starting at bit @start_bit in the bitmap described by the - * vfs inode @vi. - * - * Return 0 on success and -errno on error. 
- */ -static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit, - const s64 count) -{ - return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0); -} - -/** - * ntfs_bitmap_set_bit - set a bit in a bitmap - * @vi: vfs inode describing the bitmap - * @bit: bit to set - * - * Set bit @bit in the bitmap described by the vfs inode @vi. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit) -{ - return ntfs_bitmap_set_run(vi, bit, 1); -} - -/** - * ntfs_bitmap_clear_bit - clear a bit in a bitmap - * @vi: vfs inode describing the bitmap - * @bit: bit to clear - * - * Clear bit @bit in the bitmap described by the vfs inode @vi. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit) -{ - return ntfs_bitmap_clear_run(vi, bit, 1); -} - -#endif /* NTFS_RW */ - -#endif /* defined _LINUX_NTFS_BITMAP_H */ diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c deleted file mode 100644 index 3ab6ec96abfe..000000000000 --- a/fs/ntfs/collate.c +++ /dev/null @@ -1,110 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * collate.c - NTFS kernel collation handling. Part of the Linux-NTFS project. 
- * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#include "collate.h" -#include "debug.h" -#include "ntfs.h" - -static int ntfs_collate_binary(ntfs_volume *vol, - const void *data1, const int data1_len, - const void *data2, const int data2_len) -{ - int rc; - - ntfs_debug("Entering."); - rc = memcmp(data1, data2, min(data1_len, data2_len)); - if (!rc && (data1_len != data2_len)) { - if (data1_len < data2_len) - rc = -1; - else - rc = 1; - } - ntfs_debug("Done, returning %i", rc); - return rc; -} - -static int ntfs_collate_ntofs_ulong(ntfs_volume *vol, - const void *data1, const int data1_len, - const void *data2, const int data2_len) -{ - int rc; - u32 d1, d2; - - ntfs_debug("Entering."); - // FIXME: We don't really want to bug here. - BUG_ON(data1_len != data2_len); - BUG_ON(data1_len != 4); - d1 = le32_to_cpup(data1); - d2 = le32_to_cpup(data2); - if (d1 < d2) - rc = -1; - else { - if (d1 == d2) - rc = 0; - else - rc = 1; - } - ntfs_debug("Done, returning %i", rc); - return rc; -} - -typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int, - const void *, const int); - -static ntfs_collate_func_t ntfs_do_collate0x0[3] = { - ntfs_collate_binary, - NULL/*ntfs_collate_file_name*/, - NULL/*ntfs_collate_unicode_string*/, -}; - -static ntfs_collate_func_t ntfs_do_collate0x1[4] = { - ntfs_collate_ntofs_ulong, - NULL/*ntfs_collate_ntofs_sid*/, - NULL/*ntfs_collate_ntofs_security_hash*/, - NULL/*ntfs_collate_ntofs_ulongs*/, -}; - -/** - * ntfs_collate - collate two data items using a specified collation rule - * @vol: ntfs volume to which the data items belong - * @cr: collation rule to use when comparing the items - * @data1: first data item to collate - * @data1_len: length in bytes of @data1 - * @data2: second data item to collate - * @data2_len: length in bytes of @data2 - * - * Collate the two data items @data1 and @data2 using the collation rule @cr - * and return -1, 0, ir 1 if @data1 is found, respectively, to collate before, - * to match, 
or to collate after @data2. - * - * For speed we use the collation rule @cr as an index into two tables of - * function pointers to call the appropriate collation function. - */ -int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr, - const void *data1, const int data1_len, - const void *data2, const int data2_len) { - int i; - - ntfs_debug("Entering."); - /* - * FIXME: At the moment we only support COLLATION_BINARY and - * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now. - */ - BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG); - i = le32_to_cpu(cr); - BUG_ON(i < 0); - if (i <= 0x02) - return ntfs_do_collate0x0[i](vol, data1, data1_len, - data2, data2_len); - BUG_ON(i < 0x10); - i -= 0x10; - if (likely(i <= 3)) - return ntfs_do_collate0x1[i](vol, data1, data1_len, - data2, data2_len); - BUG(); - return 0; -} diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h deleted file mode 100644 index f2255619b4f4..000000000000 --- a/fs/ntfs/collate.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * collate.h - Defines for NTFS kernel collation handling. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_COLLATE_H -#define _LINUX_NTFS_COLLATE_H - -#include "types.h" -#include "volume.h" - -static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) { - int i; - - /* - * FIXME: At the moment we only support COLLATION_BINARY and - * COLLATION_NTOFS_ULONG, so we return false for everything else for - * now. 
- */ - if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG)) - return false; - i = le32_to_cpu(cr); - if (likely(((i >= 0) && (i <= 0x02)) || - ((i >= 0x10) && (i <= 0x13)))) - return true; - return false; -} - -extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr, - const void *data1, const int data1_len, - const void *data2, const int data2_len); - -#endif /* _LINUX_NTFS_COLLATE_H */ diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c deleted file mode 100644 index 761aaa0195d6..000000000000 --- a/fs/ntfs/compress.c +++ /dev/null @@ -1,950 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * compress.c - NTFS kernel compressed attributes handling. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#include <linux/fs.h> -#include <linux/buffer_head.h> -#include <linux/blkdev.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> - -#include "attrib.h" -#include "inode.h" -#include "debug.h" -#include "ntfs.h" - -/** - * ntfs_compression_constants - enum of constants used in the compression code - */ -typedef enum { - /* Token types and access mask. */ - NTFS_SYMBOL_TOKEN = 0, - NTFS_PHRASE_TOKEN = 1, - NTFS_TOKEN_MASK = 1, - - /* Compression sub-block constants. */ - NTFS_SB_SIZE_MASK = 0x0fff, - NTFS_SB_SIZE = 0x1000, - NTFS_SB_IS_COMPRESSED = 0x8000, - - /* - * The maximum compression block size is by definition 16 * the cluster - * size, with the maximum supported cluster size being 4kiB. Thus the - * maximum compression buffer size is 64kiB, so we use this when - * initializing the compression buffer. 
- */ - NTFS_MAX_CB_SIZE = 64 * 1024, -} ntfs_compression_constants; - -/* - * ntfs_compression_buffer - one buffer for the decompression engine - */ -static u8 *ntfs_compression_buffer; - -/* - * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer - */ -static DEFINE_SPINLOCK(ntfs_cb_lock); - -/** - * allocate_compression_buffers - allocate the decompression buffers - * - * Caller has to hold the ntfs_lock mutex. - * - * Return 0 on success or -ENOMEM if the allocations failed. - */ -int allocate_compression_buffers(void) -{ - BUG_ON(ntfs_compression_buffer); - - ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE); - if (!ntfs_compression_buffer) - return -ENOMEM; - return 0; -} - -/** - * free_compression_buffers - free the decompression buffers - * - * Caller has to hold the ntfs_lock mutex. - */ -void free_compression_buffers(void) -{ - BUG_ON(!ntfs_compression_buffer); - vfree(ntfs_compression_buffer); - ntfs_compression_buffer = NULL; -} - -/** - * zero_partial_compressed_page - zero out of bounds compressed page region - */ -static void zero_partial_compressed_page(struct page *page, - const s64 initialized_size) -{ - u8 *kp = page_address(page); - unsigned int kp_ofs; - - ntfs_debug("Zeroing page region outside initialized size."); - if (((s64)page->index << PAGE_SHIFT) >= initialized_size) { - clear_page(kp); - return; - } - kp_ofs = initialized_size & ~PAGE_MASK; - memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs); - return; -} - -/** - * handle_bounds_compressed_page - test for&handle out of bounds compressed page - */ -static inline void handle_bounds_compressed_page(struct page *page, - const loff_t i_size, const s64 initialized_size) -{ - if ((page->index >= (initialized_size >> PAGE_SHIFT)) && - (initialized_size < i_size)) - zero_partial_compressed_page(page, initialized_size); - return; -} - -/** - * ntfs_decompress - decompress a compression block into an array of pages - * @dest_pages: destination array of pages - * @completed_pages: scratch 
space to track completed pages - * @dest_index: current index into @dest_pages (IN/OUT) - * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT) - * @dest_max_index: maximum index into @dest_pages (IN) - * @dest_max_ofs: maximum offset within @dest_pages[@dest_max_index] (IN) - * @xpage: the target page (-1 if none) (IN) - * @xpage_done: set to 1 if xpage was completed successfully (IN/OUT) - * @cb_start: compression block to decompress (IN) - * @cb_size: size of compression block @cb_start in bytes (IN) - * @i_size: file size when we started the read (IN) - * @initialized_size: initialized file size when we started the read (IN) - * - * The caller must have disabled preemption. ntfs_decompress() reenables it when - * the critical section is finished. - * - * This decompresses the compression block @cb_start into the array of - * destination pages @dest_pages starting at index @dest_index into @dest_pages - * and at offset @dest_pos into the page @dest_pages[@dest_index]. - * - * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1. - * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified. - * - * @cb_start is a pointer to the compression block which needs decompressing - * and @cb_size is the size of @cb_start in bytes (8-64kiB). - * - * Return 0 if success or -EOVERFLOW on error in the compressed stream. - * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was - * completed during the decompression of the compression block (@cb_start). - * - * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up - * unpredicatbly! You have been warned! - * - * Note to hackers: This function may not sleep until it has finished accessing - * the compression block @cb_start as it is a per-CPU buffer. 
- */ -static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], - int *dest_index, int *dest_ofs, const int dest_max_index, - const int dest_max_ofs, const int xpage, char *xpage_done, - u8 *const cb_start, const u32 cb_size, const loff_t i_size, - const s64 initialized_size) -{ - /* - * Pointers into the compressed data, i.e. the compression block (cb), - * and the therein contained sub-blocks (sb). - */ - u8 *cb_end = cb_start + cb_size; /* End of cb. */ - u8 *cb = cb_start; /* Current position in cb. */ - u8 *cb_sb_start; /* Beginning of the current sb in the cb. */ - u8 *cb_sb_end; /* End of current sb / beginning of next sb. */ - - /* Variables for uncompressed data / destination. */ - struct page *dp; /* Current destination page being worked on. */ - u8 *dp_addr; /* Current pointer into dp. */ - u8 *dp_sb_start; /* Start of current sub-block in dp. */ - u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start + - NTFS_SB_SIZE). */ - u16 do_sb_start; /* @dest_ofs when starting this sub-block. */ - u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start + - NTFS_SB_SIZE). */ - - /* Variables for tag and token parsing. */ - u8 tag; /* Current tag. */ - int token; /* Loop counter for the eight tokens in tag. */ - int nr_completed_pages = 0; - - /* Default error code. */ - int err = -EOVERFLOW; - - ntfs_debug("Entering, cb_size = 0x%x.", cb_size); -do_next_sb: - ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.", - cb - cb_start); - /* - * Have we reached the end of the compression block or the end of the - * decompressed data? The latter can happen for example if the current - * position in the compression block is one byte before its end so the - * first two checks do not detect it. - */ - if (cb == cb_end || !le16_to_cpup((le16*)cb) || - (*dest_index == dest_max_index && - *dest_ofs == dest_max_ofs)) { - int i; - - ntfs_debug("Completed. 
Returning success (0)."); - err = 0; -return_error: - /* We can sleep from now on, so we drop lock. */ - spin_unlock(&ntfs_cb_lock); - /* Second stage: finalize completed pages. */ - if (nr_completed_pages > 0) { - for (i = 0; i < nr_completed_pages; i++) { - int di = completed_pages[i]; - - dp = dest_pages[di]; - /* - * If we are outside the initialized size, zero - * the out of bounds page range. - */ - handle_bounds_compressed_page(dp, i_size, - initialized_size); - flush_dcache_page(dp); - kunmap(dp); - SetPageUptodate(dp); - unlock_page(dp); - if (di == xpage) - *xpage_done = 1; - else - put_page(dp); - dest_pages[di] = NULL; - } - } - return err; - } - - /* Setup offsets for the current sub-block destination. */ - do_sb_start = *dest_ofs; - do_sb_end = do_sb_start + NTFS_SB_SIZE; - - /* Check that we are still within allowed boundaries. */ - if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs) - goto return_overflow; - - /* Does the minimum size of a compressed sb overflow valid range? */ - if (cb + 6 > cb_end) - goto return_overflow; - - /* Setup the current sub-block source pointers and validate range. */ - cb_sb_start = cb; - cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK) - + 3; - if (cb_sb_end > cb_end) - goto return_overflow; - - /* Get the current destination page. */ - dp = dest_pages[*dest_index]; - if (!dp) { - /* No page present. Skip decompression of this sub-block. */ - cb = cb_sb_end; - - /* Advance destination position to next sub-block. */ - *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK; - if (!*dest_ofs && (++*dest_index > dest_max_index)) - goto return_overflow; - goto do_next_sb; - } - - /* We have a valid destination page. Setup the destination pointers. */ - dp_addr = (u8*)page_address(dp) + do_sb_start; - - /* Now, we are ready to process the current sub-block (sb). 
*/ - if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) { - ntfs_debug("Found uncompressed sub-block."); - /* This sb is not compressed, just copy it into destination. */ - - /* Advance source position to first data byte. */ - cb += 2; - - /* An uncompressed sb must be full size. */ - if (cb_sb_end - cb != NTFS_SB_SIZE) - goto return_overflow; - - /* Copy the block and advance the source position. */ - memcpy(dp_addr, cb, NTFS_SB_SIZE); - cb += NTFS_SB_SIZE; - - /* Advance destination position to next sub-block. */ - *dest_ofs += NTFS_SB_SIZE; - if (!(*dest_ofs &= ~PAGE_MASK)) { -finalize_page: - /* - * First stage: add current page index to array of - * completed pages. - */ - completed_pages[nr_completed_pages++] = *dest_index; - if (++*dest_index > dest_max_index) - goto return_overflow; - } - goto do_next_sb; - } - ntfs_debug("Found compressed sub-block."); - /* This sb is compressed, decompress it into destination. */ - - /* Setup destination pointers. */ - dp_sb_start = dp_addr; - dp_sb_end = dp_sb_start + NTFS_SB_SIZE; - - /* Forward to the first tag in the sub-block. */ - cb += 2; -do_next_tag: - if (cb == cb_sb_end) { - /* Check if the decompressed sub-block was not full-length. */ - if (dp_addr < dp_sb_end) { - int nr_bytes = do_sb_end - *dest_ofs; - - ntfs_debug("Filling incomplete sub-block with " - "zeroes."); - /* Zero remainder and update destination position. */ - memset(dp_addr, 0, nr_bytes); - *dest_ofs += nr_bytes; - } - /* We have finished the current sub-block. */ - if (!(*dest_ofs &= ~PAGE_MASK)) - goto finalize_page; - goto do_next_sb; - } - - /* Check we are still in range. */ - if (cb > cb_sb_end || dp_addr > dp_sb_end) - goto return_overflow; - - /* Get the next tag and advance to first token. */ - tag = *cb++; - - /* Parse the eight tokens described by the tag. 
*/ - for (token = 0; token < 8; token++, tag >>= 1) { - u16 lg, pt, length, max_non_overlap; - register u16 i; - u8 *dp_back_addr; - - /* Check if we are done / still in range. */ - if (cb >= cb_sb_end || dp_addr > dp_sb_end) - break; - - /* Determine token type and parse appropriately.*/ - if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) { - /* - * We have a symbol token, copy the symbol across, and - * advance the source and destination positions. - */ - *dp_addr++ = *cb++; - ++*dest_ofs; - - /* Continue with the next token. */ - continue; - } - - /* - * We have a phrase token. Make sure it is not the first tag in - * the sb as this is illegal and would confuse the code below. - */ - if (dp_addr == dp_sb_start) - goto return_overflow; - - /* - * Determine the number of bytes to go back (p) and the number - * of bytes to copy (l). We use an optimized algorithm in which - * we first calculate log2(current destination position in sb), - * which allows determination of l and p in O(1) rather than - * O(n). We just need an arch-optimized log2() function now. - */ - lg = 0; - for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1) - lg++; - - /* Get the phrase token into i. */ - pt = le16_to_cpup((le16*)cb); - - /* - * Calculate starting position of the byte sequence in - * the destination using the fact that p = (pt >> (12 - lg)) + 1 - * and make sure we don't go too far back. - */ - dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1; - if (dp_back_addr < dp_sb_start) - goto return_overflow; - - /* Now calculate the length of the byte sequence. */ - length = (pt & (0xfff >> lg)) + 3; - - /* Advance destination position and verify it is in range. */ - *dest_ofs += length; - if (*dest_ofs > do_sb_end) - goto return_overflow; - - /* The number of non-overlapping bytes. */ - max_non_overlap = dp_addr - dp_back_addr; - - if (length <= max_non_overlap) { - /* The byte sequence doesn't overlap, just copy it. 
*/ - memcpy(dp_addr, dp_back_addr, length); - - /* Advance destination pointer. */ - dp_addr += length; - } else { - /* - * The byte sequence does overlap, copy non-overlapping - * part and then do a slow byte by byte copy for the - * overlapping part. Also, advance the destination - * pointer. - */ - memcpy(dp_addr, dp_back_addr, max_non_overlap); - dp_addr += max_non_overlap; - dp_back_addr += max_non_overlap; - length -= max_non_overlap; - while (length--) - *dp_addr++ = *dp_back_addr++; - } - - /* Advance source position and continue with the next token. */ - cb += 2; - } - - /* No tokens left in the current tag. Continue with the next tag. */ - goto do_next_tag; - -return_overflow: - ntfs_error(NULL, "Failed. Returning -EOVERFLOW."); - goto return_error; -} - -/** - * ntfs_read_compressed_block - read a compressed block into the page cache - * @page: locked page in the compression block(s) we need to read - * - * When we are called the page has already been verified to be locked and the - * attribute is known to be non-resident, not encrypted, but compressed. - * - * 1. Determine which compression block(s) @page is in. - * 2. Get hold of all pages corresponding to this/these compression block(s). - * 3. Read the (first) compression block. - * 4. Decompress it into the corresponding pages. - * 5. Throw the compressed data away and proceed to 3. for the next compression - * block or return success if no more compression blocks left. - * - * Warning: We have to be careful what we do about existing pages. They might - * have been written to so that we would lose data if we were to just overwrite - * them with the out-of-date uncompressed data. - * - * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at - * the end of the file I think. We need to detect this case and zero the out - * of bounds remainder of the page in question and mark it as handled. At the - * moment we would just return -EIO on such a page. 
This bug will only become - * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte - * clusters so is probably not going to be seen by anyone. Still this should - * be fixed. (AIA) - * - * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in - * handling sparse and compressed cbs. (AIA) - * - * FIXME: At the moment we don't do any zeroing out in the case that - * initialized_size is less than data_size. This should be safe because of the - * nature of the compression algorithm used. Just in case we check and output - * an error message in read inode if the two sizes are not equal for a - * compressed file. (AIA) - */ -int ntfs_read_compressed_block(struct page *page) -{ - loff_t i_size; - s64 initialized_size; - struct address_space *mapping = page->mapping; - ntfs_inode *ni = NTFS_I(mapping->host); - ntfs_volume *vol = ni->vol; - struct super_block *sb = vol->sb; - runlist_element *rl; - unsigned long flags, block_size = sb->s_blocksize; - unsigned char block_size_bits = sb->s_blocksize_bits; - u8 *cb, *cb_pos, *cb_end; - struct buffer_head **bhs; - unsigned long offset, index = page->index; - u32 cb_size = ni->itype.compressed.block_size; - u64 cb_size_mask = cb_size - 1UL; - VCN vcn; - LCN lcn; - /* The first wanted vcn (minimum alignment is PAGE_SIZE). */ - VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >> - vol->cluster_size_bits; - /* - * The first vcn after the last wanted vcn (minimum alignment is again - * PAGE_SIZE. - */ - VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1) - & ~cb_size_mask) >> vol->cluster_size_bits; - /* Number of compression blocks (cbs) in the wanted vcn range. */ - unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits - >> ni->itype.compressed.block_size_bits; - /* - * Number of pages required to store the uncompressed data from all - * compression blocks (cbs) overlapping @page. 
Due to alignment - * guarantees of start_vcn and end_vcn, no need to round up here. - */ - unsigned int nr_pages = (end_vcn - start_vcn) << - vol->cluster_size_bits >> PAGE_SHIFT; - unsigned int xpage, max_page, cur_page, cur_ofs, i; - unsigned int cb_clusters, cb_max_ofs; - int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; - struct page **pages; - int *completed_pages; - unsigned char xpage_done = 0; - - ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = " - "%i.", index, cb_size, nr_pages); - /* - * Bad things happen if we get here for anything that is not an - * unnamed $DATA attribute. - */ - BUG_ON(ni->type != AT_DATA); - BUG_ON(ni->name_len); - - pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); - completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS); - - /* Allocate memory to store the buffer heads we need. */ - bhs_size = cb_size / block_size * sizeof(struct buffer_head *); - bhs = kmalloc(bhs_size, GFP_NOFS); - - if (unlikely(!pages || !bhs || !completed_pages)) { - kfree(bhs); - kfree(pages); - kfree(completed_pages); - unlock_page(page); - ntfs_error(vol->sb, "Failed to allocate internal buffers."); - return -ENOMEM; - } - - /* - * We have already been given one page, this is the one we must do. - * Once again, the alignment guarantees keep it simple. - */ - offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT; - xpage = index - offset; - pages[xpage] = page; - /* - * The remaining pages need to be allocated and inserted into the page - * cache, alignment guarantees keep all the below much simpler. (-8 - */ - read_lock_irqsave(&ni->size_lock, flags); - i_size = i_size_read(VFS_I(ni)); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) - - offset; - /* Is the page fully outside i_size? 
(truncate in progress) */ - if (xpage >= max_page) { - kfree(bhs); - kfree(pages); - kfree(completed_pages); - zero_user(page, 0, PAGE_SIZE); - ntfs_debug("Compressed read outside i_size - truncated?"); - SetPageUptodate(page); - unlock_page(page); - return 0; - } - if (nr_pages < max_page) - max_page = nr_pages; - for (i = 0; i < max_page; i++, offset++) { - if (i != xpage) - pages[i] = grab_cache_page_nowait(mapping, offset); - page = pages[i]; - if (page) { - /* - * We only (re)read the page if it isn't already read - * in and/or dirty or we would be losing data or at - * least wasting our time. - */ - if (!PageDirty(page) && (!PageUptodate(page) || - PageError(page))) { - ClearPageError(page); - kmap(page); - continue; - } - unlock_page(page); - put_page(page); - pages[i] = NULL; - } - } - - /* - * We have the runlist, and all the destination pages we need to fill. - * Now read the first compression block. - */ - cur_page = 0; - cur_ofs = 0; - cb_clusters = ni->itype.compressed.block_clusters; -do_next_cb: - nr_cbs--; - nr_bhs = 0; - - /* Read all cb buffer heads one cluster at a time. */ - rl = NULL; - for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn; - vcn++) { - bool is_retry = false; - - if (!rl) { -lock_retry_remap: - down_read(&ni->runlist.lock); - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - } else - lcn = LCN_RL_NOT_MAPPED; - ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", - (unsigned long long)vcn, - (unsigned long long)lcn); - if (lcn < 0) { - /* - * When we reach the first sparse cluster we have - * finished with the cb. - */ - if (lcn == LCN_HOLE) - break; - if (is_retry || lcn != LCN_RL_NOT_MAPPED) - goto rl_err; - is_retry = true; - /* - * Attempt to map runlist, dropping lock for the - * duration. 
- */ - up_read(&ni->runlist.lock); - if (!ntfs_map_runlist(ni, vcn)) - goto lock_retry_remap; - goto map_rl_err; - } - block = lcn << vol->cluster_size_bits >> block_size_bits; - /* Read the lcn from device in chunks of block_size bytes. */ - max_block = block + (vol->cluster_size >> block_size_bits); - do { - ntfs_debug("block = 0x%x.", block); - if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block)))) - goto getblk_err; - nr_bhs++; - } while (++block < max_block); - } - - /* Release the lock if we took it. */ - if (rl) - up_read(&ni->runlist.lock); - - /* Setup and initiate io on all buffer heads. */ - for (i = 0; i < nr_bhs; i++) { - struct buffer_head *tbh = bhs[i]; - - if (!trylock_buffer(tbh)) - continue; - if (unlikely(buffer_uptodate(tbh))) { - unlock_buffer(tbh); - continue; - } - get_bh(tbh); - tbh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, tbh); - } - - /* Wait for io completion on all buffer heads. */ - for (i = 0; i < nr_bhs; i++) { - struct buffer_head *tbh = bhs[i]; - - if (buffer_uptodate(tbh)) - continue; - wait_on_buffer(tbh); - /* - * We need an optimization barrier here, otherwise we start - * hitting the below fixup code when accessing a loopback - * mounted ntfs partition. This indicates either there is a - * race condition in the loop driver or, more likely, gcc - * overoptimises the code without the barrier and it doesn't - * do the Right Thing(TM). - */ - barrier(); - if (unlikely(!buffer_uptodate(tbh))) { - ntfs_warning(vol->sb, "Buffer is unlocked but not " - "uptodate! Unplugging the disk queue " - "and rescheduling."); - get_bh(tbh); - io_schedule(); - put_bh(tbh); - if (unlikely(!buffer_uptodate(tbh))) - goto read_err; - ntfs_warning(vol->sb, "Buffer is now uptodate. Good."); - } - } - - /* - * Get the compression buffer. We must not sleep any more - * until we are finished with it. 
- */ - spin_lock(&ntfs_cb_lock); - cb = ntfs_compression_buffer; - - BUG_ON(!cb); - - cb_pos = cb; - cb_end = cb + cb_size; - - /* Copy the buffer heads into the contiguous buffer. */ - for (i = 0; i < nr_bhs; i++) { - memcpy(cb_pos, bhs[i]->b_data, block_size); - cb_pos += block_size; - } - - /* Just a precaution. */ - if (cb_pos + 2 <= cb + cb_size) - *(u16*)cb_pos = 0; - - /* Reset cb_pos back to the beginning. */ - cb_pos = cb; - - /* We now have both source (if present) and destination. */ - ntfs_debug("Successfully read the compression block."); - - /* The last page and maximum offset within it for the current cb. */ - cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size; - cb_max_ofs = cb_max_page & ~PAGE_MASK; - cb_max_page >>= PAGE_SHIFT; - - /* Catch end of file inside a compression block. */ - if (cb_max_page > max_page) - cb_max_page = max_page; - - if (vcn == start_vcn - cb_clusters) { - /* Sparse cb, zero out page range overlapping the cb. */ - ntfs_debug("Found sparse compression block."); - /* We can sleep from now on, so we drop lock. */ - spin_unlock(&ntfs_cb_lock); - if (cb_max_ofs) - cb_max_page--; - for (; cur_page < cb_max_page; cur_page++) { - page = pages[cur_page]; - if (page) { - if (likely(!cur_ofs)) - clear_page(page_address(page)); - else - memset(page_address(page) + cur_ofs, 0, - PAGE_SIZE - - cur_ofs); - flush_dcache_page(page); - kunmap(page); - SetPageUptodate(page); - unlock_page(page); - if (cur_page == xpage) - xpage_done = 1; - else - put_page(page); - pages[cur_page] = NULL; - } - cb_pos += PAGE_SIZE - cur_ofs; - cur_ofs = 0; - if (cb_pos >= cb_end) - break; - } - /* If we have a partial final page, deal with it now. 
*/ - if (cb_max_ofs && cb_pos < cb_end) { - page = pages[cur_page]; - if (page) - memset(page_address(page) + cur_ofs, 0, - cb_max_ofs - cur_ofs); - /* - * No need to update cb_pos at this stage: - * cb_pos += cb_max_ofs - cur_ofs; - */ - cur_ofs = cb_max_ofs; - } - } else if (vcn == start_vcn) { - /* We can't sleep so we need two stages. */ - unsigned int cur2_page = cur_page; - unsigned int cur_ofs2 = cur_ofs; - u8 *cb_pos2 = cb_pos; - - ntfs_debug("Found uncompressed compression block."); - /* Uncompressed cb, copy it to the destination pages. */ - /* - * TODO: As a big optimization, we could detect this case - * before we read all the pages and use block_read_full_folio() - * on all full pages instead (we still have to treat partial - * pages especially but at least we are getting rid of the - * synchronous io for the majority of pages. - * Or if we choose not to do the read-ahead/-behind stuff, we - * could just return block_read_full_folio(pages[xpage]) as long - * as PAGE_SIZE <= cb_size. - */ - if (cb_max_ofs) - cb_max_page--; - /* First stage: copy data into destination pages. */ - for (; cur_page < cb_max_page; cur_page++) { - page = pages[cur_page]; - if (page) - memcpy(page_address(page) + cur_ofs, cb_pos, - PAGE_SIZE - cur_ofs); - cb_pos += PAGE_SIZE - cur_ofs; - cur_ofs = 0; - if (cb_pos >= cb_end) - break; - } - /* If we have a partial final page, deal with it now. */ - if (cb_max_ofs && cb_pos < cb_end) { - page = pages[cur_page]; - if (page) - memcpy(page_address(page) + cur_ofs, cb_pos, - cb_max_ofs - cur_ofs); - cb_pos += cb_max_ofs - cur_ofs; - cur_ofs = cb_max_ofs; - } - /* We can sleep from now on, so drop lock. */ - spin_unlock(&ntfs_cb_lock); - /* Second stage: finalize pages. */ - for (; cur2_page < cb_max_page; cur2_page++) { - page = pages[cur2_page]; - if (page) { - /* - * If we are outside the initialized size, zero - * the out of bounds page range. 
- */ - handle_bounds_compressed_page(page, i_size, - initialized_size); - flush_dcache_page(page); - kunmap(page); - SetPageUptodate(page); - unlock_page(page); - if (cur2_page == xpage) - xpage_done = 1; - else - put_page(page); - pages[cur2_page] = NULL; - } - cb_pos2 += PAGE_SIZE - cur_ofs2; - cur_ofs2 = 0; - if (cb_pos2 >= cb_end) - break; - } - } else { - /* Compressed cb, decompress it into the destination page(s). */ - unsigned int prev_cur_page = cur_page; - - ntfs_debug("Found compressed compression block."); - err = ntfs_decompress(pages, completed_pages, &cur_page, - &cur_ofs, cb_max_page, cb_max_ofs, xpage, - &xpage_done, cb_pos, cb_size - (cb_pos - cb), - i_size, initialized_size); - /* - * We can sleep from now on, lock already dropped by - * ntfs_decompress(). - */ - if (err) { - ntfs_error(vol->sb, "ntfs_decompress() failed in inode " - "0x%lx with error code %i. Skipping " - "this compression block.", - ni->mft_no, -err); - /* Release the unfinished pages. */ - for (; prev_cur_page < cur_page; prev_cur_page++) { - page = pages[prev_cur_page]; - if (page) { - flush_dcache_page(page); - kunmap(page); - unlock_page(page); - if (prev_cur_page != xpage) - put_page(page); - pages[prev_cur_page] = NULL; - } - } - } - } - - /* Release the buffer heads. */ - for (i = 0; i < nr_bhs; i++) - brelse(bhs[i]); - - /* Do we have more work to do? */ - if (nr_cbs) - goto do_next_cb; - - /* We no longer need the list of buffer heads. */ - kfree(bhs); - - /* Clean up if we have any pages left. Should never happen. */ - for (cur_page = 0; cur_page < max_page; cur_page++) { - page = pages[cur_page]; - if (page) { - ntfs_error(vol->sb, "Still have pages left! " - "Terminating them with extreme " - "prejudice. Inode 0x%lx, page index " - "0x%lx.", ni->mft_no, page->index); - flush_dcache_page(page); - kunmap(page); - unlock_page(page); - if (cur_page != xpage) - put_page(page); - pages[cur_page] = NULL; - } - } - - /* We no longer need the list of pages. 
*/ - kfree(pages); - kfree(completed_pages); - - /* If we have completed the requested page, we return success. */ - if (likely(xpage_done)) - return 0; - - ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ? - "EOVERFLOW" : (!err ? "EIO" : "unknown error")); - return err < 0 ? err : -EIO; - -read_err: - ntfs_error(vol->sb, "IO error while reading compressed data."); - /* Release the buffer heads. */ - for (i = 0; i < nr_bhs; i++) - brelse(bhs[i]); - goto err_out; - -map_rl_err: - ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read " - "compression block."); - goto err_out; - -rl_err: - up_read(&ni->runlist.lock); - ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read " - "compression block."); - goto err_out; - -getblk_err: - up_read(&ni->runlist.lock); - ntfs_error(vol->sb, "getblk() failed. Cannot read compression block."); - -err_out: - kfree(bhs); - for (i = cur_page; i < max_page; i++) { - page = pages[i]; - if (page) { - flush_dcache_page(page); - kunmap(page); - unlock_page(page); - if (i != xpage) - put_page(page); - } - } - kfree(pages); - kfree(completed_pages); - return -EIO; -} diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c deleted file mode 100644 index a3c1c5656f8f..000000000000 --- a/fs/ntfs/debug.c +++ /dev/null @@ -1,159 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include "debug.h" - -/** - * __ntfs_warning - output a warning to the syslog - * @function: name of function outputting the warning - * @sb: super block of mounted ntfs filesystem - * @fmt: warning string containing format specifications - * @...: a variable number of arguments specified in @fmt - * - * Outputs a warning to the syslog for the mounted ntfs filesystem described - * by @sb. - * - * @fmt and the corresponding @... 
is printf style format string containing - * the warning string and the corresponding format arguments, respectively. - * - * @function is the name of the function from which __ntfs_warning is being - * called. - * - * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead - * as this provides the @function parameter automatically. - */ -void __ntfs_warning(const char *function, const struct super_block *sb, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - int flen = 0; - -#ifndef DEBUG - if (!printk_ratelimit()) - return; -#endif - if (function) - flen = strlen(function); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - if (sb) - pr_warn("(device %s): %s(): %pV\n", - sb->s_id, flen ? function : "", &vaf); - else - pr_warn("%s(): %pV\n", flen ? function : "", &vaf); - va_end(args); -} - -/** - * __ntfs_error - output an error to the syslog - * @function: name of function outputting the error - * @sb: super block of mounted ntfs filesystem - * @fmt: error string containing format specifications - * @...: a variable number of arguments specified in @fmt - * - * Outputs an error to the syslog for the mounted ntfs filesystem described - * by @sb. - * - * @fmt and the corresponding @... is printf style format string containing - * the error string and the corresponding format arguments, respectively. - * - * @function is the name of the function from which __ntfs_error is being - * called. - * - * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead - * as this provides the @function parameter automatically. - */ -void __ntfs_error(const char *function, const struct super_block *sb, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - int flen = 0; - -#ifndef DEBUG - if (!printk_ratelimit()) - return; -#endif - if (function) - flen = strlen(function); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - if (sb) - pr_err("(device %s): %s(): %pV\n", - sb->s_id, flen ? 
function : "", &vaf); - else - pr_err("%s(): %pV\n", flen ? function : "", &vaf); - va_end(args); -} - -#ifdef DEBUG - -/* If 1, output debug messages, and if 0, don't. */ -int debug_msgs = 0; - -void __ntfs_debug(const char *file, int line, const char *function, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - int flen = 0; - - if (!debug_msgs) - return; - if (function) - flen = strlen(function); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf); - va_end(args); -} - -/* Dump a runlist. Caller has to provide synchronisation for @rl. */ -void ntfs_debug_dump_runlist(const runlist_element *rl) -{ - int i; - const char *lcn_str[5] = { "LCN_HOLE ", "LCN_RL_NOT_MAPPED", - "LCN_ENOENT ", "LCN_unknown " }; - - if (!debug_msgs) - return; - pr_debug("Dumping runlist (values in hex):\n"); - if (!rl) { - pr_debug("Run list not present.\n"); - return; - } - pr_debug("VCN LCN Run length\n"); - for (i = 0; ; i++) { - LCN lcn = (rl + i)->lcn; - - if (lcn < (LCN)0) { - int index = -lcn - 1; - - if (index > -LCN_ENOENT - 1) - index = 3; - pr_debug("%-16Lx %s %-16Lx%s\n", - (long long)(rl + i)->vcn, lcn_str[index], - (long long)(rl + i)->length, - (rl + i)->length ? "" : - " (runlist end)"); - } else - pr_debug("%-16Lx %-16Lx %-16Lx%s\n", - (long long)(rl + i)->vcn, - (long long)(rl + i)->lcn, - (long long)(rl + i)->length, - (rl + i)->length ? "" : - " (runlist end)"); - if (!(rl + i)->length) - break; - } -} - -#endif diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h deleted file mode 100644 index 6fdef388f129..000000000000 --- a/fs/ntfs/debug.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project. 
- * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_DEBUG_H -#define _LINUX_NTFS_DEBUG_H - -#include <linux/fs.h> - -#include "runlist.h" - -#ifdef DEBUG - -extern int debug_msgs; - -extern __printf(4, 5) -void __ntfs_debug(const char *file, int line, const char *function, - const char *format, ...); -/** - * ntfs_debug - write a debug level message to syslog - * @f: a printf format string containing the message - * @...: the variables to substitute into @f - * - * ntfs_debug() writes a DEBUG level message to the syslog but only if the - * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP. - */ -#define ntfs_debug(f, a...) \ - __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a) - -extern void ntfs_debug_dump_runlist(const runlist_element *rl); - -#else /* !DEBUG */ - -#define ntfs_debug(fmt, ...) \ -do { \ - if (0) \ - no_printk(fmt, ##__VA_ARGS__); \ -} while (0) - -#define ntfs_debug_dump_runlist(rl) do {} while (0) - -#endif /* !DEBUG */ - -extern __printf(3, 4) -void __ntfs_warning(const char *function, const struct super_block *sb, - const char *fmt, ...); -#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) - -extern __printf(3, 4) -void __ntfs_error(const char *function, const struct super_block *sb, - const char *fmt, ...); -#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) - -#endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c deleted file mode 100644 index 629723a8d712..000000000000 --- a/fs/ntfs/dir.c +++ /dev/null @@ -1,1540 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project. 
- * - * Copyright (c) 2001-2007 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#include <linux/buffer_head.h> -#include <linux/slab.h> -#include <linux/blkdev.h> - -#include "dir.h" -#include "aops.h" -#include "attrib.h" -#include "mft.h" -#include "debug.h" -#include "ntfs.h" - -/* - * The little endian Unicode string $I30 as a global constant. - */ -ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), - cpu_to_le16('3'), cpu_to_le16('0'), 0 }; - -/** - * ntfs_lookup_inode_by_name - find an inode in a directory given its name - * @dir_ni: ntfs inode of the directory in which to search for the name - * @uname: Unicode name for which to search in the directory - * @uname_len: length of the name @uname in Unicode characters - * @res: return the found file name if necessary (see below) - * - * Look for an inode with name @uname in the directory with inode @dir_ni. - * ntfs_lookup_inode_by_name() walks the contents of the directory looking for - * the Unicode name. If the name is found in the directory, the corresponding - * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it - * is a 64-bit number containing the sequence number. - * - * On error, a negative value is returned corresponding to the error code. In - * particular if the inode is not found -ENOENT is returned. Note that you - * can't just check the return value for being negative, you have to check the - * inode number for being negative which you can extract using MREC(return - * value). - * - * Note, @uname_len does not include the (optional) terminating NULL character. - * - * Note, we look for a case sensitive match first but we also look for a case - * insensitive match at the same time. If we find a case insensitive match, we - * save that for the case that we don't find an exact match, where we return - * the case insensitive match and setup @res (which we allocate!) 
with the mft - * reference, the file name type, length and with a copy of the little endian - * Unicode file name itself. If we match a file name which is in the DOS name - * space, we only return the mft reference and file name type in @res. - * ntfs_lookup() then uses this to find the long file name in the inode itself. - * This is to avoid polluting the dcache with short file names. We want them to - * work but we don't care for how quickly one can access them. This also fixes - * the dcache aliasing issues. - * - * Locking: - Caller must hold i_mutex on the directory. - * - Each page cache page in the index allocation mapping must be - * locked whilst being accessed otherwise we may find a corrupt - * page due to it being under ->writepage at the moment which - * applies the mst protection fixups before writing out and then - * removes them again after the write is complete after which it - * unlocks the page. - */ -MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, - const int uname_len, ntfs_name **res) -{ - ntfs_volume *vol = dir_ni->vol; - struct super_block *sb = vol->sb; - MFT_RECORD *m; - INDEX_ROOT *ir; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *index_end; - u64 mref; - ntfs_attr_search_ctx *ctx; - int err, rc; - VCN vcn, old_vcn; - struct address_space *ia_mapping; - struct page *page; - u8 *kaddr; - ntfs_name *name = NULL; - - BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode)); - BUG_ON(NInoAttr(dir_ni)); - /* Get hold of the mft record for the directory. */ - m = map_mft_record(dir_ni); - if (IS_ERR(m)) { - ntfs_error(sb, "map_mft_record() failed with error code %ld.", - -PTR_ERR(m)); - return ERR_MREF(PTR_ERR(m)); - } - ctx = ntfs_attr_get_search_ctx(dir_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - /* Find the index root attribute in the mft record. 
*/ - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(sb, "Index root attribute missing in " - "directory inode 0x%lx.", - dir_ni->mft_no); - err = -EIO; - } - goto err_out; - } - /* Get to the index root value (it's been verified in read_inode). */ - ir = (INDEX_ROOT*)((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks. */ - if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) - goto dir_err_out; - /* - * The last entry cannot contain a name. It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * We perform a case sensitive comparison and if that matches - * we are done and return the mft reference of the inode (i.e. - * the inode number together with the sequence number for - * consistency checking). We convert it to cpu format before - * returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { -found_it: - /* - * We have a perfect match, so we don't need to care - * about having matched imperfectly before, so we can - * free name and set *res to NULL. - * However, if the perfect match is a short file name, - * we need to signal this through *res, so that - * ntfs_lookup() can fix dcache aliasing issues. - * As an optimization we just reuse an existing - * allocation of *res. 
- */ - if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { - if (!name) { - name = kmalloc(sizeof(ntfs_name), - GFP_NOFS); - if (!name) { - err = -ENOMEM; - goto err_out; - } - } - name->mref = le64_to_cpu( - ie->data.dir.indexed_file); - name->type = FILE_NAME_DOS; - name->len = 0; - *res = name; - } else { - kfree(name); - *res = NULL; - } - mref = le64_to_cpu(ie->data.dir.indexed_file); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - return mref; - } - /* - * For a case insensitive mount, we also perform a case - * insensitive comparison (provided the file name is not in the - * POSIX namespace). If the comparison matches, and the name is - * in the WIN32 namespace, we cache the filename in *res so - * that the caller, ntfs_lookup(), can work on it. If the - * comparison matches, and the name is in the DOS namespace, we - * only cache the mft reference and the file name type (we set - * the name length to zero for simplicity). - */ - if (!NVolCaseSensitive(vol) && - ie->key.file_name.file_name_type && - ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - IGNORE_CASE, vol->upcase, vol->upcase_len)) { - int name_size = sizeof(ntfs_name); - u8 type = ie->key.file_name.file_name_type; - u8 len = ie->key.file_name.file_name_length; - - /* Only one case insensitive matching name allowed. */ - if (name) { - ntfs_error(sb, "Found already allocated name " - "in phase 1. Please run chkdsk " - "and if that doesn't find any " - "errors please report you saw " - "this message to " - "linux-ntfs-dev@lists." 
- "sourceforge.net."); - goto dir_err_out; - } - - if (type != FILE_NAME_DOS) - name_size += len * sizeof(ntfschar); - name = kmalloc(name_size, GFP_NOFS); - if (!name) { - err = -ENOMEM; - goto err_out; - } - name->mref = le64_to_cpu(ie->data.dir.indexed_file); - name->type = type; - if (type != FILE_NAME_DOS) { - name->len = len; - memcpy(name->name, ie->key.file_name.file_name, - len * sizeof(ntfschar)); - } else - name->len = 0; - *res = name; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* The names are not equal, continue the search. */ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; - /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. - */ - goto found_it; - } - /* - * We have finished with this index without success. Check for the - * presence of a child node and if not present return -ENOENT, unless - * we have got a matching name cached in name in which case return the - * mft reference associated with it. 
- */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - if (name) { - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - return name->mref; - } - ntfs_debug("Entry not found."); - err = -ENOENT; - goto err_out; - } /* Child node present, descend into it. */ - /* Consistency check: Verify that an index allocation exists. */ - if (!NInoIndexAllocPresent(dir_ni)) { - ntfs_error(sb, "No index allocation attribute but index entry " - "requires one. Directory inode 0x%lx is " - "corrupt or driver bug.", dir_ni->mft_no); - goto err_out; - } - /* Get the starting vcn of the index_block holding the child node. */ - vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); - ia_mapping = VFS_I(dir_ni)->i_mapping; - /* - * We are done with the index root and the mft record. Release them, - * otherwise we deadlock with ntfs_map_page(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - m = NULL; - ctx = NULL; -descend_into_child_node: - /* - * Convert vcn to index into the index allocation attribute in units - * of PAGE_SIZE and map the page cache page, reading it from - * disk if necessary. - */ - page = ntfs_map_page(ia_mapping, vcn << - dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(sb, "Failed to map directory index page, error %ld.", - -PTR_ERR(page)); - err = PTR_ERR(page); - goto err_out; - } - lock_page(page); - kaddr = (u8*)page_address(page); -fast_descend_into_child_node: - /* Get to the index allocation block. */ - ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); - /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Out of bounds check failed. Corrupt directory " - "inode 0x%lx or driver bug.", dir_ni->mft_no); - goto unm_err_out; - } - /* Catch multi sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Directory index record with vcn 0x%llx is " - "corrupt. 
Corrupt inode 0x%lx. Run chkdsk.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (sle64_to_cpu(ia->index_block_vcn) != vcn) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). " - "Directory inode 0x%lx is corrupt or driver " - "bug.", (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (le32_to_cpu(ia->index.allocated_size) + 0x18 != - dir_ni->itype.index.block_size) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx has a size (%u) differing from the " - "directory specified size (%u). Directory " - "inode is corrupt or driver bug.", - (unsigned long long)vcn, dir_ni->mft_no, - le32_to_cpu(ia->index.allocated_size) + 0x18, - dir_ni->itype.index.block_size); - goto unm_err_out; - } - index_end = (u8*)ia + dir_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx crosses page boundary. Impossible! " - "Cannot access! This is probably a bug in the " - "driver.", (unsigned long long)vcn, - dir_ni->mft_no); - goto unm_err_out; - } - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " - "inode 0x%lx exceeds maximum size.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Iterate similar to above big loop but applied to index buffer, thus - * loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds check. 
*/ - if ((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) { - ntfs_error(sb, "Index entry out of bounds in " - "directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; - } - /* - * The last entry cannot contain a name. It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * We perform a case sensitive comparison and if that matches - * we are done and return the mft reference of the inode (i.e. - * the inode number together with the sequence number for - * consistency checking). We convert it to cpu format before - * returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { -found_it2: - /* - * We have a perfect match, so we don't need to care - * about having matched imperfectly before, so we can - * free name and set *res to NULL. - * However, if the perfect match is a short file name, - * we need to signal this through *res, so that - * ntfs_lookup() can fix dcache aliasing issues. - * As an optimization we just reuse an existing - * allocation of *res. - */ - if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { - if (!name) { - name = kmalloc(sizeof(ntfs_name), - GFP_NOFS); - if (!name) { - err = -ENOMEM; - goto unm_err_out; - } - } - name->mref = le64_to_cpu( - ie->data.dir.indexed_file); - name->type = FILE_NAME_DOS; - name->len = 0; - *res = name; - } else { - kfree(name); - *res = NULL; - } - mref = le64_to_cpu(ie->data.dir.indexed_file); - unlock_page(page); - ntfs_unmap_page(page); - return mref; - } - /* - * For a case insensitive mount, we also perform a case - * insensitive comparison (provided the file name is not in the - * POSIX namespace). 
If the comparison matches, and the name is - * in the WIN32 namespace, we cache the filename in *res so - * that the caller, ntfs_lookup(), can work on it. If the - * comparison matches, and the name is in the DOS namespace, we - * only cache the mft reference and the file name type (we set - * the name length to zero for simplicity). - */ - if (!NVolCaseSensitive(vol) && - ie->key.file_name.file_name_type && - ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - IGNORE_CASE, vol->upcase, vol->upcase_len)) { - int name_size = sizeof(ntfs_name); - u8 type = ie->key.file_name.file_name_type; - u8 len = ie->key.file_name.file_name_length; - - /* Only one case insensitive matching name allowed. */ - if (name) { - ntfs_error(sb, "Found already allocated name " - "in phase 2. Please run chkdsk " - "and if that doesn't find any " - "errors please report you saw " - "this message to " - "linux-ntfs-dev@lists." - "sourceforge.net."); - unlock_page(page); - ntfs_unmap_page(page); - goto dir_err_out; - } - - if (type != FILE_NAME_DOS) - name_size += len * sizeof(ntfschar); - name = kmalloc(name_size, GFP_NOFS); - if (!name) { - err = -ENOMEM; - goto unm_err_out; - } - name->mref = le64_to_cpu(ie->data.dir.indexed_file); - name->type = type; - if (type != FILE_NAME_DOS) { - name->len = len; - memcpy(name->name, ie->key.file_name.file_name, - len * sizeof(ntfschar)); - } else - name->len = 0; - *res = name; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. 
- */ - if (rc == -1) - break; - /* The names are not equal, continue the search. */ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; - /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. - */ - goto found_it2; - } - /* - * We have finished with this index buffer without success. Check for - * the presence of a child node. - */ - if (ie->flags & INDEX_ENTRY_NODE) { - if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { - ntfs_error(sb, "Index entry with child node found in " - "a leaf node in directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; - } - /* Child node present, descend into it. */ - old_vcn = vcn; - vcn = sle64_to_cpup((sle64*)((u8*)ie + - le16_to_cpu(ie->length) - 8)); - if (vcn >= 0) { - /* If vcn is in the same page cache page as old_vcn we - * recycle the mapped page. */ - if (old_vcn << vol->cluster_size_bits >> - PAGE_SHIFT == vcn << - vol->cluster_size_bits >> - PAGE_SHIFT) - goto fast_descend_into_child_node; - unlock_page(page); - ntfs_unmap_page(page); - goto descend_into_child_node; - } - ntfs_error(sb, "Negative child node vcn in directory inode " - "0x%lx.", dir_ni->mft_no); - goto unm_err_out; - } - /* - * No child node present, return -ENOENT, unless we have got a matching - * name cached in name in which case return the mft reference - * associated with it. 
- */ - if (name) { - unlock_page(page); - ntfs_unmap_page(page); - return name->mref; - } - ntfs_debug("Entry not found."); - err = -ENOENT; -unm_err_out: - unlock_page(page); - ntfs_unmap_page(page); -err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(dir_ni); - if (name) { - kfree(name); - *res = NULL; - } - return ERR_MREF(err); -dir_err_out: - ntfs_error(sb, "Corrupt directory. Aborting lookup."); - goto err_out; -} - -#if 0 - -// TODO: (AIA) -// The algorithm embedded in this code will be required for the time when we -// want to support adding of entries to directories, where we require correct -// collation of file names in order not to cause corruption of the filesystem. - -/** - * ntfs_lookup_inode_by_name - find an inode in a directory given its name - * @dir_ni: ntfs inode of the directory in which to search for the name - * @uname: Unicode name for which to search in the directory - * @uname_len: length of the name @uname in Unicode characters - * - * Look for an inode with name @uname in the directory with inode @dir_ni. - * ntfs_lookup_inode_by_name() walks the contents of the directory looking for - * the Unicode name. If the name is found in the directory, the corresponding - * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it - * is a 64-bit number containing the sequence number. - * - * On error, a negative value is returned corresponding to the error code. In - * particular if the inode is not found -ENOENT is returned. Note that you - * can't just check the return value for being negative, you have to check the - * inode number for being negative which you can extract using MREC(return - * value). - * - * Note, @uname_len does not include the (optional) terminating NULL character. 
- */ -u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, - const int uname_len) -{ - ntfs_volume *vol = dir_ni->vol; - struct super_block *sb = vol->sb; - MFT_RECORD *m; - INDEX_ROOT *ir; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *index_end; - u64 mref; - ntfs_attr_search_ctx *ctx; - int err, rc; - IGNORE_CASE_BOOL ic; - VCN vcn, old_vcn; - struct address_space *ia_mapping; - struct page *page; - u8 *kaddr; - - /* Get hold of the mft record for the directory. */ - m = map_mft_record(dir_ni); - if (IS_ERR(m)) { - ntfs_error(sb, "map_mft_record() failed with error code %ld.", - -PTR_ERR(m)); - return ERR_MREF(PTR_ERR(m)); - } - ctx = ntfs_attr_get_search_ctx(dir_ni, m); - if (!ctx) { - err = -ENOMEM; - goto err_out; - } - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(sb, "Index root attribute missing in " - "directory inode 0x%lx.", - dir_ni->mft_no); - err = -EIO; - } - goto err_out; - } - /* Get to the index root value (it's been verified in read_inode). */ - ir = (INDEX_ROOT*)((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks. */ - if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) - goto dir_err_out; - /* - * The last entry cannot contain a name. It can however contain - * a pointer to a child node in the B+tree so we just break out. 
- */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * If the current entry has a name type of POSIX, the name is - * case sensitive and not otherwise. This has the effect of us - * not being able to access any POSIX file names which collate - * after the non-POSIX one when they only differ in case, but - * anyone doing screwy stuff like that deserves to burn in - * hell... Doing that kind of stuff on NT4 actually causes - * corruption on the partition even when using SP6a and Linux - * is not involved at all. - */ - ic = ie->key.file_name.file_name_type ? IGNORE_CASE : - CASE_SENSITIVE; - /* - * If the names match perfectly, we are done and return the - * mft reference of the inode (i.e. the inode number together - * with the sequence number for consistency checking. We - * convert it to cpu format before returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, ic, - vol->upcase, vol->upcase_len)) { -found_it: - mref = le64_to_cpu(ie->data.dir.indexed_file); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - return mref; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* The names are not equal, continue the search. */ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. 
- */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; - /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. - */ - goto found_it; - } - /* - * We have finished with this index without success. Check for the - * presence of a child node. - */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - /* No child node, return -ENOENT. */ - err = -ENOENT; - goto err_out; - } /* Child node present, descend into it. */ - /* Consistency check: Verify that an index allocation exists. */ - if (!NInoIndexAllocPresent(dir_ni)) { - ntfs_error(sb, "No index allocation attribute but index entry " - "requires one. Directory inode 0x%lx is " - "corrupt or driver bug.", dir_ni->mft_no); - goto err_out; - } - /* Get the starting vcn of the index_block holding the child node. */ - vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); - ia_mapping = VFS_I(dir_ni)->i_mapping; - /* - * We are done with the index root and the mft record. Release them, - * otherwise we deadlock with ntfs_map_page(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - m = NULL; - ctx = NULL; -descend_into_child_node: - /* - * Convert vcn to index into the index allocation attribute in units - * of PAGE_SIZE and map the page cache page, reading it from - * disk if necessary. - */ - page = ntfs_map_page(ia_mapping, vcn << - dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(sb, "Failed to map directory index page, error %ld.", - -PTR_ERR(page)); - err = PTR_ERR(page); - goto err_out; - } - lock_page(page); - kaddr = (u8*)page_address(page); -fast_descend_into_child_node: - /* Get to the index allocation block. 
*/ - ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); - /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Out of bounds check failed. Corrupt directory " - "inode 0x%lx or driver bug.", dir_ni->mft_no); - goto unm_err_out; - } - /* Catch multi sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Directory index record with vcn 0x%llx is " - "corrupt. Corrupt inode 0x%lx. Run chkdsk.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (sle64_to_cpu(ia->index_block_vcn) != vcn) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). " - "Directory inode 0x%lx is corrupt or driver " - "bug.", (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (le32_to_cpu(ia->index.allocated_size) + 0x18 != - dir_ni->itype.index.block_size) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx has a size (%u) differing from the " - "directory specified size (%u). Directory " - "inode is corrupt or driver bug.", - (unsigned long long)vcn, dir_ni->mft_no, - le32_to_cpu(ia->index.allocated_size) + 0x18, - dir_ni->itype.index.block_size); - goto unm_err_out; - } - index_end = (u8*)ia + dir_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx crosses page boundary. Impossible! " - "Cannot access! 
This is probably a bug in the " - "driver.", (unsigned long long)vcn, - dir_ni->mft_no); - goto unm_err_out; - } - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " - "inode 0x%lx exceeds maximum size.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Iterate similar to above big loop but applied to index buffer, thus - * loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds check. */ - if ((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) { - ntfs_error(sb, "Index entry out of bounds in " - "directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; - } - /* - * The last entry cannot contain a name. It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * If the current entry has a name type of POSIX, the name is - * case sensitive and not otherwise. This has the effect of us - * not being able to access any POSIX file names which collate - * after the non-POSIX one when they only differ in case, but - * anyone doing screwy stuff like that deserves to burn in - * hell... Doing that kind of stuff on NT4 actually causes - * corruption on the partition even when using SP6a and Linux - * is not involved at all. - */ - ic = ie->key.file_name.file_name_type ? IGNORE_CASE : - CASE_SENSITIVE; - /* - * If the names match perfectly, we are done and return the - * mft reference of the inode (i.e. the inode number together - * with the sequence number for consistency checking. 
We - * convert it to cpu format before returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, ic, - vol->upcase, vol->upcase_len)) { -found_it2: - mref = le64_to_cpu(ie->data.dir.indexed_file); - unlock_page(page); - ntfs_unmap_page(page); - return mref; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* The names are not equal, continue the search. */ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; - /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. - */ - goto found_it2; - } - /* - * We have finished with this index buffer without success. Check for - * the presence of a child node. - */ - if (ie->flags & INDEX_ENTRY_NODE) { - if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { - ntfs_error(sb, "Index entry with child node found in " - "a leaf node in directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; - } - /* Child node present, descend into it. 
*/ - old_vcn = vcn; - vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); - if (vcn >= 0) { - /* If vcn is in the same page cache page as old_vcn we - * recycle the mapped page. */ - if (old_vcn << vol->cluster_size_bits >> - PAGE_SHIFT == vcn << - vol->cluster_size_bits >> - PAGE_SHIFT) - goto fast_descend_into_child_node; - unlock_page(page); - ntfs_unmap_page(page); - goto descend_into_child_node; - } - ntfs_error(sb, "Negative child node vcn in directory inode " - "0x%lx.", dir_ni->mft_no); - goto unm_err_out; - } - /* No child node, return -ENOENT. */ - ntfs_debug("Entry not found."); - err = -ENOENT; -unm_err_out: - unlock_page(page); - ntfs_unmap_page(page); -err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(dir_ni); - return ERR_MREF(err); -dir_err_out: - ntfs_error(sb, "Corrupt directory. Aborting lookup."); - goto err_out; -} - -#endif - -/** - * ntfs_filldir - ntfs specific filldir method - * @vol: current ntfs volume - * @ndir: ntfs inode of current directory - * @ia_page: page in which the index allocation buffer @ie is in resides - * @ie: current index entry - * @name: buffer to use for the converted name - * @actor: what to feed the entries to - * - * Convert the Unicode @name to the loaded NLS and pass it to the @filldir - * callback. - * - * If @ia_page is not NULL it is the locked page containing the index - * allocation block containing the index entry @ie. - * - * Note, we drop (and then reacquire) the page lock on @ia_page across the - * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup - * since ntfs_lookup() will lock the same page. As an optimization, we do not - * retake the lock if we are returning a non-zero value as ntfs_readdir() - * would need to drop the lock immediately anyway. 
- */ -static inline int ntfs_filldir(ntfs_volume *vol, - ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, - u8 *name, struct dir_context *actor) -{ - unsigned long mref; - int name_len; - unsigned dt_type; - FILE_NAME_TYPE_FLAGS name_type; - - name_type = ie->key.file_name.file_name_type; - if (name_type == FILE_NAME_DOS) { - ntfs_debug("Skipping DOS name space entry."); - return 0; - } - if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) { - ntfs_debug("Skipping root directory self reference entry."); - return 0; - } - if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user && - !NVolShowSystemFiles(vol)) { - ntfs_debug("Skipping system file."); - return 0; - } - name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, &name, - NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1); - if (name_len <= 0) { - ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.", - (long long)MREF_LE(ie->data.dir.indexed_file)); - return 0; - } - if (ie->key.file_name.file_attributes & - FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT) - dt_type = DT_DIR; - else - dt_type = DT_REG; - mref = MREF_LE(ie->data.dir.indexed_file); - /* - * Drop the page lock otherwise we deadlock with NFS when it calls - * ->lookup since ntfs_lookup() will lock the same page. - */ - if (ia_page) - unlock_page(ia_page); - ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " - "0x%lx, DT_%s.", name, name_len, actor->pos, mref, - dt_type == DT_DIR ? "DIR" : "REG"); - if (!dir_emit(actor, name, name_len, mref, dt_type)) - return 1; - /* Relock the page but not if we are aborting ->readdir. */ - if (ia_page) - lock_page(ia_page); - return 0; -} - -/* - * We use the same basic approach as the old NTFS driver, i.e. we parse the - * index root entries and then the index allocation entries that are marked - * as in use in the index bitmap. 
- * - * While this will return the names in random order this doesn't matter for - * ->readdir but OTOH results in a faster ->readdir. - * - * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS - * parts (e.g. ->f_pos and ->i_size, and it also protects against directory - * modifications). - * - * Locking: - Caller must hold i_mutex on the directory. - * - Each page cache page in the index allocation mapping must be - * locked whilst being accessed otherwise we may find a corrupt - * page due to it being under ->writepage at the moment which - * applies the mst protection fixups before writing out and then - * removes them again after the write is complete after which it - * unlocks the page. - */ -static int ntfs_readdir(struct file *file, struct dir_context *actor) -{ - s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; - loff_t i_size; - struct inode *bmp_vi, *vdir = file_inode(file); - struct super_block *sb = vdir->i_sb; - ntfs_inode *ndir = NTFS_I(vdir); - ntfs_volume *vol = NTFS_SB(sb); - MFT_RECORD *m; - INDEX_ROOT *ir = NULL; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *name = NULL; - int rc, err, ir_pos, cur_bmp_pos; - struct address_space *ia_mapping, *bmp_mapping; - struct page *bmp_page = NULL, *ia_page = NULL; - u8 *kaddr, *bmp, *index_end; - ntfs_attr_search_ctx *ctx; - - ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", - vdir->i_ino, actor->pos); - rc = err = 0; - /* Are we at end of dir yet? */ - i_size = i_size_read(vdir); - if (actor->pos >= i_size + vol->mft_record_size) - return 0; - /* Emulate . and .. for all directories. */ - if (!dir_emit_dots(file, actor)) - return 0; - m = NULL; - ctx = NULL; - /* - * Allocate a buffer to store the current name being processed - * converted to format determined by current NLS. 
- */ - name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS); - if (unlikely(!name)) { - err = -ENOMEM; - goto err_out; - } - /* Are we jumping straight into the index allocation attribute? */ - if (actor->pos >= vol->mft_record_size) - goto skip_index_root; - /* Get hold of the mft record for the directory. */ - m = map_mft_record(ndir); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ndir, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - /* Get the offset into the index root attribute. */ - ir_pos = (s64)actor->pos; - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - ntfs_error(sb, "Index root attribute missing in directory " - "inode 0x%lx.", vdir->i_ino); - goto err_out; - } - /* - * Copy the index root attribute value to a buffer so that we can put - * the search context and unmap the mft record before calling the - * filldir() callback. We need to do this because of NFSd which calls - * ->lookup() from its filldir callback() and this causes NTFS to - * deadlock as ntfs_lookup() maps the mft record of the directory and - * we have got it mapped here already. The only solution is for us to - * unmap the mft record here so that a call to ntfs_lookup() is able to - * map the mft record without deadlocking. - */ - rc = le32_to_cpu(ctx->attr->data.resident.value_length); - ir = kmalloc(rc, GFP_NOFS); - if (unlikely(!ir)) { - err = -ENOMEM; - goto err_out; - } - /* Copy the index root value (it has been verified in read_inode). */ - memcpy(ir, (u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset), rc); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ndir); - ctx = NULL; - m = NULL; - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. 
*/ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry or until filldir tells us it has had enough - * or signals an error (both covered by the rc test). - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir); - /* Bounds checks. */ - if (unlikely((u8*)ie < (u8*)ir || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end)) - goto err_out; - /* The last entry cannot contain a name. */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Skip index root entry if continuing previous readdir. */ - if (ir_pos > (u8*)ie - (u8*)ir) - continue; - /* Advance the position even if going to skip the entry. */ - actor->pos = (u8*)ie - (u8*)ir; - /* Submit the name to the filldir callback. */ - rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor); - if (rc) { - kfree(ir); - goto abort; - } - } - /* We are done with the index root and can free the buffer. */ - kfree(ir); - ir = NULL; - /* If there is no index allocation attribute we are finished. */ - if (!NInoIndexAllocPresent(ndir)) - goto EOD; - /* Advance fpos to the beginning of the index allocation. */ - actor->pos = vol->mft_record_size; -skip_index_root: - kaddr = NULL; - prev_ia_pos = -1LL; - /* Get the offset into the index allocation attribute. */ - ia_pos = (s64)actor->pos - vol->mft_record_size; - ia_mapping = vdir->i_mapping; - ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); - bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); - if (IS_ERR(bmp_vi)) { - ntfs_error(sb, "Failed to get bitmap attribute."); - err = PTR_ERR(bmp_vi); - goto err_out; - } - bmp_mapping = bmp_vi->i_mapping; - /* Get the starting bitmap bit position and sanity check it. 
*/ - bmp_pos = ia_pos >> ndir->itype.index.block_size_bits; - if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) { - ntfs_error(sb, "Current index allocation position exceeds " - "index bitmap size."); - goto iput_err_out; - } - /* Get the starting bit position in the current bitmap page. */ - cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1); - bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1); -get_next_bmp_page: - ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx", - (unsigned long long)bmp_pos >> (3 + PAGE_SHIFT), - (unsigned long long)bmp_pos & - (unsigned long long)((PAGE_SIZE * 8) - 1)); - bmp_page = ntfs_map_page(bmp_mapping, - bmp_pos >> (3 + PAGE_SHIFT)); - if (IS_ERR(bmp_page)) { - ntfs_error(sb, "Reading index bitmap failed."); - err = PTR_ERR(bmp_page); - bmp_page = NULL; - goto iput_err_out; - } - bmp = (u8*)page_address(bmp_page); - /* Find next index block in use. */ - while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) { -find_next_index_buffer: - cur_bmp_pos++; - /* - * If we have reached the end of the bitmap page, get the next - * page, and put away the old one. - */ - if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) { - ntfs_unmap_page(bmp_page); - bmp_pos += PAGE_SIZE * 8; - cur_bmp_pos = 0; - goto get_next_bmp_page; - } - /* If we have reached the end of the bitmap, we are done. */ - if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size)) - goto unm_EOD; - ia_pos = (bmp_pos + cur_bmp_pos) << - ndir->itype.index.block_size_bits; - } - ntfs_debug("Handling index buffer 0x%llx.", - (unsigned long long)bmp_pos + cur_bmp_pos); - /* If the current index buffer is in the same page we reuse the page. */ - if ((prev_ia_pos & (s64)PAGE_MASK) != - (ia_pos & (s64)PAGE_MASK)) { - prev_ia_pos = ia_pos; - if (likely(ia_page != NULL)) { - unlock_page(ia_page); - ntfs_unmap_page(ia_page); - } - /* - * Map the page cache page containing the current ia_pos, - * reading it from disk if necessary. 
- */ - ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT); - if (IS_ERR(ia_page)) { - ntfs_error(sb, "Reading index allocation data failed."); - err = PTR_ERR(ia_page); - ia_page = NULL; - goto err_out; - } - lock_page(ia_page); - kaddr = (u8*)page_address(ia_page); - } - /* Get the current index buffer. */ - ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK & - ~(s64)(ndir->itype.index.block_size - 1))); - /* Bounds checks. */ - if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) { - ntfs_error(sb, "Out of bounds check failed. Corrupt directory " - "inode 0x%lx or driver bug.", vdir->i_ino); - goto err_out; - } - /* Catch multi sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Directory index record with vcn 0x%llx is " - "corrupt. Corrupt inode 0x%lx. Run chkdsk.", - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos & - ~(s64)(ndir->itype.index.block_size - 1)) >> - ndir->itype.index.vcn_size_bits)) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). " - "Directory inode 0x%lx is corrupt or driver " - "bug. ", (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 != - ndir->itype.index.block_size)) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx has a size (%u) differing from the " - "directory specified size (%u). 
Directory " - "inode is corrupt or driver bug.", - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino, - le32_to_cpu(ia->index.allocated_size) + 0x18, - ndir->itype.index.block_size); - goto err_out; - } - index_end = (u8*)ia + ndir->itype.index.block_size; - if (unlikely(index_end > kaddr + PAGE_SIZE)) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx crosses page boundary. Impossible! " - "Cannot access! This is probably a bug in the " - "driver.", (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1); - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " - "inode 0x%lx exceeds maximum size.", - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - /* The first index entry in this index buffer. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry or until filldir tells us it has had enough - * or signals an error (both covered by the rc test). - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - ntfs_debug("In index allocation, offset 0x%llx.", - (unsigned long long)ia_start + - (unsigned long long)((u8*)ie - (u8*)ia)); - /* Bounds checks. */ - if (unlikely((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end)) - goto err_out; - /* The last entry cannot contain a name. */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Skip index block entry if continuing previous readdir. 
*/ - if (ia_pos - ia_start > (u8*)ie - (u8*)ia) - continue; - /* Advance the position even if going to skip the entry. */ - actor->pos = (u8*)ie - (u8*)ia + - (sle64_to_cpu(ia->index_block_vcn) << - ndir->itype.index.vcn_size_bits) + - vol->mft_record_size; - /* - * Submit the name to the @filldir callback. Note, - * ntfs_filldir() drops the lock on @ia_page but it retakes it - * before returning, unless a non-zero value is returned in - * which case the page is left unlocked. - */ - rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor); - if (rc) { - /* @ia_page is already unlocked in this case. */ - ntfs_unmap_page(ia_page); - ntfs_unmap_page(bmp_page); - iput(bmp_vi); - goto abort; - } - } - goto find_next_index_buffer; -unm_EOD: - if (ia_page) { - unlock_page(ia_page); - ntfs_unmap_page(ia_page); - } - ntfs_unmap_page(bmp_page); - iput(bmp_vi); -EOD: - /* We are finished, set fpos to EOD. */ - actor->pos = i_size + vol->mft_record_size; -abort: - kfree(name); - return 0; -err_out: - if (bmp_page) { - ntfs_unmap_page(bmp_page); -iput_err_out: - iput(bmp_vi); - } - if (ia_page) { - unlock_page(ia_page); - ntfs_unmap_page(ia_page); - } - kfree(ir); - kfree(name); - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(ndir); - if (!err) - err = -EIO; - ntfs_debug("Failed. Returning error code %i.", -err); - return err; -} - -/** - * ntfs_dir_open - called when an inode is about to be opened - * @vi: inode to be opened - * @filp: file structure describing the inode - * - * Limit directory size to the page cache limit on architectures where unsigned - * long is 32-bits. This is the most we can do for now without overflowing the - * page cache page index. Doing it this way means we don't run into problems - * because of existing too large directories. 
It would be better to allow the - * user to read the accessible part of the directory but I doubt very much - * anyone is going to hit this check on a 32-bit architecture, so there is no - * point in adding the extra complexity required to support this. - * - * On 64-bit architectures, the check is hopefully optimized away by the - * compiler. - */ -static int ntfs_dir_open(struct inode *vi, struct file *filp) -{ - if (sizeof(unsigned long) < 8) { - if (i_size_read(vi) > MAX_LFS_FILESIZE) - return -EFBIG; - } - return 0; -} - -#ifdef NTFS_RW - -/** - * ntfs_dir_fsync - sync a directory to disk - * @filp: directory to be synced - * @start: offset in bytes of the beginning of data range to sync - * @end: offset in bytes of the end of data range (inclusive) - * @datasync: if non-zero only flush user data and not metadata - * - * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and - * msync system calls. This function is based on file.c::ntfs_file_fsync(). - * - * Write the mft record and all associated extent mft records as well as the - * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device. - * - * If @datasync is true, we do not wait on the inode(s) to be written out - * but we always wait on the page cache pages to be written out. - * - * Note: In the past @filp could be NULL so we ignore it as we don't need it - * anyway. - * - * Locking: Caller must hold i_mutex on the inode. - * - * TODO: We should probably also write all attribute/index inodes associated - * with this inode but since we have no simple way of getting to them we ignore - * this problem for now. We do write the $BITMAP attribute if it is present - * which is the important one for a directory so things are not too bad. 
- */ -static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *bmp_vi, *vi = filp->f_mapping->host; - int err, ret; - ntfs_attr na; - - ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - - err = file_write_and_wait_range(filp, start, end); - if (err) - return err; - inode_lock(vi); - - BUG_ON(!S_ISDIR(vi->i_mode)); - /* If the bitmap attribute inode is in memory sync it, too. */ - na.mft_no = vi->i_ino; - na.type = AT_BITMAP; - na.name = I30; - na.name_len = 4; - bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na); - if (bmp_vi) { - write_inode_now(bmp_vi, !datasync); - iput(bmp_vi); - } - ret = __ntfs_write_inode(vi, 1); - write_inode_now(vi, !datasync); - err = sync_blockdev(vi->i_sb->s_bdev); - if (unlikely(err && !ret)) - ret = err; - if (likely(!ret)) - ntfs_debug("Done."); - else - ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " - "%u.", datasync ? "data" : "", vi->i_ino, -ret); - inode_unlock(vi); - return ret; -} - -#endif /* NTFS_RW */ - -WRAP_DIR_ITER(ntfs_readdir) // FIXME! -const struct file_operations ntfs_dir_ops = { - .llseek = generic_file_llseek, /* Seek inside directory. */ - .read = generic_read_dir, /* Return -EISDIR. */ - .iterate_shared = shared_ntfs_readdir, /* Read directory contents. */ -#ifdef NTFS_RW - .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ -#endif /* NTFS_RW */ - /*.ioctl = ,*/ /* Perform function on the - mounted filesystem. */ - .open = ntfs_dir_open, /* Open directory. */ -}; diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h deleted file mode 100644 index 0e326753df40..000000000000 --- a/fs/ntfs/dir.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of - * the Linux-NTFS project. 
- * - * Copyright (c) 2002-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_DIR_H -#define _LINUX_NTFS_DIR_H - -#include "layout.h" -#include "inode.h" -#include "types.h" - -/* - * ntfs_name is used to return the file name to the caller of - * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup()) - * to be able to deal with dcache aliasing issues. - */ -typedef struct { - MFT_REF mref; - FILE_NAME_TYPE_FLAGS type; - u8 len; - ntfschar name[0]; -} __attribute__ ((__packed__)) ntfs_name; - -/* The little endian Unicode string $I30 as a global constant. */ -extern ntfschar I30[5]; - -extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, - const ntfschar *uname, const int uname_len, ntfs_name **res); - -#endif /* _LINUX_NTFS_FS_DIR_H */ diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h deleted file mode 100644 index f30c139bf9ae..000000000000 --- a/fs/ntfs/endian.h +++ /dev/null @@ -1,79 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * endian.h - Defines for endianness handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_ENDIAN_H -#define _LINUX_NTFS_ENDIAN_H - -#include <asm/byteorder.h> -#include "types.h" - -/* - * Signed endianness conversion functions. 
- */ - -static inline s16 sle16_to_cpu(sle16 x) -{ - return le16_to_cpu((__force le16)x); -} - -static inline s32 sle32_to_cpu(sle32 x) -{ - return le32_to_cpu((__force le32)x); -} - -static inline s64 sle64_to_cpu(sle64 x) -{ - return le64_to_cpu((__force le64)x); -} - -static inline s16 sle16_to_cpup(sle16 *x) -{ - return le16_to_cpu(*(__force le16*)x); -} - -static inline s32 sle32_to_cpup(sle32 *x) -{ - return le32_to_cpu(*(__force le32*)x); -} - -static inline s64 sle64_to_cpup(sle64 *x) -{ - return le64_to_cpu(*(__force le64*)x); -} - -static inline sle16 cpu_to_sle16(s16 x) -{ - return (__force sle16)cpu_to_le16(x); -} - -static inline sle32 cpu_to_sle32(s32 x) -{ - return (__force sle32)cpu_to_le32(x); -} - -static inline sle64 cpu_to_sle64(s64 x) -{ - return (__force sle64)cpu_to_le64(x); -} - -static inline sle16 cpu_to_sle16p(s16 *x) -{ - return (__force sle16)cpu_to_le16(*x); -} - -static inline sle32 cpu_to_sle32p(s32 *x) -{ - return (__force sle32)cpu_to_le32(*x); -} - -static inline sle64 cpu_to_sle64p(s64 *x) -{ - return (__force sle64)cpu_to_le64(*x); -} - -#endif /* _LINUX_NTFS_ENDIAN_H */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c deleted file mode 100644 index 297c0b9db621..000000000000 --- a/fs/ntfs/file.c +++ /dev/null @@ -1,1997 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. 
- */ - -#include <linux/blkdev.h> -#include <linux/backing-dev.h> -#include <linux/buffer_head.h> -#include <linux/gfp.h> -#include <linux/pagemap.h> -#include <linux/pagevec.h> -#include <linux/sched/signal.h> -#include <linux/swap.h> -#include <linux/uio.h> -#include <linux/writeback.h> - -#include <asm/page.h> -#include <linux/uaccess.h> - -#include "attrib.h" -#include "bitmap.h" -#include "inode.h" -#include "debug.h" -#include "lcnalloc.h" -#include "malloc.h" -#include "mft.h" -#include "ntfs.h" - -/** - * ntfs_file_open - called when an inode is about to be opened - * @vi: inode to be opened - * @filp: file structure describing the inode - * - * Limit file size to the page cache limit on architectures where unsigned long - * is 32-bits. This is the most we can do for now without overflowing the page - * cache page index. Doing it this way means we don't run into problems because - * of existing too large files. It would be better to allow the user to read - * the beginning of the file but I doubt very much anyone is going to hit this - * check on a 32-bit architecture, so there is no point in adding the extra - * complexity required to support this. - * - * On 64-bit architectures, the check is hopefully optimized away by the - * compiler. - * - * After the check passes, just call generic_file_open() to do its work. - */ -static int ntfs_file_open(struct inode *vi, struct file *filp) -{ - if (sizeof(unsigned long) < 8) { - if (i_size_read(vi) > MAX_LFS_FILESIZE) - return -EOVERFLOW; - } - return generic_file_open(vi, filp); -} - -#ifdef NTFS_RW - -/** - * ntfs_attr_extend_initialized - extend the initialized size of an attribute - * @ni: ntfs inode of the attribute to extend - * @new_init_size: requested new initialized size in bytes - * - * Extend the initialized size of an attribute described by the ntfs inode @ni - * to @new_init_size bytes. 
This involves zeroing any non-sparse space between - * the old initialized size and @new_init_size both in the page cache and on - * disk (if relevant complete pages are already uptodate in the page cache then - * these are simply marked dirty). - * - * As a side-effect, the file size (vfs inode->i_size) may be incremented as, - * in the resident attribute case, it is tied to the initialized size and, in - * the non-resident attribute case, it may not fall below the initialized size. - * - * Note that if the attribute is resident, we do not need to touch the page - * cache at all. This is because if the page cache page is not uptodate we - * bring it uptodate later, when doing the write to the mft record since we - * then already have the page mapped. And if the page is uptodate, the - * non-initialized region will already have been zeroed when the page was - * brought uptodate and the region may in fact already have been overwritten - * with new data via mmap() based writes, so we cannot just zero it. And since - * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped - * is unspecified, we choose not to do zeroing and thus we do not need to touch - * the page at all. For a more detailed explanation see ntfs_truncate() in - * fs/ntfs/inode.c. - * - * Return 0 on success and -errno on error. In the case that an error is - * encountered it is possible that the initialized size will already have been - * incremented some way towards @new_init_size but it is guaranteed that if - * this is the case, the necessary zeroing will also have happened and that all - * metadata is self-consistent. - * - * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be - * held by the caller. 
- */ -static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size) -{ - s64 old_init_size; - loff_t old_i_size; - pgoff_t index, end_index; - unsigned long flags; - struct inode *vi = VFS_I(ni); - ntfs_inode *base_ni; - MFT_RECORD *m = NULL; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx = NULL; - struct address_space *mapping; - struct page *page = NULL; - u8 *kattr; - int err; - u32 attr_len; - - read_lock_irqsave(&ni->size_lock, flags); - old_init_size = ni->initialized_size; - old_i_size = i_size_read(vi); - BUG_ON(new_init_size > ni->allocated_size); - read_unlock_irqrestore(&ni->size_lock, flags); - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " - "old_initialized_size 0x%llx, " - "new_initialized_size 0x%llx, i_size 0x%llx.", - vi->i_ino, (unsigned)le32_to_cpu(ni->type), - (unsigned long long)old_init_size, - (unsigned long long)new_init_size, old_i_size); - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* Use goto to reduce indentation and we need the label below anyway. */ - if (NInoNonResident(ni)) - goto do_non_resident_extend; - BUG_ON(old_init_size != old_i_size); - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(a->non_resident); - /* The total length of the attribute value. */ - attr_len = le32_to_cpu(a->data.resident.value_length); - BUG_ON(old_i_size != (loff_t)attr_len); - /* - * Do the zeroing in the mft record and update the attribute size in - * the mft record. 
- */ - kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); - memset(kattr + attr_len, 0, new_init_size - attr_len); - a->data.resident.value_length = cpu_to_le32((u32)new_init_size); - /* Finally, update the sizes in the vfs and ntfs inodes. */ - write_lock_irqsave(&ni->size_lock, flags); - i_size_write(vi, new_init_size); - ni->initialized_size = new_init_size; - write_unlock_irqrestore(&ni->size_lock, flags); - goto done; -do_non_resident_extend: - /* - * If the new initialized size @new_init_size exceeds the current file - * size (vfs inode->i_size), we need to extend the file size to the - * new initialized size. - */ - if (new_init_size > old_i_size) { - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(!a->non_resident); - BUG_ON(old_i_size != (loff_t) - sle64_to_cpu(a->data.non_resident.data_size)); - a->data.non_resident.data_size = cpu_to_sle64(new_init_size); - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - /* Update the file size in the vfs inode. */ - i_size_write(vi, new_init_size); - ntfs_attr_put_search_ctx(ctx); - ctx = NULL; - unmap_mft_record(base_ni); - m = NULL; - } - mapping = vi->i_mapping; - index = old_init_size >> PAGE_SHIFT; - end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - do { - /* - * Read the page. If the page is not present, this will zero - * the uninitialized regions for us. - */ - page = read_mapping_page(mapping, index, NULL); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto init_err_out; - } - /* - * Update the initialized size in the ntfs inode. This is - * enough to make ntfs_writepage() work. 
- */ - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT; - if (ni->initialized_size > new_init_size) - ni->initialized_size = new_init_size; - write_unlock_irqrestore(&ni->size_lock, flags); - /* Set the page dirty so it gets written out. */ - set_page_dirty(page); - put_page(page); - /* - * Play nice with the vm and the rest of the system. This is - * very much needed as we can potentially be modifying the - * initialised size from a very small value to a really huge - * value, e.g. - * f = open(somefile, O_TRUNC); - * truncate(f, 10GiB); - * seek(f, 10GiB); - * write(f, 1); - * And this would mean we would be marking dirty hundreds of - * thousands of pages or as in the above example more than - * two and a half million pages! - * - * TODO: For sparse pages could optimize this workload by using - * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This - * would be set in read_folio for sparse pages and here we would - * not need to mark dirty any pages which have this bit set. - * The only caveat is that we have to clear the bit everywhere - * where we allocate any clusters that lie in the page or that - * contain the page. - * - * TODO: An even greater optimization would be for us to only - * call read_folio() on pages which are not in sparse regions as - * determined from the runlist. This would greatly reduce the - * number of pages we read and make dirty in the case of sparse - * files. - */ - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - } while (++index < end_index); - read_lock_irqsave(&ni->size_lock, flags); - BUG_ON(ni->initialized_size != new_init_size); - read_unlock_irqrestore(&ni->size_lock, flags); - /* Now bring in sync the initialized_size in the mft record. 
*/ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto init_err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto init_err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto init_err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(!a->non_resident); - a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); -done: - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", - (unsigned long long)new_init_size, i_size_read(vi)); - return 0; -init_err_out: - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = old_init_size; - write_unlock_irqrestore(&ni->size_lock, flags); -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ntfs_debug("Failed. Returning error code %i.", err); - return err; -} - -static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, - struct iov_iter *from) -{ - loff_t pos; - s64 end, ll; - ssize_t err; - unsigned long flags; - struct file *file = iocb->ki_filp; - struct inode *vi = file_inode(file); - ntfs_inode *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " - "0x%llx, count 0x%zx.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (unsigned long long)iocb->ki_pos, - iov_iter_count(from)); - err = generic_write_checks(iocb, from); - if (unlikely(err <= 0)) - goto out; - /* - * All checks have passed. Before we start doing any writing we want - * to abort any totally illegal writes. - */ - BUG_ON(NInoMstProtected(ni)); - BUG_ON(ni->type != AT_DATA); - /* If file is encrypted, deny access, just like NT4. 
*/ - if (NInoEncrypted(ni)) { - /* Only $DATA attributes can be encrypted. */ - /* - * Reminder for later: Encrypted files are _always_ - * non-resident so that the content can always be encrypted. - */ - ntfs_debug("Denying write access to encrypted file."); - err = -EACCES; - goto out; - } - if (NInoCompressed(ni)) { - /* Only unnamed $DATA attribute can be compressed. */ - BUG_ON(ni->name_len); - /* - * Reminder for later: If resident, the data is not actually - * compressed. Only on the switch to non-resident does - * compression kick in. This is in contrast to encrypted files - * (see above). - */ - ntfs_error(vi->i_sb, "Writing to compressed files is not " - "implemented yet. Sorry."); - err = -EOPNOTSUPP; - goto out; - } - err = file_remove_privs(file); - if (unlikely(err)) - goto out; - /* - * Our ->update_time method always succeeds thus file_update_time() - * cannot fail either so there is no need to check the return code. - */ - file_update_time(file); - pos = iocb->ki_pos; - /* The first byte after the last cluster being written to. */ - end = (pos + iov_iter_count(from) + vol->cluster_size_mask) & - ~(u64)vol->cluster_size_mask; - /* - * If the write goes beyond the allocated size, extend the allocation - * to cover the whole of the write, rounded up to the nearest cluster. - */ - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (end > ll) { - /* - * Extend the allocation without changing the data size. - * - * Note we ensure the allocation is big enough to at least - * write some data but we do not require the allocation to be - * complete, i.e. it may be partial. - */ - ll = ntfs_attr_extend_allocation(ni, end, -1, pos); - if (likely(ll >= 0)) { - BUG_ON(pos >= ll); - /* If the extension was partial truncate the write. 
*/ - if (end > ll) { - ntfs_debug("Truncating write to inode 0x%lx, " - "attribute type 0x%x, because " - "the allocation was only " - "partially extended.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - iov_iter_truncate(from, ll - pos); - } - } else { - err = ll; - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - /* Perform a partial write if possible or fail. */ - if (pos < ll) { - ntfs_debug("Truncating write to inode 0x%lx " - "attribute type 0x%x, because " - "extending the allocation " - "failed (error %d).", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type), - (int)-err); - iov_iter_truncate(from, ll - pos); - } else { - if (err != -ENOSPC) - ntfs_error(vi->i_sb, "Cannot perform " - "write to inode " - "0x%lx, attribute " - "type 0x%x, because " - "extending the " - "allocation failed " - "(error %ld).", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type), - (long)-err); - else - ntfs_debug("Cannot perform write to " - "inode 0x%lx, " - "attribute type 0x%x, " - "because there is not " - "space left.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - goto out; - } - } - } - /* - * If the write starts beyond the initialized size, extend it up to the - * beginning of the write and initialize all non-sparse space between - * the old initialized size and the new one. This automatically also - * increments the vfs inode->i_size to keep it above or equal to the - * initialized_size. - */ - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (pos > ll) { - /* - * Wait for ongoing direct i/o to complete before proceeding. - * New direct i/o cannot start as we hold i_mutex. 
- */ - inode_dio_wait(vi); - err = ntfs_attr_extend_initialized(ni, pos); - if (unlikely(err < 0)) - ntfs_error(vi->i_sb, "Cannot perform write to inode " - "0x%lx, attribute type 0x%x, because " - "extending the initialized size " - "failed (error %d).", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (int)-err); - } -out: - return err; -} - -/** - * __ntfs_grab_cache_pages - obtain a number of locked pages - * @mapping: address space mapping from which to obtain page cache pages - * @index: starting index in @mapping at which to begin obtaining pages - * @nr_pages: number of page cache pages to obtain - * @pages: array of pages in which to return the obtained page cache pages - * @cached_page: allocated but as yet unused page - * - * Obtain @nr_pages locked page cache pages from the mapping @mapping and - * starting at index @index. - * - * If a page is newly created, add it to lru list - * - * Note, the page locks are obtained in ascending page index order. - */ -static inline int __ntfs_grab_cache_pages(struct address_space *mapping, - pgoff_t index, const unsigned nr_pages, struct page **pages, - struct page **cached_page) -{ - int err, nr; - - BUG_ON(!nr_pages); - err = nr = 0; - do { - pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | - FGP_ACCESSED); - if (!pages[nr]) { - if (!*cached_page) { - *cached_page = page_cache_alloc(mapping); - if (unlikely(!*cached_page)) { - err = -ENOMEM; - goto err_out; - } - } - err = add_to_page_cache_lru(*cached_page, mapping, - index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (unlikely(err)) { - if (err == -EEXIST) - continue; - goto err_out; - } - pages[nr] = *cached_page; - *cached_page = NULL; - } - index++; - nr++; - } while (nr < nr_pages); -out: - return err; -err_out: - while (nr > 0) { - unlock_page(pages[--nr]); - put_page(pages[nr]); - } - goto out; -} - -static inline void ntfs_submit_bh_for_read(struct buffer_head *bh) -{ - lock_buffer(bh); - get_bh(bh); - bh->b_end_io = 
end_buffer_read_sync; - submit_bh(REQ_OP_READ, bh); -} - -/** - * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data - * @pages: array of destination pages - * @nr_pages: number of pages in @pages - * @pos: byte position in file at which the write begins - * @bytes: number of bytes to be written - * - * This is called for non-resident attributes from ntfs_file_buffered_write() - * with i_mutex held on the inode (@pages[0]->mapping->host). There are - * @nr_pages pages in @pages which are locked but not kmap()ped. The source - * data has not yet been copied into the @pages. - * - * Need to fill any holes with actual clusters, allocate buffers if necessary, - * ensure all the buffers are mapped, and bring uptodate any buffers that are - * only partially being written to. - * - * If @nr_pages is greater than one, we are guaranteed that the cluster size is - * greater than PAGE_SIZE, that all pages in @pages are entirely inside - * the same cluster and that they are the entirety of that cluster, and that - * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. - * - * i_size is not to be modified yet. - * - * Return 0 on success or -errno on error. 
- */ -static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, - unsigned nr_pages, s64 pos, size_t bytes) -{ - VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; - LCN lcn; - s64 bh_pos, vcn_len, end, initialized_size; - sector_t lcn_block; - struct folio *folio; - struct inode *vi; - ntfs_inode *ni, *base_ni = NULL; - ntfs_volume *vol; - runlist_element *rl, *rl2; - struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; - ntfs_attr_search_ctx *ctx = NULL; - MFT_RECORD *m = NULL; - ATTR_RECORD *a = NULL; - unsigned long flags; - u32 attr_rec_len = 0; - unsigned blocksize, u; - int err, mp_size; - bool rl_write_locked, was_hole, is_retry; - unsigned char blocksize_bits; - struct { - u8 runlist_merged:1; - u8 mft_attr_mapped:1; - u8 mp_rebuilt:1; - u8 attr_switched:1; - } status = { 0, 0, 0, 0 }; - - BUG_ON(!nr_pages); - BUG_ON(!pages); - BUG_ON(!*pages); - vi = pages[0]->mapping->host; - ni = NTFS_I(vi); - vol = ni->vol; - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " - "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", - vi->i_ino, ni->type, pages[0]->index, nr_pages, - (long long)pos, bytes); - blocksize = vol->sb->s_blocksize; - blocksize_bits = vol->sb->s_blocksize_bits; - rl_write_locked = false; - rl = NULL; - err = 0; - vcn = lcn = -1; - vcn_len = 0; - lcn_block = -1; - was_hole = false; - cpos = pos >> vol->cluster_size_bits; - end = pos + bytes; - cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; - /* - * Loop over each buffer in each folio. Use goto to - * reduce indentation. - */ - u = 0; -do_next_folio: - folio = page_folio(pages[u]); - bh_pos = folio_pos(folio); - head = folio_buffers(folio); - if (!head) - /* - * create_empty_buffers() will create uptodate/dirty - * buffers if the folio is uptodate/dirty. 
- */ - head = create_empty_buffers(folio, blocksize, 0); - bh = head; - do { - VCN cdelta; - s64 bh_end; - unsigned bh_cofs; - - /* Clear buffer_new on all buffers to reinitialise state. */ - if (buffer_new(bh)) - clear_buffer_new(bh); - bh_end = bh_pos + blocksize; - bh_cpos = bh_pos >> vol->cluster_size_bits; - bh_cofs = bh_pos & vol->cluster_size_mask; - if (buffer_mapped(bh)) { - /* - * The buffer is already mapped. If it is uptodate, - * ignore it. - */ - if (buffer_uptodate(bh)) - continue; - /* - * The buffer is not uptodate. If the folio is uptodate - * set the buffer uptodate and otherwise ignore it. - */ - if (folio_test_uptodate(folio)) { - set_buffer_uptodate(bh); - continue; - } - /* - * Neither the folio nor the buffer are uptodate. If - * the buffer is only partially being written to, we - * need to read it in before the write, i.e. now. - */ - if ((bh_pos < pos && bh_end > pos) || - (bh_pos < end && bh_end > end)) { - /* - * If the buffer is fully or partially within - * the initialized size, do an actual read. - * Otherwise, simply zero the buffer. - */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (bh_pos < initialized_size) { - ntfs_submit_bh_for_read(bh); - *wait_bh++ = bh; - } else { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - } - continue; - } - /* Unmapped buffer. Need to map it. */ - bh->b_bdev = vol->sb->s_bdev; - /* - * If the current buffer is in the same clusters as the map - * cache, there is no need to check the runlist again. The - * map cache is made up of @vcn, which is the first cached file - * cluster, @vcn_len which is the number of cached file - * clusters, @lcn is the device cluster corresponding to @vcn, - * and @lcn_block is the block number corresponding to @lcn. 
- */ - cdelta = bh_cpos - vcn; - if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { -map_buffer_cached: - BUG_ON(lcn < 0); - bh->b_blocknr = lcn_block + - (cdelta << (vol->cluster_size_bits - - blocksize_bits)) + - (bh_cofs >> blocksize_bits); - set_buffer_mapped(bh); - /* - * If the folio is uptodate so is the buffer. If the - * buffer is fully outside the write, we ignore it if - * it was already allocated and we mark it dirty so it - * gets written out if we allocated it. On the other - * hand, if we allocated the buffer but we are not - * marking it dirty we set buffer_new so we can do - * error recovery. - */ - if (folio_test_uptodate(folio)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - if (unlikely(was_hole)) { - /* We allocated the buffer. */ - clean_bdev_bh_alias(bh); - if (bh_end <= pos || bh_pos >= end) - mark_buffer_dirty(bh); - else - set_buffer_new(bh); - } - continue; - } - /* Page is _not_ uptodate. */ - if (likely(!was_hole)) { - /* - * Buffer was already allocated. If it is not - * uptodate and is only partially being written - * to, we need to read it in before the write, - * i.e. now. - */ - if (!buffer_uptodate(bh) && bh_pos < end && - bh_end > pos && - (bh_pos < pos || - bh_end > end)) { - /* - * If the buffer is fully or partially - * within the initialized size, do an - * actual read. Otherwise, simply zero - * the buffer. - */ - read_lock_irqsave(&ni->size_lock, - flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, - flags); - if (bh_pos < initialized_size) { - ntfs_submit_bh_for_read(bh); - *wait_bh++ = bh; - } else { - folio_zero_range(folio, - bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - } - continue; - } - /* We allocated the buffer. */ - clean_bdev_bh_alias(bh); - /* - * If the buffer is fully outside the write, zero it, - * set it uptodate, and mark it dirty so it gets - * written out. 
If it is partially being written to, - * zero region surrounding the write but leave it to - * commit write to do anything else. Finally, if the - * buffer is fully being overwritten, do nothing. - */ - if (bh_end <= pos || bh_pos >= end) { - if (!buffer_uptodate(bh)) { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - mark_buffer_dirty(bh); - continue; - } - set_buffer_new(bh); - if (!buffer_uptodate(bh) && - (bh_pos < pos || bh_end > end)) { - u8 *kaddr; - unsigned pofs; - - kaddr = kmap_local_folio(folio, 0); - if (bh_pos < pos) { - pofs = bh_pos & ~PAGE_MASK; - memset(kaddr + pofs, 0, pos - bh_pos); - } - if (bh_end > end) { - pofs = end & ~PAGE_MASK; - memset(kaddr + pofs, 0, bh_end - end); - } - kunmap_local(kaddr); - flush_dcache_folio(folio); - } - continue; - } - /* - * Slow path: this is the first buffer in the cluster. If it - * is outside allocated size and is not uptodate, zero it and - * set it uptodate. - */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (bh_pos > initialized_size) { - if (folio_test_uptodate(folio)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - } else if (!buffer_uptodate(bh)) { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - continue; - } - is_retry = false; - if (!rl) { - down_read(&ni->runlist.lock); -retry_remap: - rl = ni->runlist.rl; - } - if (likely(rl != NULL)) { - /* Seek to element containing target cluster. */ - while (rl->length && rl[1].vcn <= bh_cpos) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); - if (likely(lcn >= 0)) { - /* - * Successful remap, setup the map cache and - * use that to deal with the buffer. 
- */ - was_hole = false; - vcn = bh_cpos; - vcn_len = rl[1].vcn - vcn; - lcn_block = lcn << (vol->cluster_size_bits - - blocksize_bits); - cdelta = 0; - /* - * If the number of remaining clusters touched - * by the write is smaller or equal to the - * number of cached clusters, unlock the - * runlist as the map cache will be used from - * now on. - */ - if (likely(vcn + vcn_len >= cend)) { - if (rl_write_locked) { - up_write(&ni->runlist.lock); - rl_write_locked = false; - } else - up_read(&ni->runlist.lock); - rl = NULL; - } - goto map_buffer_cached; - } - } else - lcn = LCN_RL_NOT_MAPPED; - /* - * If it is not a hole and not out of bounds, the runlist is - * probably unmapped so try to map it now. - */ - if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { - if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { - /* Attempt to map runlist. */ - if (!rl_write_locked) { - /* - * We need the runlist locked for - * writing, so if it is locked for - * reading relock it now and retry in - * case it changed whilst we dropped - * the lock. - */ - up_read(&ni->runlist.lock); - down_write(&ni->runlist.lock); - rl_write_locked = true; - goto retry_remap; - } - err = ntfs_map_runlist_nolock(ni, bh_cpos, - NULL); - if (likely(!err)) { - is_retry = true; - goto retry_remap; - } - /* - * If @vcn is out of bounds, pretend @lcn is - * LCN_ENOENT. As long as the buffer is out - * of bounds this will work fine. - */ - if (err == -ENOENT) { - lcn = LCN_ENOENT; - err = 0; - goto rl_not_mapped_enoent; - } - } else - err = -EIO; - /* Failed to map the buffer, even after retrying. */ - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " - "attribute type 0x%x, vcn 0x%llx, " - "vcn offset 0x%x, because its " - "location on disk could not be " - "determined%s (error code %i).", - ni->mft_no, ni->type, - (unsigned long long)bh_cpos, - (unsigned)bh_pos & - vol->cluster_size_mask, - is_retry ? 
" even after retrying" : "", - err); - break; - } -rl_not_mapped_enoent: - /* - * The buffer is in a hole or out of bounds. We need to fill - * the hole, unless the buffer is in a cluster which is not - * touched by the write, in which case we just leave the buffer - * unmapped. This can only happen when the cluster size is - * less than the page cache size. - */ - if (unlikely(vol->cluster_size < PAGE_SIZE)) { - bh_cend = (bh_end + vol->cluster_size - 1) >> - vol->cluster_size_bits; - if ((bh_cend <= cpos || bh_cpos >= cend)) { - bh->b_blocknr = -1; - /* - * If the buffer is uptodate we skip it. If it - * is not but the folio is uptodate, we can set - * the buffer uptodate. If the folio is not - * uptodate, we can clear the buffer and set it - * uptodate. Whether this is worthwhile is - * debatable and this could be removed. - */ - if (folio_test_uptodate(folio)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - } else if (!buffer_uptodate(bh)) { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - continue; - } - } - /* - * Out of bounds buffer is invalid if it was not really out of - * bounds. - */ - BUG_ON(lcn != LCN_HOLE); - /* - * We need the runlist locked for writing, so if it is locked - * for reading relock it now and retry in case it changed - * whilst we dropped the lock. - */ - BUG_ON(!rl); - if (!rl_write_locked) { - up_read(&ni->runlist.lock); - down_write(&ni->runlist.lock); - rl_write_locked = true; - goto retry_remap; - } - /* Find the previous last allocated cluster. 
*/ - BUG_ON(rl->lcn != LCN_HOLE); - lcn = -1; - rl2 = rl; - while (--rl2 >= ni->runlist.rl) { - if (rl2->lcn >= 0) { - lcn = rl2->lcn + rl2->length; - break; - } - } - rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, - false); - if (IS_ERR(rl2)) { - err = PTR_ERR(rl2); - ntfs_debug("Failed to allocate cluster, error code %i.", - err); - break; - } - lcn = rl2->lcn; - rl = ntfs_runlists_merge(ni->runlist.rl, rl2); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - if (err != -ENOMEM) - err = -EIO; - if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to release " - "allocated cluster in error " - "code path. Run chkdsk to " - "recover the lost cluster."); - NVolSetErrors(vol); - } - ntfs_free(rl2); - break; - } - ni->runlist.rl = rl; - status.runlist_merged = 1; - ntfs_debug("Allocated cluster, lcn 0x%llx.", - (unsigned long long)lcn); - /* Map and lock the mft record and get the attribute record. */ - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - break; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - unmap_mft_record(base_ni); - break; - } - status.mft_attr_mapped = 1; - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - break; - } - m = ctx->mrec; - a = ctx->attr; - /* - * Find the runlist element with which the attribute extent - * starts. Note, we cannot use the _attr_ version because we - * have mapped the mft record. That is ok because we know the - * runlist fragment must be mapped already to have ever gotten - * here, so we can just use the _rl_ version. 
- */ - vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); - rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); - BUG_ON(!rl2); - BUG_ON(!rl2->length); - BUG_ON(rl2->lcn < LCN_HOLE); - highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); - /* - * If @highest_vcn is zero, calculate the real highest_vcn - * (which can really be zero). - */ - if (!highest_vcn) - highest_vcn = (sle64_to_cpu( - a->data.non_resident.allocated_size) >> - vol->cluster_size_bits) - 1; - /* - * Determine the size of the mapping pairs array for the new - * extent, i.e. the old extent with the hole filled. - */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, - highest_vcn); - if (unlikely(mp_size <= 0)) { - if (!(err = mp_size)) - err = -EIO; - ntfs_debug("Failed to get size for mapping pairs " - "array, error code %i.", err); - break; - } - /* - * Resize the attribute record to fit the new mapping pairs - * array. - */ - attr_rec_len = le32_to_cpu(a->length); - err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( - a->data.non_resident.mapping_pairs_offset)); - if (unlikely(err)) { - BUG_ON(err != -ENOSPC); - // TODO: Deal with this by using the current attribute - // and fill it with as much of the mapping pairs - // array as possible. Then loop over each attribute - // extent rewriting the mapping pairs arrays as we go - // along and if when we reach the end we have not - // enough space, try to resize the last attribute - // extent and if even that fails, add a new attribute - // extent. - // We could also try to resize at each step in the hope - // that we will not need to rewrite every single extent. - // Note, we may need to decompress some extents to fill - // the runlist as we are walking the extents... - ntfs_error(vol->sb, "Not enough space in the mft " - "record for the extended attribute " - "record. 
This case is not " - "implemented yet."); - err = -EOPNOTSUPP; - break ; - } - status.mp_rebuilt = 1; - /* - * Generate the mapping pairs array directly into the attribute - * record. - */ - err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, vcn, highest_vcn, NULL); - if (unlikely(err)) { - ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " - "attribute type 0x%x, because building " - "the mapping pairs failed with error " - "code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - err = -EIO; - break; - } - /* Update the highest_vcn but only if it was not set. */ - if (unlikely(!a->data.non_resident.highest_vcn)) - a->data.non_resident.highest_vcn = - cpu_to_sle64(highest_vcn); - /* - * If the attribute is sparse/compressed, update the compressed - * size in the ntfs_inode structure and the attribute record. - */ - if (likely(NInoSparse(ni) || NInoCompressed(ni))) { - /* - * If we are not in the first attribute extent, switch - * to it, but first ensure the changes will make it to - * disk later. - */ - if (a->data.non_resident.lowest_vcn) { - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(ni->type, ni->name, - ni->name_len, CASE_SENSITIVE, - 0, NULL, 0, ctx); - if (unlikely(err)) { - status.attr_switched = 1; - break; - } - /* @m is not used any more so do not set it. */ - a = ctx->attr; - } - write_lock_irqsave(&ni->size_lock, flags); - ni->itype.compressed.size += vol->cluster_size; - a->data.non_resident.compressed_size = - cpu_to_sle64(ni->itype.compressed.size); - write_unlock_irqrestore(&ni->size_lock, flags); - } - /* Ensure the changes make it to disk. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - /* Successfully filled the hole. 
*/ - status.runlist_merged = 0; - status.mft_attr_mapped = 0; - status.mp_rebuilt = 0; - /* Setup the map cache and use that to deal with the buffer. */ - was_hole = true; - vcn = bh_cpos; - vcn_len = 1; - lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); - cdelta = 0; - /* - * If the number of remaining clusters in the @pages is smaller - * or equal to the number of cached clusters, unlock the - * runlist as the map cache will be used from now on. - */ - if (likely(vcn + vcn_len >= cend)) { - up_write(&ni->runlist.lock); - rl_write_locked = false; - rl = NULL; - } - goto map_buffer_cached; - } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); - /* If there are no errors, do the next page. */ - if (likely(!err && ++u < nr_pages)) - goto do_next_folio; - /* If there are no errors, release the runlist lock if we took it. */ - if (likely(!err)) { - if (unlikely(rl_write_locked)) { - up_write(&ni->runlist.lock); - rl_write_locked = false; - } else if (unlikely(rl)) - up_read(&ni->runlist.lock); - rl = NULL; - } - /* If we issued read requests, let them complete. */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - while (wait_bh > wait) { - bh = *--wait_bh; - wait_on_buffer(bh); - if (likely(buffer_uptodate(bh))) { - folio = bh->b_folio; - bh_pos = folio_pos(folio) + bh_offset(bh); - /* - * If the buffer overflows the initialized size, need - * to zero the overflowing region. - */ - if (unlikely(bh_pos + blocksize > initialized_size)) { - int ofs = 0; - - if (likely(bh_pos < initialized_size)) - ofs = initialized_size - bh_pos; - folio_zero_segment(folio, bh_offset(bh) + ofs, - blocksize); - } - } else /* if (unlikely(!buffer_uptodate(bh))) */ - err = -EIO; - } - if (likely(!err)) { - /* Clear buffer_new on all buffers. 
*/ - u = 0; - do { - bh = head = page_buffers(pages[u]); - do { - if (buffer_new(bh)) - clear_buffer_new(bh); - } while ((bh = bh->b_this_page) != head); - } while (++u < nr_pages); - ntfs_debug("Done."); - return err; - } - if (status.attr_switched) { - /* Get back to the attribute extent we modified. */ - ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { - ntfs_error(vol->sb, "Failed to find required " - "attribute extent of attribute in " - "error code path. Run chkdsk to " - "recover."); - write_lock_irqsave(&ni->size_lock, flags); - ni->itype.compressed.size += vol->cluster_size; - write_unlock_irqrestore(&ni->size_lock, flags); - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - /* - * The only thing that is now wrong is the compressed - * size of the base attribute extent which chkdsk - * should be able to fix. - */ - NVolSetErrors(vol); - } else { - m = ctx->mrec; - a = ctx->attr; - status.attr_switched = 0; - } - } - /* - * If the runlist has been modified, need to restore it by punching a - * hole into it and we then need to deallocate the on-disk cluster as - * well. Note, we only modify the runlist if we are able to generate a - * new mapping pairs array, i.e. only when the mapped attribute extent - * is not switched. - */ - if (status.runlist_merged && !status.attr_switched) { - BUG_ON(!rl_write_locked); - /* Make the file cluster we allocated sparse in the runlist. */ - if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { - ntfs_error(vol->sb, "Failed to punch hole into " - "attribute runlist in error code " - "path. Run chkdsk to recover the " - "lost cluster."); - NVolSetErrors(vol); - } else /* if (success) */ { - status.runlist_merged = 0; - /* - * Deallocate the on-disk cluster we allocated but only - * if we succeeded in punching its vcn out of the - * runlist. 
- */ - down_write(&vol->lcnbmp_lock); - if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { - ntfs_error(vol->sb, "Failed to release " - "allocated cluster in error " - "code path. Run chkdsk to " - "recover the lost cluster."); - NVolSetErrors(vol); - } - up_write(&vol->lcnbmp_lock); - } - } - /* - * Resize the attribute record to its old size and rebuild the mapping - * pairs array. Note, we only can do this if the runlist has been - * restored to its old state which also implies that the mapped - * attribute extent is not switched. - */ - if (status.mp_rebuilt && !status.runlist_merged) { - if (ntfs_attr_record_resize(m, a, attr_rec_len)) { - ntfs_error(vol->sb, "Failed to restore attribute " - "record in error code path. Run " - "chkdsk to recover."); - NVolSetErrors(vol); - } else /* if (success) */ { - if (ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident. - mapping_pairs_offset), attr_rec_len - - le16_to_cpu(a->data.non_resident. - mapping_pairs_offset), ni->runlist.rl, - vcn, highest_vcn, NULL)) { - ntfs_error(vol->sb, "Failed to restore " - "mapping pairs array in error " - "code path. Run chkdsk to " - "recover."); - NVolSetErrors(vol); - } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } - } - /* Release the mft record and the attribute. */ - if (status.mft_attr_mapped) { - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - } - /* Release the runlist lock. */ - if (rl_write_locked) - up_write(&ni->runlist.lock); - else if (rl) - up_read(&ni->runlist.lock); - /* - * Zero out any newly allocated blocks to avoid exposing stale data. - * If BH_New is set, we know that the block was newly allocated above - * and that it has not been fully zeroed and marked dirty yet. 
- */ - nr_pages = u; - u = 0; - end = bh_cpos << vol->cluster_size_bits; - do { - folio = page_folio(pages[u]); - bh = head = folio_buffers(folio); - do { - if (u == nr_pages && - folio_pos(folio) + bh_offset(bh) >= end) - break; - if (!buffer_new(bh)) - continue; - clear_buffer_new(bh); - if (!buffer_uptodate(bh)) { - if (folio_test_uptodate(folio)) - set_buffer_uptodate(bh); - else { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - } - mark_buffer_dirty(bh); - } while ((bh = bh->b_this_page) != head); - } while (++u <= nr_pages); - ntfs_error(vol->sb, "Failed. Returning error code %i.", err); - return err; -} - -static inline void ntfs_flush_dcache_pages(struct page **pages, - unsigned nr_pages) -{ - BUG_ON(!nr_pages); - /* - * Warning: Do not do the decrement at the same time as the call to - * flush_dcache_page() because it is a NULL macro on i386 and hence the - * decrement never happens so the loop never terminates. - */ - do { - --nr_pages; - flush_dcache_page(pages[nr_pages]); - } while (nr_pages > 0); -} - -/** - * ntfs_commit_pages_after_non_resident_write - commit the received data - * @pages: array of destination pages - * @nr_pages: number of pages in @pages - * @pos: byte position in file at which the write begins - * @bytes: number of bytes to be written - * - * See description of ntfs_commit_pages_after_write(), below. 
- */ -static inline int ntfs_commit_pages_after_non_resident_write( - struct page **pages, const unsigned nr_pages, - s64 pos, size_t bytes) -{ - s64 end, initialized_size; - struct inode *vi; - ntfs_inode *ni, *base_ni; - struct buffer_head *bh, *head; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - unsigned long flags; - unsigned blocksize, u; - int err; - - vi = pages[0]->mapping->host; - ni = NTFS_I(vi); - blocksize = vi->i_sb->s_blocksize; - end = pos + bytes; - u = 0; - do { - s64 bh_pos; - struct page *page; - bool partial; - - page = pages[u]; - bh_pos = (s64)page->index << PAGE_SHIFT; - bh = head = page_buffers(page); - partial = false; - do { - s64 bh_end; - - bh_end = bh_pos + blocksize; - if (bh_end <= pos || bh_pos >= end) { - if (!buffer_uptodate(bh)) - partial = true; - } else { - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - } - } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); - /* - * If all buffers are now uptodate but the page is not, set the - * page uptodate. - */ - if (!partial && !PageUptodate(page)) - SetPageUptodate(page); - } while (++u < nr_pages); - /* - * Finally, if we do not need to update initialized_size or i_size we - * are finished. - */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (end <= initialized_size) { - ntfs_debug("Done."); - return 0; - } - /* - * Update initialized_size/i_size as appropriate, both in the inode and - * the mft record. - */ - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* Map, pin, and lock the mft record. 
*/ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - BUG_ON(!NInoNonResident(ni)); - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - a = ctx->attr; - BUG_ON(!a->non_resident); - write_lock_irqsave(&ni->size_lock, flags); - BUG_ON(end > ni->allocated_size); - ni->initialized_size = end; - a->data.non_resident.initialized_size = cpu_to_sle64(end); - if (end > i_size_read(vi)) { - i_size_write(vi, end); - a->data.non_resident.data_size = - a->data.non_resident.initialized_size; - } - write_unlock_irqrestore(&ni->size_lock, flags); - /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - ntfs_debug("Done."); - return 0; -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " - "code %i).", err); - if (err != -ENOMEM) - NVolSetErrors(ni->vol); - return err; -} - -/** - * ntfs_commit_pages_after_write - commit the received data - * @pages: array of destination pages - * @nr_pages: number of pages in @pages - * @pos: byte position in file at which the write begins - * @bytes: number of bytes to be written - * - * This is called from ntfs_file_buffered_write() with i_mutex held on the inode - * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are - * locked but not kmap()ped. The source data has already been copied into the - * @page. ntfs_prepare_pages_for_non_resident_write() has been called before - * the data was copied (for non-resident attributes only) and it returned - * success. 
- * - * Need to set uptodate and mark dirty all buffers within the boundary of the - * write. If all buffers in a page are uptodate we set the page uptodate, too. - * - * Setting the buffers dirty ensures that they get written out later when - * ntfs_writepage() is invoked by the VM. - * - * Finally, we need to update i_size and initialized_size as appropriate both - * in the inode and the mft record. - * - * This is modelled after fs/buffer.c::generic_commit_write(), which marks - * buffers uptodate and dirty, sets the page uptodate if all buffers in the - * page are uptodate, and updates i_size if the end of io is beyond i_size. In - * that case, it also marks the inode dirty. - * - * If things have gone as outlined in - * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page - * content modifications here for non-resident attributes. For resident - * attributes we need to do the uptodate bringing here which we combine with - * the copying into the mft record which means we save one atomic kmap. - * - * Return 0 on success or -errno on error. 
- */ -static int ntfs_commit_pages_after_write(struct page **pages, - const unsigned nr_pages, s64 pos, size_t bytes) -{ - s64 end, initialized_size; - loff_t i_size; - struct inode *vi; - ntfs_inode *ni, *base_ni; - struct page *page; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - char *kattr, *kaddr; - unsigned long flags; - u32 attr_len; - int err; - - BUG_ON(!nr_pages); - BUG_ON(!pages); - page = pages[0]; - BUG_ON(!page); - vi = page->mapping->host; - ni = NTFS_I(vi); - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " - "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", - vi->i_ino, ni->type, page->index, nr_pages, - (long long)pos, bytes); - if (NInoNonResident(ni)) - return ntfs_commit_pages_after_non_resident_write(pages, - nr_pages, pos, bytes); - BUG_ON(nr_pages > 1); - /* - * Attribute is resident, implying it is not compressed, encrypted, or - * sparse. - */ - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - BUG_ON(NInoNonResident(ni)); - /* Map, pin, and lock the mft record. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - a = ctx->attr; - BUG_ON(a->non_resident); - /* The total length of the attribute value. */ - attr_len = le32_to_cpu(a->data.resident.value_length); - i_size = i_size_read(vi); - BUG_ON(attr_len != i_size); - BUG_ON(pos > attr_len); - end = pos + bytes; - BUG_ON(end > le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset)); - kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); - kaddr = kmap_atomic(page); - /* Copy the received data from the page to the mft record. 
*/ - memcpy(kattr + pos, kaddr + pos, bytes); - /* Update the attribute length if necessary. */ - if (end > attr_len) { - attr_len = end; - a->data.resident.value_length = cpu_to_le32(attr_len); - } - /* - * If the page is not uptodate, bring the out of bounds area(s) - * uptodate by copying data from the mft record to the page. - */ - if (!PageUptodate(page)) { - if (pos > 0) - memcpy(kaddr, kattr, pos); - if (end < attr_len) - memcpy(kaddr + end, kattr + end, attr_len - end); - /* Zero the region outside the end of the attribute value. */ - memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len); - flush_dcache_page(page); - SetPageUptodate(page); - } - kunmap_atomic(kaddr); - /* Update initialized_size/i_size if necessary. */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - BUG_ON(end > ni->allocated_size); - read_unlock_irqrestore(&ni->size_lock, flags); - BUG_ON(initialized_size != i_size); - if (end > initialized_size) { - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = end; - i_size_write(vi, end); - write_unlock_irqrestore(&ni->size_lock, flags); - } - /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - ntfs_debug("Done."); - return 0; -err_out: - if (err == -ENOMEM) { - ntfs_warning(vi->i_sb, "Error allocating memory required to " - "commit the write."); - if (PageUptodate(page)) { - ntfs_warning(vi->i_sb, "Page is uptodate, setting " - "dirty so the write will be retried " - "later on by the VM."); - /* - * Put the page on mapping->dirty_pages, but leave its - * buffers' dirty state as-is. - */ - __set_page_dirty_nobuffers(page); - err = 0; - } else - ntfs_error(vi->i_sb, "Page is not uptodate. 
Written " - "data has been lost."); - } else { - ntfs_error(vi->i_sb, "Resident attribute commit write failed " - "with error %i.", err); - NVolSetErrors(ni->vol); - } - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - return err; -} - -/* - * Copy as much as we can into the pages and return the number of bytes which - * were successfully copied. If a fault is encountered then clear the pages - * out to (ofs + bytes) and return the number of bytes which were copied. - */ -static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, - unsigned ofs, struct iov_iter *i, size_t bytes) -{ - struct page **last_page = pages + nr_pages; - size_t total = 0; - unsigned len, copied; - - do { - len = PAGE_SIZE - ofs; - if (len > bytes) - len = bytes; - copied = copy_page_from_iter_atomic(*pages, ofs, len, i); - total += copied; - bytes -= copied; - if (!bytes) - break; - if (copied < len) - goto err; - ofs = 0; - } while (++pages < last_page); -out: - return total; -err: - /* Zero the rest of the target like __copy_from_user(). 
*/ - len = PAGE_SIZE - copied; - do { - if (len > bytes) - len = bytes; - zero_user(*pages, copied, len); - bytes -= len; - copied = 0; - len = PAGE_SIZE; - } while (++pages < last_page); - goto out; -} - -/** - * ntfs_perform_write - perform buffered write to a file - * @file: file to write to - * @i: iov_iter with data to write - * @pos: byte offset in file at which to begin writing to - */ -static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, - loff_t pos) -{ - struct address_space *mapping = file->f_mapping; - struct inode *vi = mapping->host; - ntfs_inode *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; - struct page *cached_page = NULL; - VCN last_vcn; - LCN lcn; - size_t bytes; - ssize_t status, written = 0; - unsigned nr_pages; - - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " - "0x%llx, count 0x%lx.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (unsigned long long)pos, - (unsigned long)iov_iter_count(i)); - /* - * If a previous ntfs_truncate() failed, repeat it and abort if it - * fails again. - */ - if (unlikely(NInoTruncateFailed(ni))) { - int err; - - inode_dio_wait(vi); - err = ntfs_truncate(vi); - if (err || NInoTruncateFailed(ni)) { - if (!err) - err = -EIO; - ntfs_error(vol->sb, "Cannot perform write to inode " - "0x%lx, attribute type 0x%x, because " - "ntfs_truncate() failed (error code " - "%i).", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - return err; - } - } - /* - * Determine the number of pages per cluster for non-resident - * attributes. 
- */ - nr_pages = 1; - if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni)) - nr_pages = vol->cluster_size >> PAGE_SHIFT; - last_vcn = -1; - do { - VCN vcn; - pgoff_t start_idx; - unsigned ofs, do_pages, u; - size_t copied; - - start_idx = pos >> PAGE_SHIFT; - ofs = pos & ~PAGE_MASK; - bytes = PAGE_SIZE - ofs; - do_pages = 1; - if (nr_pages > 1) { - vcn = pos >> vol->cluster_size_bits; - if (vcn != last_vcn) { - last_vcn = vcn; - /* - * Get the lcn of the vcn the write is in. If - * it is a hole, need to lock down all pages in - * the cluster. - */ - down_read(&ni->runlist.lock); - lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >> - vol->cluster_size_bits, false); - up_read(&ni->runlist.lock); - if (unlikely(lcn < LCN_HOLE)) { - if (lcn == LCN_ENOMEM) - status = -ENOMEM; - else { - status = -EIO; - ntfs_error(vol->sb, "Cannot " - "perform write to " - "inode 0x%lx, " - "attribute type 0x%x, " - "because the attribute " - "is corrupt.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - } - break; - } - if (lcn == LCN_HOLE) { - start_idx = (pos & ~(s64) - vol->cluster_size_mask) - >> PAGE_SHIFT; - bytes = vol->cluster_size - (pos & - vol->cluster_size_mask); - do_pages = nr_pages; - } - } - } - if (bytes > iov_iter_count(i)) - bytes = iov_iter_count(i); -again: - /* - * Bring in the user page(s) that we will copy from _first_. - * Otherwise there is a nasty deadlock on copying from the same - * page(s) as we are writing to, without it/them being marked - * up-to-date. Note, at present there is nothing to stop the - * pages being swapped out between us bringing them into memory - * and doing the actual copying. - */ - if (unlikely(fault_in_iov_iter_readable(i, bytes))) { - status = -EFAULT; - break; - } - /* Get and lock @do_pages starting at index @start_idx. 
*/ - status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, - pages, &cached_page); - if (unlikely(status)) - break; - /* - * For non-resident attributes, we need to fill any holes with - * actual clusters and ensure all bufferes are mapped. We also - * need to bring uptodate any buffers that are only partially - * being written to. - */ - if (NInoNonResident(ni)) { - status = ntfs_prepare_pages_for_non_resident_write( - pages, do_pages, pos, bytes); - if (unlikely(status)) { - do { - unlock_page(pages[--do_pages]); - put_page(pages[do_pages]); - } while (do_pages); - break; - } - } - u = (pos >> PAGE_SHIFT) - pages[0]->index; - copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs, - i, bytes); - ntfs_flush_dcache_pages(pages + u, do_pages - u); - status = 0; - if (likely(copied == bytes)) { - status = ntfs_commit_pages_after_write(pages, do_pages, - pos, bytes); - } - do { - unlock_page(pages[--do_pages]); - put_page(pages[do_pages]); - } while (do_pages); - if (unlikely(status < 0)) { - iov_iter_revert(i, copied); - break; - } - cond_resched(); - if (unlikely(copied < bytes)) { - iov_iter_revert(i, copied); - if (copied) - bytes = copied; - else if (bytes > PAGE_SIZE - ofs) - bytes = PAGE_SIZE - ofs; - goto again; - } - pos += copied; - written += copied; - balance_dirty_pages_ratelimited(mapping); - if (fatal_signal_pending(current)) { - status = -EINTR; - break; - } - } while (iov_iter_count(i)); - if (cached_page) - put_page(cached_page); - ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", - written ? "written" : "status", (unsigned long)written, - (long)status); - return written ? 
written : status; -} - -/** - * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock() - * @iocb: IO state structure - * @from: iov_iter with data to write - * - * Basically the same as generic_file_write_iter() except that it ends up - * up calling ntfs_perform_write() instead of generic_perform_write() and that - * O_DIRECT is not implemented. - */ -static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *vi = file_inode(file); - ssize_t written = 0; - ssize_t err; - - inode_lock(vi); - /* We can write back this queue in page reclaim. */ - err = ntfs_prepare_file_for_write(iocb, from); - if (iov_iter_count(from) && !err) - written = ntfs_perform_write(file, from, iocb->ki_pos); - inode_unlock(vi); - iocb->ki_pos += written; - if (likely(written > 0)) - written = generic_write_sync(iocb, written); - return written ? written : err; -} - -/** - * ntfs_file_fsync - sync a file to disk - * @filp: file to be synced - * @datasync: if non-zero only flush user data and not metadata - * - * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync - * system calls. This function is inspired by fs/buffer.c::file_fsync(). - * - * If @datasync is false, write the mft record and all associated extent mft - * records as well as the $DATA attribute and then sync the block device. - * - * If @datasync is true and the attribute is non-resident, we skip the writing - * of the mft record and all associated extent mft records (this might still - * happen due to the write_inode_now() call). - * - * Also, if @datasync is true, we do not wait on the inode to be written out - * but we always wait on the page cache pages to be written out. - * - * Locking: Caller must hold i_mutex on the inode. 
- * - * TODO: We should probably also write all attribute/index inodes associated - * with this inode but since we have no simple way of getting to them we ignore - * this problem for now. - */ -static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *vi = filp->f_mapping->host; - int err, ret = 0; - - ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - - err = file_write_and_wait_range(filp, start, end); - if (err) - return err; - inode_lock(vi); - - BUG_ON(S_ISDIR(vi->i_mode)); - if (!datasync || !NInoNonResident(NTFS_I(vi))) - ret = __ntfs_write_inode(vi, 1); - write_inode_now(vi, !datasync); - /* - * NOTE: If we were to use mapping->private_list (see ext2 and - * fs/buffer.c) for dirty blocks then we could optimize the below to be - * sync_mapping_buffers(vi->i_mapping). - */ - err = sync_blockdev(vi->i_sb->s_bdev); - if (unlikely(err && !ret)) - ret = err; - if (likely(!ret)) - ntfs_debug("Done."); - else - ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " - "%u.", datasync ? "data" : "", vi->i_ino, -ret); - inode_unlock(vi); - return ret; -} - -#endif /* NTFS_RW */ - -const struct file_operations ntfs_file_ops = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, -#ifdef NTFS_RW - .write_iter = ntfs_file_write_iter, - .fsync = ntfs_file_fsync, -#endif /* NTFS_RW */ - .mmap = generic_file_mmap, - .open = ntfs_file_open, - .splice_read = filemap_splice_read, -}; - -const struct inode_operations ntfs_file_inode_ops = { -#ifdef NTFS_RW - .setattr = ntfs_setattr, -#endif /* NTFS_RW */ -}; - -const struct file_operations ntfs_empty_file_ops = {}; - -const struct inode_operations ntfs_empty_inode_ops = {}; diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c deleted file mode 100644 index d46c2c03a032..000000000000 --- a/fs/ntfs/index.c +++ /dev/null @@ -1,440 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * index.c - NTFS kernel index handling. 
Part of the Linux-NTFS project. - * - * Copyright (c) 2004-2005 Anton Altaparmakov - */ - -#include <linux/slab.h> - -#include "aops.h" -#include "collate.h" -#include "debug.h" -#include "index.h" -#include "ntfs.h" - -/** - * ntfs_index_ctx_get - allocate and initialize a new index context - * @idx_ni: ntfs index inode with which to initialize the context - * - * Allocate a new index context, initialize it with @idx_ni and return it. - * Return NULL if allocation failed. - * - * Locking: Caller must hold i_mutex on the index inode. - */ -ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) -{ - ntfs_index_context *ictx; - - ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS); - if (ictx) - *ictx = (ntfs_index_context){ .idx_ni = idx_ni }; - return ictx; -} - -/** - * ntfs_index_ctx_put - release an index context - * @ictx: index context to free - * - * Release the index context @ictx, releasing all associated resources. - * - * Locking: Caller must hold i_mutex on the index inode. - */ -void ntfs_index_ctx_put(ntfs_index_context *ictx) -{ - if (ictx->entry) { - if (ictx->is_in_root) { - if (ictx->actx) - ntfs_attr_put_search_ctx(ictx->actx); - if (ictx->base_ni) - unmap_mft_record(ictx->base_ni); - } else { - struct page *page = ictx->page; - if (page) { - BUG_ON(!PageLocked(page)); - unlock_page(page); - ntfs_unmap_page(page); - } - } - } - kmem_cache_free(ntfs_index_ctx_cache, ictx); - return; -} - -/** - * ntfs_index_lookup - find a key in an index and return its index entry - * @key: [IN] key for which to search in the index - * @key_len: [IN] length of @key in bytes - * @ictx: [IN/OUT] context describing the index and the returned entry - * - * Before calling ntfs_index_lookup(), @ictx must have been obtained from a - * call to ntfs_index_ctx_get(). - * - * Look for the @key in the index specified by the index lookup context @ictx. - * ntfs_index_lookup() walks the contents of the index looking for the @key. 
- * - * If the @key is found in the index, 0 is returned and @ictx is setup to - * describe the index entry containing the matching @key. @ictx->entry is the - * index entry and @ictx->data and @ictx->data_len are the index entry data and - * its length in bytes, respectively. - * - * If the @key is not found in the index, -ENOENT is returned and @ictx is - * setup to describe the index entry whose key collates immediately after the - * search @key, i.e. this is the position in the index at which an index entry - * with a key of @key would need to be inserted. - * - * If an error occurs return the negative error code and @ictx is left - * untouched. - * - * When finished with the entry and its data, call ntfs_index_ctx_put() to free - * the context and other associated resources. - * - * If the index entry was modified, call flush_dcache_index_entry_page() - * immediately after the modification and either ntfs_index_entry_mark_dirty() - * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to - * ensure that the changes are written to disk. - * - * Locking: - Caller must hold i_mutex on the index inode. - * - Each page cache page in the index allocation mapping must be - * locked whilst being accessed otherwise we may find a corrupt - * page due to it being under ->writepage at the moment which - * applies the mst protection fixups before writing out and then - * removes them again after the write is complete after which it - * unlocks the page. 
- */ -int ntfs_index_lookup(const void *key, const int key_len, - ntfs_index_context *ictx) -{ - VCN vcn, old_vcn; - ntfs_inode *idx_ni = ictx->idx_ni; - ntfs_volume *vol = idx_ni->vol; - struct super_block *sb = vol->sb; - ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino; - MFT_RECORD *m; - INDEX_ROOT *ir; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *index_end, *kaddr; - ntfs_attr_search_ctx *actx; - struct address_space *ia_mapping; - struct page *page; - int rc, err = 0; - - ntfs_debug("Entering."); - BUG_ON(!NInoAttr(idx_ni)); - BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION); - BUG_ON(idx_ni->nr_extents != -1); - BUG_ON(!base_ni); - BUG_ON(!key); - BUG_ON(key_len <= 0); - if (!ntfs_is_collation_rule_supported( - idx_ni->itype.index.collation_rule)) { - ntfs_error(sb, "Index uses unsupported collation rule 0x%x. " - "Aborting lookup.", le32_to_cpu( - idx_ni->itype.index.collation_rule)); - return -EOPNOTSUPP; - } - /* Get hold of the mft record for the index inode. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - ntfs_error(sb, "map_mft_record() failed with error code %ld.", - -PTR_ERR(m)); - return PTR_ERR(m); - } - actx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!actx)) { - err = -ENOMEM; - goto err_out; - } - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, actx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(sb, "Index root attribute missing in inode " - "0x%lx.", idx_ni->mft_no); - err = -EIO; - } - goto err_out; - } - /* Get to the index root value (it has been verified in read_inode). */ - ir = (INDEX_ROOT*)((u8*)actx->attr + - le16_to_cpu(actx->attr->data.resident.value_offset)); - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. 
*/ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks. */ - if ((u8*)ie < (u8*)actx->mrec || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->length) > index_end) - goto idx_err_out; - /* - * The last entry cannot contain a key. It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Further bounds checks. */ - if ((u32)sizeof(INDEX_ENTRY_HEADER) + - le16_to_cpu(ie->key_length) > - le16_to_cpu(ie->data.vi.data_offset) || - (u32)le16_to_cpu(ie->data.vi.data_offset) + - le16_to_cpu(ie->data.vi.data_length) > - le16_to_cpu(ie->length)) - goto idx_err_out; - /* If the keys match perfectly, we setup @ictx and return 0. */ - if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, - &ie->key, key_len)) { -ir_done: - ictx->is_in_root = true; - ictx->ir = ir; - ictx->actx = actx; - ictx->base_ni = base_ni; - ictx->ia = NULL; - ictx->page = NULL; -done: - ictx->entry = ie; - ictx->data = (u8*)ie + - le16_to_cpu(ie->data.vi.data_offset); - ictx->data_len = le16_to_cpu(ie->data.vi.data_length); - ntfs_debug("Done."); - return err; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, - key_len, &ie->key, le16_to_cpu(ie->key_length)); - /* - * If @key collates before the key of the current entry, there - * is definitely no such key in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* - * A match should never happen as the memcmp() call should have - * cought it, but we still treat it correctly. 
- */ - if (!rc) - goto ir_done; - /* The keys are not equal, continue the search. */ - } - /* - * We have finished with this index without success. Check for the - * presence of a child node and if not present setup @ictx and return - * -ENOENT. - */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - ntfs_debug("Entry not found."); - err = -ENOENT; - goto ir_done; - } /* Child node present, descend into it. */ - /* Consistency check: Verify that an index allocation exists. */ - if (!NInoIndexAllocPresent(idx_ni)) { - ntfs_error(sb, "No index allocation attribute but index entry " - "requires one. Inode 0x%lx is corrupt or " - "driver bug.", idx_ni->mft_no); - goto err_out; - } - /* Get the starting vcn of the index_block holding the child node. */ - vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); - ia_mapping = VFS_I(idx_ni)->i_mapping; - /* - * We are done with the index root and the mft record. Release them, - * otherwise we deadlock with ntfs_map_page(). - */ - ntfs_attr_put_search_ctx(actx); - unmap_mft_record(base_ni); - m = NULL; - actx = NULL; -descend_into_child_node: - /* - * Convert vcn to index into the index allocation attribute in units - * of PAGE_SIZE and map the page cache page, reading it from - * disk if necessary. - */ - page = ntfs_map_page(ia_mapping, vcn << - idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(sb, "Failed to map index page, error %ld.", - -PTR_ERR(page)); - err = PTR_ERR(page); - goto err_out; - } - lock_page(page); - kaddr = (u8*)page_address(page); -fast_descend_into_child_node: - /* Get to the index allocation block. */ - ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); - /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Out of bounds check failed. Corrupt inode " - "0x%lx or driver bug.", idx_ni->mft_no); - goto unm_err_out; - } - /* Catch multi sector transfer fixup errors. 
*/ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. " - "Corrupt inode 0x%lx. Run chkdsk.", - (long long)vcn, idx_ni->mft_no); - goto unm_err_out; - } - if (sle64_to_cpu(ia->index_block_vcn) != vcn) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). Inode " - "0x%lx is corrupt or driver bug.", - (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)vcn, idx_ni->mft_no); - goto unm_err_out; - } - if (le32_to_cpu(ia->index.allocated_size) + 0x18 != - idx_ni->itype.index.block_size) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has " - "a size (%u) differing from the index " - "specified size (%u). Inode is corrupt or " - "driver bug.", (unsigned long long)vcn, - idx_ni->mft_no, - le32_to_cpu(ia->index.allocated_size) + 0x18, - idx_ni->itype.index.block_size); - goto unm_err_out; - } - index_end = (u8*)ia + idx_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx " - "crosses page boundary. Impossible! Cannot " - "access! This is probably a bug in the " - "driver.", (unsigned long long)vcn, - idx_ni->mft_no); - goto unm_err_out; - } - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (index_end > (u8*)ia + idx_ni->itype.index.block_size) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode " - "0x%lx exceeds maximum size.", - (unsigned long long)vcn, idx_ni->mft_no); - goto unm_err_out; - } - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Iterate similar to above big loop but applied to index buffer, thus - * loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks. 
*/ - if ((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->length) > index_end) { - ntfs_error(sb, "Index entry out of bounds in inode " - "0x%lx.", idx_ni->mft_no); - goto unm_err_out; - } - /* - * The last entry cannot contain a key. It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Further bounds checks. */ - if ((u32)sizeof(INDEX_ENTRY_HEADER) + - le16_to_cpu(ie->key_length) > - le16_to_cpu(ie->data.vi.data_offset) || - (u32)le16_to_cpu(ie->data.vi.data_offset) + - le16_to_cpu(ie->data.vi.data_length) > - le16_to_cpu(ie->length)) { - ntfs_error(sb, "Index entry out of bounds in inode " - "0x%lx.", idx_ni->mft_no); - goto unm_err_out; - } - /* If the keys match perfectly, we setup @ictx and return 0. */ - if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, - &ie->key, key_len)) { -ia_done: - ictx->is_in_root = false; - ictx->actx = NULL; - ictx->base_ni = NULL; - ictx->ia = ia; - ictx->page = page; - goto done; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, - key_len, &ie->key, le16_to_cpu(ie->key_length)); - /* - * If @key collates before the key of the current entry, there - * is definitely no such key in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* - * A match should never happen as the memcmp() call should have - * cought it, but we still treat it correctly. - */ - if (!rc) - goto ia_done; - /* The keys are not equal, continue the search. */ - } - /* - * We have finished with this index buffer without success. Check for - * the presence of a child node and if not present return -ENOENT. 
- */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - ntfs_debug("Entry not found."); - err = -ENOENT; - goto ia_done; - } - if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { - ntfs_error(sb, "Index entry with child node found in a leaf " - "node in inode 0x%lx.", idx_ni->mft_no); - goto unm_err_out; - } - /* Child node present, descend into it. */ - old_vcn = vcn; - vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); - if (vcn >= 0) { - /* - * If vcn is in the same page cache page as old_vcn we recycle - * the mapped page. - */ - if (old_vcn << vol->cluster_size_bits >> - PAGE_SHIFT == vcn << - vol->cluster_size_bits >> - PAGE_SHIFT) - goto fast_descend_into_child_node; - unlock_page(page); - ntfs_unmap_page(page); - goto descend_into_child_node; - } - ntfs_error(sb, "Negative child node vcn in inode 0x%lx.", - idx_ni->mft_no); -unm_err_out: - unlock_page(page); - ntfs_unmap_page(page); -err_out: - if (!err) - err = -EIO; - if (actx) - ntfs_attr_put_search_ctx(actx); - if (m) - unmap_mft_record(base_ni); - return err; -idx_err_out: - ntfs_error(sb, "Corrupt index. Aborting lookup."); - goto err_out; -} diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h deleted file mode 100644 index bb3c3ae55138..000000000000 --- a/fs/ntfs/index.h +++ /dev/null @@ -1,134 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * index.h - Defines for NTFS kernel index handling. Part of the Linux-NTFS - * project. 
- * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_INDEX_H -#define _LINUX_NTFS_INDEX_H - -#include <linux/fs.h> - -#include "types.h" -#include "layout.h" -#include "inode.h" -#include "attrib.h" -#include "mft.h" -#include "aops.h" - -/** - * @idx_ni: index inode containing the @entry described by this context - * @entry: index entry (points into @ir or @ia) - * @data: index entry data (points into @entry) - * @data_len: length in bytes of @data - * @is_in_root: 'true' if @entry is in @ir and 'false' if it is in @ia - * @ir: index root if @is_in_root and NULL otherwise - * @actx: attribute search context if @is_in_root and NULL otherwise - * @base_ni: base inode if @is_in_root and NULL otherwise - * @ia: index block if @is_in_root is 'false' and NULL otherwise - * @page: page if @is_in_root is 'false' and NULL otherwise - * - * @idx_ni is the index inode this context belongs to. - * - * @entry is the index entry described by this context. @data and @data_len - * are the index entry data and its length in bytes, respectively. @data - * simply points into @entry. This is probably what the user is interested in. - * - * If @is_in_root is 'true', @entry is in the index root attribute @ir described - * by the attribute search context @actx and the base inode @base_ni. @ia and - * @page are NULL in this case. - * - * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia - * and @page point to the index allocation block and the mapped, locked page it - * is in, respectively. @ir, @actx and @base_ni are NULL in this case. - * - * To obtain a context call ntfs_index_ctx_get(). - * - * We use this context to allow ntfs_index_lookup() to return the found index - * @entry and its @data without having to allocate a buffer and copy the @entry - * and/or its @data into it. - * - * When finished with the @entry and its @data, call ntfs_index_ctx_put() to - * free the context and other associated resources. 
- * - * If the index entry was modified, call flush_dcache_index_entry_page() - * immediately after the modification and either ntfs_index_entry_mark_dirty() - * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to - * ensure that the changes are written to disk. - */ -typedef struct { - ntfs_inode *idx_ni; - INDEX_ENTRY *entry; - void *data; - u16 data_len; - bool is_in_root; - INDEX_ROOT *ir; - ntfs_attr_search_ctx *actx; - ntfs_inode *base_ni; - INDEX_ALLOCATION *ia; - struct page *page; -} ntfs_index_context; - -extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni); -extern void ntfs_index_ctx_put(ntfs_index_context *ictx); - -extern int ntfs_index_lookup(const void *key, const int key_len, - ntfs_index_context *ictx); - -#ifdef NTFS_RW - -/** - * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries - * @ictx: ntfs index context describing the index entry - * - * Call flush_dcache_page() for the page in which an index entry resides. - * - * This must be called every time an index entry is modified, just after the - * modification. - * - * If the index entry is in the index root attribute, simply flush the page - * containing the mft record containing the index root attribute. - * - * If the index entry is in an index block belonging to the index allocation - * attribute, simply flush the page cache page containing the index block. - */ -static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx) -{ - if (ictx->is_in_root) - flush_dcache_mft_record_page(ictx->actx->ntfs_ino); - else - flush_dcache_page(ictx->page); -} - -/** - * ntfs_index_entry_mark_dirty - mark an index entry dirty - * @ictx: ntfs index context describing the index entry - * - * Mark the index entry described by the index entry context @ictx dirty. - * - * If the index entry is in the index root attribute, simply mark the mft - * record containing the index root attribute dirty. 
This ensures the mft - * record, and hence the index root attribute, will be written out to disk - * later. - * - * If the index entry is in an index block belonging to the index allocation - * attribute, mark the buffers belonging to the index record as well as the - * page cache page the index block is in dirty. This automatically marks the - * VFS inode of the ntfs index inode to which the index entry belongs dirty, - * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the - * dirty index block, will be written out to disk later. - */ -static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx) -{ - if (ictx->is_in_root) - mark_mft_record_dirty(ictx->actx->ntfs_ino); - else - mark_ntfs_record_dirty(ictx->page, - (u8*)ictx->ia - (u8*)page_address(ictx->page)); -} - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_INDEX_H */ diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c deleted file mode 100644 index aba1e22db4e9..000000000000 --- a/fs/ntfs/inode.c +++ /dev/null @@ -1,3102 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * inode.c - NTFS kernel inode handling. - * - * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. - */ - -#include <linux/buffer_head.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/mount.h> -#include <linux/mutex.h> -#include <linux/pagemap.h> -#include <linux/quotaops.h> -#include <linux/slab.h> -#include <linux/log2.h> - -#include "aops.h" -#include "attrib.h" -#include "bitmap.h" -#include "dir.h" -#include "debug.h" -#include "inode.h" -#include "lcnalloc.h" -#include "malloc.h" -#include "mft.h" -#include "time.h" -#include "ntfs.h" - -/** - * ntfs_test_inode - compare two (possibly fake) inodes for equality - * @vi: vfs inode which to test - * @data: data which is being tested with - * - * Compare the ntfs attribute embedded in the ntfs specific part of the vfs - * inode @vi for equality with the ntfs attribute @data. 
- * - * If searching for the normal file/directory inode, set @na->type to AT_UNUSED. - * @na->name and @na->name_len are then ignored. - * - * Return 1 if the attributes match and 0 if not. - * - * NOTE: This function runs with the inode_hash_lock spin lock held so it is not - * allowed to sleep. - */ -int ntfs_test_inode(struct inode *vi, void *data) -{ - ntfs_attr *na = (ntfs_attr *)data; - ntfs_inode *ni; - - if (vi->i_ino != na->mft_no) - return 0; - ni = NTFS_I(vi); - /* If !NInoAttr(ni), @vi is a normal file or directory inode. */ - if (likely(!NInoAttr(ni))) { - /* If not looking for a normal inode this is a mismatch. */ - if (unlikely(na->type != AT_UNUSED)) - return 0; - } else { - /* A fake inode describing an attribute. */ - if (ni->type != na->type) - return 0; - if (ni->name_len != na->name_len) - return 0; - if (na->name_len && memcmp(ni->name, na->name, - na->name_len * sizeof(ntfschar))) - return 0; - } - /* Match! */ - return 1; -} - -/** - * ntfs_init_locked_inode - initialize an inode - * @vi: vfs inode to initialize - * @data: data which to initialize @vi to - * - * Initialize the vfs inode @vi with the values from the ntfs attribute @data in - * order to enable ntfs_test_inode() to do its work. - * - * If initializing the normal file/directory inode, set @na->type to AT_UNUSED. - * In that case, @na->name and @na->name_len should be set to NULL and 0, - * respectively. Although that is not strictly necessary as - * ntfs_read_locked_inode() will fill them in later. - * - * Return 0 on success and -errno on error. - * - * NOTE: This function runs with the inode->i_lock spin lock held so it is not - * allowed to sleep. (Hence the GFP_ATOMIC allocation.) 
- */ -static int ntfs_init_locked_inode(struct inode *vi, void *data) -{ - ntfs_attr *na = (ntfs_attr *)data; - ntfs_inode *ni = NTFS_I(vi); - - vi->i_ino = na->mft_no; - - ni->type = na->type; - if (na->type == AT_INDEX_ALLOCATION) - NInoSetMstProtected(ni); - - ni->name = na->name; - ni->name_len = na->name_len; - - /* If initializing a normal inode, we are done. */ - if (likely(na->type == AT_UNUSED)) { - BUG_ON(na->name); - BUG_ON(na->name_len); - return 0; - } - - /* It is a fake inode. */ - NInoSetAttr(ni); - - /* - * We have I30 global constant as an optimization as it is the name - * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC - * allocation but that is ok. And most attributes are unnamed anyway, - * thus the fraction of named attributes with name != I30 is actually - * absolutely tiny. - */ - if (na->name_len && na->name != I30) { - unsigned int i; - - BUG_ON(!na->name); - i = na->name_len * sizeof(ntfschar); - ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC); - if (!ni->name) - return -ENOMEM; - memcpy(ni->name, na->name, i); - ni->name[na->name_len] = 0; - } - return 0; -} - -static int ntfs_read_locked_inode(struct inode *vi); -static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi); -static int ntfs_read_locked_index_inode(struct inode *base_vi, - struct inode *vi); - -/** - * ntfs_iget - obtain a struct inode corresponding to a specific normal inode - * @sb: super block of mounted volume - * @mft_no: mft record number / inode number to obtain - * - * Obtain the struct inode corresponding to a specific normal inode (i.e. a - * file or directory). - * - * If the inode is in the cache, it is just returned with an increased - * reference count. Otherwise, a new struct inode is allocated and initialized, - * and finally ntfs_read_locked_inode() is called to read in the inode and - * fill in the remainder of the inode structure. - * - * Return the struct inode on success. 
Check the return value with IS_ERR() and - * if true, the function failed and the error code is obtained from PTR_ERR(). - */ -struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no) -{ - struct inode *vi; - int err; - ntfs_attr na; - - na.mft_no = mft_no; - na.type = AT_UNUSED; - na.name = NULL; - na.name_len = 0; - - vi = iget5_locked(sb, mft_no, ntfs_test_inode, - ntfs_init_locked_inode, &na); - if (unlikely(!vi)) - return ERR_PTR(-ENOMEM); - - err = 0; - - /* If this is a freshly allocated inode, need to read it now. */ - if (vi->i_state & I_NEW) { - err = ntfs_read_locked_inode(vi); - unlock_new_inode(vi); - } - /* - * There is no point in keeping bad inodes around if the failure was - * due to ENOMEM. We want to be able to retry again later. - */ - if (unlikely(err == -ENOMEM)) { - iput(vi); - vi = ERR_PTR(err); - } - return vi; -} - -/** - * ntfs_attr_iget - obtain a struct inode corresponding to an attribute - * @base_vi: vfs base inode containing the attribute - * @type: attribute type - * @name: Unicode name of the attribute (NULL if unnamed) - * @name_len: length of @name in Unicode characters (0 if unnamed) - * - * Obtain the (fake) struct inode corresponding to the attribute specified by - * @type, @name, and @name_len, which is present in the base mft record - * specified by the vfs inode @base_vi. - * - * If the attribute inode is in the cache, it is just returned with an - * increased reference count. Otherwise, a new struct inode is allocated and - * initialized, and finally ntfs_read_locked_attr_inode() is called to read the - * attribute and fill in the inode structure. - * - * Note, for index allocation attributes, you need to use ntfs_index_iget() - * instead of ntfs_attr_iget() as working with indices is a lot more complex. - * - * Return the struct inode of the attribute inode on success. Check the return - * value with IS_ERR() and if true, the function failed and the error code is - * obtained from PTR_ERR(). 
- */ -struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, - ntfschar *name, u32 name_len) -{ - struct inode *vi; - int err; - ntfs_attr na; - - /* Make sure no one calls ntfs_attr_iget() for indices. */ - BUG_ON(type == AT_INDEX_ALLOCATION); - - na.mft_no = base_vi->i_ino; - na.type = type; - na.name = name; - na.name_len = name_len; - - vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode, - ntfs_init_locked_inode, &na); - if (unlikely(!vi)) - return ERR_PTR(-ENOMEM); - - err = 0; - - /* If this is a freshly allocated inode, need to read it now. */ - if (vi->i_state & I_NEW) { - err = ntfs_read_locked_attr_inode(base_vi, vi); - unlock_new_inode(vi); - } - /* - * There is no point in keeping bad attribute inodes around. This also - * simplifies things in that we never need to check for bad attribute - * inodes elsewhere. - */ - if (unlikely(err)) { - iput(vi); - vi = ERR_PTR(err); - } - return vi; -} - -/** - * ntfs_index_iget - obtain a struct inode corresponding to an index - * @base_vi: vfs base inode containing the index related attributes - * @name: Unicode name of the index - * @name_len: length of @name in Unicode characters - * - * Obtain the (fake) struct inode corresponding to the index specified by @name - * and @name_len, which is present in the base mft record specified by the vfs - * inode @base_vi. - * - * If the index inode is in the cache, it is just returned with an increased - * reference count. Otherwise, a new struct inode is allocated and - * initialized, and finally ntfs_read_locked_index_inode() is called to read - * the index related attributes and fill in the inode structure. - * - * Return the struct inode of the index inode on success. Check the return - * value with IS_ERR() and if true, the function failed and the error code is - * obtained from PTR_ERR(). 
- */ -struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, - u32 name_len) -{ - struct inode *vi; - int err; - ntfs_attr na; - - na.mft_no = base_vi->i_ino; - na.type = AT_INDEX_ALLOCATION; - na.name = name; - na.name_len = name_len; - - vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode, - ntfs_init_locked_inode, &na); - if (unlikely(!vi)) - return ERR_PTR(-ENOMEM); - - err = 0; - - /* If this is a freshly allocated inode, need to read it now. */ - if (vi->i_state & I_NEW) { - err = ntfs_read_locked_index_inode(base_vi, vi); - unlock_new_inode(vi); - } - /* - * There is no point in keeping bad index inodes around. This also - * simplifies things in that we never need to check for bad index - * inodes elsewhere. - */ - if (unlikely(err)) { - iput(vi); - vi = ERR_PTR(err); - } - return vi; -} - -struct inode *ntfs_alloc_big_inode(struct super_block *sb) -{ - ntfs_inode *ni; - - ntfs_debug("Entering."); - ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS); - if (likely(ni != NULL)) { - ni->state = 0; - return VFS_I(ni); - } - ntfs_error(sb, "Allocation of NTFS big inode structure failed."); - return NULL; -} - -void ntfs_free_big_inode(struct inode *inode) -{ - kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); -} - -static inline ntfs_inode *ntfs_alloc_extent_inode(void) -{ - ntfs_inode *ni; - - ntfs_debug("Entering."); - ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS); - if (likely(ni != NULL)) { - ni->state = 0; - return ni; - } - ntfs_error(NULL, "Allocation of NTFS inode structure failed."); - return NULL; -} - -static void ntfs_destroy_extent_inode(ntfs_inode *ni) -{ - ntfs_debug("Entering."); - BUG_ON(ni->page); - if (!atomic_dec_and_test(&ni->count)) - BUG(); - kmem_cache_free(ntfs_inode_cache, ni); -} - -/* - * The attribute runlist lock has separate locking rules from the - * normal runlist lock, so split the two lock-classes: - */ -static struct lock_class_key attr_list_rl_lock_class; - -/** - * __ntfs_init_inode - 
initialize ntfs specific part of an inode - * @sb: super block of mounted volume - * @ni: freshly allocated ntfs inode which to initialize - * - * Initialize an ntfs inode to defaults. - * - * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left - * untouched. Make sure to initialize them elsewhere. - * - * Return zero on success and -ENOMEM on error. - */ -void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni) -{ - ntfs_debug("Entering."); - rwlock_init(&ni->size_lock); - ni->initialized_size = ni->allocated_size = 0; - ni->seq_no = 0; - atomic_set(&ni->count, 1); - ni->vol = NTFS_SB(sb); - ntfs_init_runlist(&ni->runlist); - mutex_init(&ni->mrec_lock); - ni->page = NULL; - ni->page_ofs = 0; - ni->attr_list_size = 0; - ni->attr_list = NULL; - ntfs_init_runlist(&ni->attr_list_rl); - lockdep_set_class(&ni->attr_list_rl.lock, - &attr_list_rl_lock_class); - ni->itype.index.block_size = 0; - ni->itype.index.vcn_size = 0; - ni->itype.index.collation_rule = 0; - ni->itype.index.block_size_bits = 0; - ni->itype.index.vcn_size_bits = 0; - mutex_init(&ni->extent_lock); - ni->nr_extents = 0; - ni->ext.base_ntfs_ino = NULL; -} - -/* - * Extent inodes get MFT-mapped in a nested way, while the base inode - * is still mapped. 
Teach this nesting to the lock validator by creating - * a separate class for nested inode's mrec_lock's: - */ -static struct lock_class_key extent_inode_mrec_lock_key; - -inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, - unsigned long mft_no) -{ - ntfs_inode *ni = ntfs_alloc_extent_inode(); - - ntfs_debug("Entering."); - if (likely(ni != NULL)) { - __ntfs_init_inode(sb, ni); - lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key); - ni->mft_no = mft_no; - ni->type = AT_UNUSED; - ni->name = NULL; - ni->name_len = 0; - } - return ni; -} - -/** - * ntfs_is_extended_system_file - check if a file is in the $Extend directory - * @ctx: initialized attribute search context - * - * Search all file name attributes in the inode described by the attribute - * search context @ctx and check if any of the names are in the $Extend system - * directory. - * - * Return values: - * 1: file is in $Extend directory - * 0: file is not in $Extend directory - * -errno: failed to determine if the file is in the $Extend directory - */ -static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx) -{ - int nr_links, err; - - /* Restart search. */ - ntfs_attr_reinit_search_ctx(ctx); - - /* Get number of hard links. */ - nr_links = le16_to_cpu(ctx->mrec->link_count); - - /* Loop through all hard links. */ - while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0, - ctx))) { - FILE_NAME_ATTR *file_name_attr; - ATTR_RECORD *attr = ctx->attr; - u8 *p, *p2; - - nr_links--; - /* - * Maximum sanity checking as we are called on an inode that - * we suspect might be corrupt. - */ - p = (u8*)attr + le32_to_cpu(attr->length); - if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_in_use)) { -err_corrupt_attr: - ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name " - "attribute. You should run chkdsk."); - return -EIO; - } - if (attr->non_resident) { - ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file " - "name. 
You should run chkdsk."); - return -EIO; - } - if (attr->flags) { - ntfs_error(ctx->ntfs_ino->vol->sb, "File name with " - "invalid flags. You should run " - "chkdsk."); - return -EIO; - } - if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) { - ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file " - "name. You should run chkdsk."); - return -EIO; - } - file_name_attr = (FILE_NAME_ATTR*)((u8*)attr + - le16_to_cpu(attr->data.resident.value_offset)); - p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length); - if (p2 < (u8*)attr || p2 > p) - goto err_corrupt_attr; - /* This attribute is ok, but is it in the $Extend directory? */ - if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend) - return 1; /* YES, it's an extended system file. */ - } - if (unlikely(err != -ENOENT)) - return err; - if (unlikely(nr_links)) { - ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count " - "doesn't match number of name attributes. You " - "should run chkdsk."); - return -EIO; - } - return 0; /* NO, it is not an extended system file. */ -} - -/** - * ntfs_read_locked_inode - read an inode from its device - * @vi: inode to read - * - * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode - * described by @vi into memory from the device. - * - * The only fields in @vi that we need to/can look at when the function is - * called are i_sb, pointing to the mounted device's super block, and i_ino, - * the number of the inode to load. - * - * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino - * for reading and sets up the necessary @vi fields as well as initializing - * the ntfs inode. - * - * Q: What locks are held when the function is called? - * A: i_state has I_NEW set, hence the inode is locked, also - * i_count is set to 1, so it is not going to go away - * i_flags is set to 0 and we have no business touching it. Only an ioctl() - * is allowed to write to them. 
We should of course be honouring them but - * we need to do that using the IS_* macros defined in include/linux/fs.h. - * In any case ntfs_read_locked_inode() has nothing to do with i_flags. - * - * Return 0 on success and -errno on error. In the error case, the inode will - * have had make_bad_inode() executed on it. - */ -static int ntfs_read_locked_inode(struct inode *vi) -{ - ntfs_volume *vol = NTFS_SB(vi->i_sb); - ntfs_inode *ni; - struct inode *bvi; - MFT_RECORD *m; - ATTR_RECORD *a; - STANDARD_INFORMATION *si; - ntfs_attr_search_ctx *ctx; - int err = 0; - - ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); - - /* Setup the generic vfs inode parts now. */ - vi->i_uid = vol->uid; - vi->i_gid = vol->gid; - vi->i_mode = 0; - - /* - * Initialize the ntfs specific part of @vi special casing - * FILE_MFT which we need to do at mount time. - */ - if (vi->i_ino != FILE_MFT) - ntfs_init_big_inode(vi); - ni = NTFS_I(vi); - - m = map_mft_record(ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ni, m); - if (!ctx) { - err = -ENOMEM; - goto unm_err_out; - } - - if (!(m->flags & MFT_RECORD_IN_USE)) { - ntfs_error(vi->i_sb, "Inode is not in use!"); - goto unm_err_out; - } - if (m->base_mft_record) { - ntfs_error(vi->i_sb, "Inode is an extent inode!"); - goto unm_err_out; - } - - /* Transfer information from mft record into vfs and ntfs inodes. */ - vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); - - /* - * FIXME: Keep in mind that link_count is two for files which have both - * a long file name and a short file name as separate entries, so if - * we are hiding short file names this will be too high. Either we need - * to account for the short file names by subtracting them or we need - * to make sure we delete files even though i_nlink is not zero which - * might be tricky due to vfs interactions. Need to think about this - * some more when implementing the unlink command. 
- */ - set_nlink(vi, le16_to_cpu(m->link_count)); - /* - * FIXME: Reparse points can have the directory bit set even though - * they would be S_IFLNK. Need to deal with this further below when we - * implement reparse points / symbolic links but it will do for now. - * Also if not a directory, it could be something else, rather than - * a regular file. But again, will do for now. - */ - /* Everyone gets all permissions. */ - vi->i_mode |= S_IRWXUGO; - /* If read-only, no one gets write permissions. */ - if (IS_RDONLY(vi)) - vi->i_mode &= ~S_IWUGO; - if (m->flags & MFT_RECORD_IS_DIRECTORY) { - vi->i_mode |= S_IFDIR; - /* - * Apply the directory permissions mask set in the mount - * options. - */ - vi->i_mode &= ~vol->dmask; - /* Things break without this kludge! */ - if (vi->i_nlink > 1) - set_nlink(vi, 1); - } else { - vi->i_mode |= S_IFREG; - /* Apply the file permissions mask set in the mount options. */ - vi->i_mode &= ~vol->fmask; - } - /* - * Find the standard information attribute in the mft record. At this - * stage we haven't setup the attribute list stuff yet, so this could - * in fact fail if the standard information is in an extent record, but - * I don't think this actually ever happens. - */ - err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0, - ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - /* - * TODO: We should be performing a hot fix here (if the - * recover mount option is set) by creating a new - * attribute. - */ - ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute " - "is missing."); - } - goto unm_err_out; - } - a = ctx->attr; - /* Get the standard information attribute value. 
*/ - if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset) - + le32_to_cpu(a->data.resident.value_length) > - (u8 *)ctx->mrec + vol->mft_record_size) { - ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode."); - goto unm_err_out; - } - si = (STANDARD_INFORMATION*)((u8*)a + - le16_to_cpu(a->data.resident.value_offset)); - - /* Transfer information from the standard information into vi. */ - /* - * Note: The i_?times do not quite map perfectly onto the NTFS times, - * but they are close enough, and in the end it doesn't really matter - * that much... - */ - /* - * mtime is the last change of the data within the file. Not changed - * when only metadata is changed, e.g. a rename doesn't affect mtime. - */ - inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time)); - /* - * ctime is the last change of the metadata of the file. This obviously - * always changes, when mtime is changed. ctime can be changed on its - * own, mtime is then not changed, e.g. when a file is renamed. - */ - inode_set_ctime_to_ts(vi, ntfs2utc(si->last_mft_change_time)); - /* - * Last access to the data within the file. Not changed during a rename - * for example but changed whenever the file is written to. - */ - inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time)); - - /* Find the attribute list attribute if present. 
*/ - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); - if (err) { - if (unlikely(err != -ENOENT)) { - ntfs_error(vi->i_sb, "Failed to lookup attribute list " - "attribute."); - goto unm_err_out; - } - } else /* if (!err) */ { - if (vi->i_ino == FILE_MFT) - goto skip_attr_list_load; - ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino); - NInoSetAttrList(ni); - a = ctx->attr; - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(vi->i_sb, "Attribute list attribute is " - "compressed."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_ENCRYPTED || - a->flags & ATTR_IS_SPARSE) { - if (a->non_resident) { - ntfs_error(vi->i_sb, "Non-resident attribute " - "list attribute is encrypted/" - "sparse."); - goto unm_err_out; - } - ntfs_warning(vi->i_sb, "Resident attribute list " - "attribute in inode 0x%lx is marked " - "encrypted/sparse which is not true. " - "However, Windows allows this and " - "chkdsk does not detect or correct it " - "so we will just ignore the invalid " - "flags and pretend they are not set.", - vi->i_ino); - } - /* Now allocate memory for the attribute list. */ - ni->attr_list_size = (u32)ntfs_attr_size(a); - ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); - if (!ni->attr_list) { - ntfs_error(vi->i_sb, "Not enough memory to allocate " - "buffer for attribute list."); - err = -ENOMEM; - goto unm_err_out; - } - if (a->non_resident) { - NInoSetAttrListNonResident(ni); - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "Attribute list has non " - "zero lowest_vcn."); - goto unm_err_out; - } - /* - * Setup the runlist. No need for locking as we have - * exclusive access to the inode at this time. 
- */ - ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, - a, NULL); - if (IS_ERR(ni->attr_list_rl.rl)) { - err = PTR_ERR(ni->attr_list_rl.rl); - ni->attr_list_rl.rl = NULL; - ntfs_error(vi->i_sb, "Mapping pairs " - "decompression failed."); - goto unm_err_out; - } - /* Now load the attribute list. */ - if ((err = load_attribute_list(vol, &ni->attr_list_rl, - ni->attr_list, ni->attr_list_size, - sle64_to_cpu(a->data.non_resident. - initialized_size)))) { - ntfs_error(vi->i_sb, "Failed to load " - "attribute list attribute."); - goto unm_err_out; - } - } else /* if (!a->non_resident) */ { - if ((u8*)a + le16_to_cpu(a->data.resident.value_offset) - + le32_to_cpu( - a->data.resident.value_length) > - (u8*)ctx->mrec + vol->mft_record_size) { - ntfs_error(vi->i_sb, "Corrupt attribute list " - "in inode."); - goto unm_err_out; - } - /* Now copy the attribute list. */ - memcpy(ni->attr_list, (u8*)a + le16_to_cpu( - a->data.resident.value_offset), - le32_to_cpu( - a->data.resident.value_length)); - } - } -skip_attr_list_load: - /* - * If an attribute list is present we now have the attribute list value - * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes. - */ - if (S_ISDIR(vi->i_mode)) { - loff_t bvi_size; - ntfs_inode *bni; - INDEX_ROOT *ir; - u8 *ir_end, *index_end; - - /* It is a directory, find index root attribute. */ - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, - 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - // FIXME: File is corrupt! Hot-fix with empty - // index root attribute if recovery option is - // set. - ntfs_error(vi->i_sb, "$INDEX_ROOT attribute " - "is missing."); - } - goto unm_err_out; - } - a = ctx->attr; - /* Set up the state. */ - if (unlikely(a->non_resident)) { - ntfs_error(vol->sb, "$INDEX_ROOT attribute is not " - "resident."); - goto unm_err_out; - } - /* Ensure the attribute name is placed before the value. 
*/ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu(a->data.resident.value_offset)))) { - ntfs_error(vol->sb, "$INDEX_ROOT attribute name is " - "placed after the attribute value."); - goto unm_err_out; - } - /* - * Compressed/encrypted index root just means that the newly - * created files in that directory should be created compressed/ - * encrypted. However index root cannot be both compressed and - * encrypted. - */ - if (a->flags & ATTR_COMPRESSION_MASK) - NInoSetCompressed(ni); - if (a->flags & ATTR_IS_ENCRYPTED) { - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(vi->i_sb, "Found encrypted and " - "compressed attribute."); - goto unm_err_out; - } - NInoSetEncrypted(ni); - } - if (a->flags & ATTR_IS_SPARSE) - NInoSetSparse(ni); - ir = (INDEX_ROOT*)((u8*)a + - le16_to_cpu(a->data.resident.value_offset)); - ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); - if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { - ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " - "corrupt."); - goto unm_err_out; - } - index_end = (u8*)&ir->index + - le32_to_cpu(ir->index.index_length); - if (index_end > ir_end) { - ntfs_error(vi->i_sb, "Directory index is corrupt."); - goto unm_err_out; - } - if (ir->type != AT_FILE_NAME) { - ntfs_error(vi->i_sb, "Indexed attribute is not " - "$FILE_NAME."); - goto unm_err_out; - } - if (ir->collation_rule != COLLATION_FILE_NAME) { - ntfs_error(vi->i_sb, "Index collation rule is not " - "COLLATION_FILE_NAME."); - goto unm_err_out; - } - ni->itype.index.collation_rule = ir->collation_rule; - ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); - if (ni->itype.index.block_size & - (ni->itype.index.block_size - 1)) { - ntfs_error(vi->i_sb, "Index block size (%u) is not a " - "power of two.", - ni->itype.index.block_size); - goto unm_err_out; - } - if (ni->itype.index.block_size > PAGE_SIZE) { - ntfs_error(vi->i_sb, "Index block size (%u) > " - "PAGE_SIZE (%ld) is not " - "supported. 
Sorry.", - ni->itype.index.block_size, - PAGE_SIZE); - err = -EOPNOTSUPP; - goto unm_err_out; - } - if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { - ntfs_error(vi->i_sb, "Index block size (%u) < " - "NTFS_BLOCK_SIZE (%i) is not " - "supported. Sorry.", - ni->itype.index.block_size, - NTFS_BLOCK_SIZE); - err = -EOPNOTSUPP; - goto unm_err_out; - } - ni->itype.index.block_size_bits = - ffs(ni->itype.index.block_size) - 1; - /* Determine the size of a vcn in the directory index. */ - if (vol->cluster_size <= ni->itype.index.block_size) { - ni->itype.index.vcn_size = vol->cluster_size; - ni->itype.index.vcn_size_bits = vol->cluster_size_bits; - } else { - ni->itype.index.vcn_size = vol->sector_size; - ni->itype.index.vcn_size_bits = vol->sector_size_bits; - } - - /* Setup the index allocation attribute, even if not present. */ - NInoSetMstProtected(ni); - ni->type = AT_INDEX_ALLOCATION; - ni->name = I30; - ni->name_len = 4; - - if (!(ir->index.flags & LARGE_INDEX)) { - /* No index allocation. */ - vi->i_size = ni->initialized_size = - ni->allocated_size = 0; - /* We are done with the mft record, so we release it. */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - m = NULL; - ctx = NULL; - goto skip_large_dir_stuff; - } /* LARGE_INDEX: Index allocation present. Setup state. */ - NInoSetIndexAllocPresent(ni); - /* Find index allocation attribute. */ - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION " - "attribute is not present but " - "$INDEX_ROOT indicated it is."); - else - ntfs_error(vi->i_sb, "Failed to lookup " - "$INDEX_ALLOCATION " - "attribute."); - goto unm_err_out; - } - a = ctx->attr; - if (!a->non_resident) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " - "is resident."); - goto unm_err_out; - } - /* - * Ensure the attribute name is placed before the mapping pairs - * array. 
- */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset)))) { - ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name " - "is placed after the mapping pairs " - "array."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_ENCRYPTED) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " - "is encrypted."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_SPARSE) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " - "is sparse."); - goto unm_err_out; - } - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " - "is compressed."); - goto unm_err_out; - } - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "First extent of " - "$INDEX_ALLOCATION attribute has non " - "zero lowest_vcn."); - goto unm_err_out; - } - vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu( - a->data.non_resident.allocated_size); - /* - * We are done with the mft record, so we release it. Otherwise - * we would deadlock in ntfs_attr_iget(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - m = NULL; - ctx = NULL; - /* Get the index bitmap attribute inode. */ - bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4); - if (IS_ERR(bvi)) { - ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); - err = PTR_ERR(bvi); - goto unm_err_out; - } - bni = NTFS_I(bvi); - if (NInoCompressed(bni) || NInoEncrypted(bni) || - NInoSparse(bni)) { - ntfs_error(vi->i_sb, "$BITMAP attribute is compressed " - "and/or encrypted and/or sparse."); - goto iput_unm_err_out; - } - /* Consistency check bitmap size vs. index allocation size. 
*/ - bvi_size = i_size_read(bvi); - if ((bvi_size << 3) < (vi->i_size >> - ni->itype.index.block_size_bits)) { - ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) " - "for index allocation (0x%llx).", - bvi_size << 3, vi->i_size); - goto iput_unm_err_out; - } - /* No longer need the bitmap attribute inode. */ - iput(bvi); -skip_large_dir_stuff: - /* Setup the operations for this inode. */ - vi->i_op = &ntfs_dir_inode_ops; - vi->i_fop = &ntfs_dir_ops; - vi->i_mapping->a_ops = &ntfs_mst_aops; - } else { - /* It is a file. */ - ntfs_attr_reinit_search_ctx(ctx); - - /* Setup the data attribute, even if not present. */ - ni->type = AT_DATA; - ni->name = NULL; - ni->name_len = 0; - - /* Find first extent of the unnamed data attribute. */ - err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx); - if (unlikely(err)) { - vi->i_size = ni->initialized_size = - ni->allocated_size = 0; - if (err != -ENOENT) { - ntfs_error(vi->i_sb, "Failed to lookup $DATA " - "attribute."); - goto unm_err_out; - } - /* - * FILE_Secure does not have an unnamed $DATA - * attribute, so we special case it here. - */ - if (vi->i_ino == FILE_Secure) - goto no_data_attr_special_case; - /* - * Most if not all the system files in the $Extend - * system directory do not have unnamed data - * attributes so we need to check if the parent - * directory of the file is FILE_Extend and if it is - * ignore this error. To do this we need to get the - * name of this inode from the mft record as the name - * contains the back reference to the parent directory. - */ - if (ntfs_is_extended_system_file(ctx) > 0) - goto no_data_attr_special_case; - // FIXME: File is corrupt! Hot-fix with empty data - // attribute if recovery option is set. - ntfs_error(vi->i_sb, "$DATA attribute is missing."); - goto unm_err_out; - } - a = ctx->attr; - /* Setup the state. 
*/ - if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { - if (a->flags & ATTR_COMPRESSION_MASK) { - NInoSetCompressed(ni); - if (vol->cluster_size > 4096) { - ntfs_error(vi->i_sb, "Found " - "compressed data but " - "compression is " - "disabled due to " - "cluster size (%i) > " - "4kiB.", - vol->cluster_size); - goto unm_err_out; - } - if ((a->flags & ATTR_COMPRESSION_MASK) - != ATTR_IS_COMPRESSED) { - ntfs_error(vi->i_sb, "Found unknown " - "compression method " - "or corrupt file."); - goto unm_err_out; - } - } - if (a->flags & ATTR_IS_SPARSE) - NInoSetSparse(ni); - } - if (a->flags & ATTR_IS_ENCRYPTED) { - if (NInoCompressed(ni)) { - ntfs_error(vi->i_sb, "Found encrypted and " - "compressed data."); - goto unm_err_out; - } - NInoSetEncrypted(ni); - } - if (a->non_resident) { - NInoSetNonResident(ni); - if (NInoCompressed(ni) || NInoSparse(ni)) { - if (NInoCompressed(ni) && a->data.non_resident. - compression_unit != 4) { - ntfs_error(vi->i_sb, "Found " - "non-standard " - "compression unit (%u " - "instead of 4). " - "Cannot handle this.", - a->data.non_resident. - compression_unit); - err = -EOPNOTSUPP; - goto unm_err_out; - } - if (a->data.non_resident.compression_unit) { - ni->itype.compressed.block_size = 1U << - (a->data.non_resident. - compression_unit + - vol->cluster_size_bits); - ni->itype.compressed.block_size_bits = - ffs(ni->itype. - compressed. - block_size) - 1; - ni->itype.compressed.block_clusters = - 1U << a->data. - non_resident. - compression_unit; - } else { - ni->itype.compressed.block_size = 0; - ni->itype.compressed.block_size_bits = - 0; - ni->itype.compressed.block_clusters = - 0; - } - ni->itype.compressed.size = sle64_to_cpu( - a->data.non_resident. 
- compressed_size); - } - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "First extent of $DATA " - "attribute has non zero " - "lowest_vcn."); - goto unm_err_out; - } - vi->i_size = sle64_to_cpu( - a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu( - a->data.non_resident.allocated_size); - } else { /* Resident attribute. */ - vi->i_size = ni->initialized_size = le32_to_cpu( - a->data.resident.value_length); - ni->allocated_size = le32_to_cpu(a->length) - - le16_to_cpu( - a->data.resident.value_offset); - if (vi->i_size > ni->allocated_size) { - ntfs_error(vi->i_sb, "Resident data attribute " - "is corrupt (size exceeds " - "allocation)."); - goto unm_err_out; - } - } -no_data_attr_special_case: - /* We are done with the mft record, so we release it. */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - m = NULL; - ctx = NULL; - /* Setup the operations for this inode. */ - vi->i_op = &ntfs_file_inode_ops; - vi->i_fop = &ntfs_file_ops; - vi->i_mapping->a_ops = &ntfs_normal_aops; - if (NInoMstProtected(ni)) - vi->i_mapping->a_ops = &ntfs_mst_aops; - else if (NInoCompressed(ni)) - vi->i_mapping->a_ops = &ntfs_compressed_aops; - } - /* - * The number of 512-byte blocks used on disk (for stat). This is in so - * far inaccurate as it doesn't account for any named streams or other - * special non-resident attributes, but that is how Windows works, too, - * so we are at least consistent with Windows, if not entirely - * consistent with the Linux Way. Doing it the Linux Way would cause a - * significant slowdown as it would involve iterating over all - * attributes in the mft record and adding the allocated/compressed - * sizes of all non-resident attributes present to give us the Linux - * correct size that should go into i_blocks (after division by 512). 
- */ - if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni))) - vi->i_blocks = ni->itype.compressed.size >> 9; - else - vi->i_blocks = ni->allocated_size >> 9; - ntfs_debug("Done."); - return 0; -iput_unm_err_out: - iput(bvi); -unm_err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(ni); -err_out: - ntfs_error(vol->sb, "Failed with error code %i. Marking corrupt " - "inode 0x%lx as bad. Run chkdsk.", err, vi->i_ino); - make_bad_inode(vi); - if (err != -EOPNOTSUPP && err != -ENOMEM) - NVolSetErrors(vol); - return err; -} - -/** - * ntfs_read_locked_attr_inode - read an attribute inode from its base inode - * @base_vi: base inode - * @vi: attribute inode to read - * - * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the - * attribute inode described by @vi into memory from the base mft record - * described by @base_ni. - * - * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for - * reading and looks up the attribute described by @vi before setting up the - * necessary fields in @vi as well as initializing the ntfs inode. - * - * Q: What locks are held when the function is called? - * A: i_state has I_NEW set, hence the inode is locked, also - * i_count is set to 1, so it is not going to go away - * - * Return 0 on success and -errno on error. In the error case, the inode will - * have had make_bad_inode() executed on it. - * - * Note this cannot be called for AT_INDEX_ALLOCATION. - */ -static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) -{ - ntfs_volume *vol = NTFS_SB(vi->i_sb); - ntfs_inode *ni, *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - int err = 0; - - ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); - - ntfs_init_big_inode(vi); - - ni = NTFS_I(vi); - base_ni = NTFS_I(base_vi); - - /* Just mirror the values from the base inode. 
*/ - vi->i_uid = base_vi->i_uid; - vi->i_gid = base_vi->i_gid; - set_nlink(vi, base_vi->i_nlink); - inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi)); - inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); - inode_set_atime_to_ts(vi, inode_get_atime(base_vi)); - vi->i_generation = ni->seq_no = base_ni->seq_no; - - /* Set inode type to zero but preserve permissions. */ - vi->i_mode = base_vi->i_mode & ~S_IFMT; - - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (!ctx) { - err = -ENOMEM; - goto unm_err_out; - } - /* Find the attribute. */ - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) - goto unm_err_out; - a = ctx->attr; - if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { - if (a->flags & ATTR_COMPRESSION_MASK) { - NInoSetCompressed(ni); - if ((ni->type != AT_DATA) || (ni->type == AT_DATA && - ni->name_len)) { - ntfs_error(vi->i_sb, "Found compressed " - "non-data or named data " - "attribute. Please report " - "you saw this message to " - "linux-ntfs-dev@lists." - "sourceforge.net"); - goto unm_err_out; - } - if (vol->cluster_size > 4096) { - ntfs_error(vi->i_sb, "Found compressed " - "attribute but compression is " - "disabled due to cluster size " - "(%i) > 4kiB.", - vol->cluster_size); - goto unm_err_out; - } - if ((a->flags & ATTR_COMPRESSION_MASK) != - ATTR_IS_COMPRESSED) { - ntfs_error(vi->i_sb, "Found unknown " - "compression method."); - goto unm_err_out; - } - } - /* - * The compressed/sparse flag set in an index root just means - * to compress all files. - */ - if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { - ntfs_error(vi->i_sb, "Found mst protected attribute " - "but the attribute is %s. Please " - "report you saw this message to " - "linux-ntfs-dev@lists.sourceforge.net", - NInoCompressed(ni) ? 
"compressed" : - "sparse"); - goto unm_err_out; - } - if (a->flags & ATTR_IS_SPARSE) - NInoSetSparse(ni); - } - if (a->flags & ATTR_IS_ENCRYPTED) { - if (NInoCompressed(ni)) { - ntfs_error(vi->i_sb, "Found encrypted and compressed " - "data."); - goto unm_err_out; - } - /* - * The encryption flag set in an index root just means to - * encrypt all files. - */ - if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { - ntfs_error(vi->i_sb, "Found mst protected attribute " - "but the attribute is encrypted. " - "Please report you saw this message " - "to linux-ntfs-dev@lists.sourceforge." - "net"); - goto unm_err_out; - } - if (ni->type != AT_DATA) { - ntfs_error(vi->i_sb, "Found encrypted non-data " - "attribute."); - goto unm_err_out; - } - NInoSetEncrypted(ni); - } - if (!a->non_resident) { - /* Ensure the attribute name is placed before the value. */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu(a->data.resident.value_offset)))) { - ntfs_error(vol->sb, "Attribute name is placed after " - "the attribute value."); - goto unm_err_out; - } - if (NInoMstProtected(ni)) { - ntfs_error(vi->i_sb, "Found mst protected attribute " - "but the attribute is resident. " - "Please report you saw this message to " - "linux-ntfs-dev@lists.sourceforge.net"); - goto unm_err_out; - } - vi->i_size = ni->initialized_size = le32_to_cpu( - a->data.resident.value_length); - ni->allocated_size = le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset); - if (vi->i_size > ni->allocated_size) { - ntfs_error(vi->i_sb, "Resident attribute is corrupt " - "(size exceeds allocation)."); - goto unm_err_out; - } - } else { - NInoSetNonResident(ni); - /* - * Ensure the attribute name is placed before the mapping pairs - * array. 
- */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset)))) { - ntfs_error(vol->sb, "Attribute name is placed after " - "the mapping pairs array."); - goto unm_err_out; - } - if (NInoCompressed(ni) || NInoSparse(ni)) { - if (NInoCompressed(ni) && a->data.non_resident. - compression_unit != 4) { - ntfs_error(vi->i_sb, "Found non-standard " - "compression unit (%u instead " - "of 4). Cannot handle this.", - a->data.non_resident. - compression_unit); - err = -EOPNOTSUPP; - goto unm_err_out; - } - if (a->data.non_resident.compression_unit) { - ni->itype.compressed.block_size = 1U << - (a->data.non_resident. - compression_unit + - vol->cluster_size_bits); - ni->itype.compressed.block_size_bits = - ffs(ni->itype.compressed. - block_size) - 1; - ni->itype.compressed.block_clusters = 1U << - a->data.non_resident. - compression_unit; - } else { - ni->itype.compressed.block_size = 0; - ni->itype.compressed.block_size_bits = 0; - ni->itype.compressed.block_clusters = 0; - } - ni->itype.compressed.size = sle64_to_cpu( - a->data.non_resident.compressed_size); - } - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "First extent of attribute has " - "non-zero lowest_vcn."); - goto unm_err_out; - } - vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu( - a->data.non_resident.allocated_size); - } - vi->i_mapping->a_ops = &ntfs_normal_aops; - if (NInoMstProtected(ni)) - vi->i_mapping->a_ops = &ntfs_mst_aops; - else if (NInoCompressed(ni)) - vi->i_mapping->a_ops = &ntfs_compressed_aops; - if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT) - vi->i_blocks = ni->itype.compressed.size >> 9; - else - vi->i_blocks = ni->allocated_size >> 9; - /* - * Make sure the base inode does not go away and attach it to the - * attribute inode. 
- */ - igrab(base_vi); - ni->ext.base_ntfs_ino = base_ni; - ni->nr_extents = -1; - - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - - ntfs_debug("Done."); - return 0; - -unm_err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); -err_out: - ntfs_error(vol->sb, "Failed with error code %i while reading attribute " - "inode (mft_no 0x%lx, type 0x%x, name_len %i). " - "Marking corrupt inode and base inode 0x%lx as bad. " - "Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len, - base_vi->i_ino); - make_bad_inode(vi); - if (err != -ENOMEM) - NVolSetErrors(vol); - return err; -} - -/** - * ntfs_read_locked_index_inode - read an index inode from its base inode - * @base_vi: base inode - * @vi: index inode to read - * - * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the - * index inode described by @vi into memory from the base mft record described - * by @base_ni. - * - * ntfs_read_locked_index_inode() maps, pins and locks the base inode for - * reading and looks up the attributes relating to the index described by @vi - * before setting up the necessary fields in @vi as well as initializing the - * ntfs inode. - * - * Note, index inodes are essentially attribute inodes (NInoAttr() is true) - * with the attribute type set to AT_INDEX_ALLOCATION. Apart from that, they - * are setup like directory inodes since directories are a special case of - * indices ao they need to be treated in much the same way. Most importantly, - * for small indices the index allocation attribute might not actually exist. - * However, the index root attribute always exists but this does not need to - * have an inode associated with it and this is why we define a new inode type - * index. 
Also, like for directories, we need to have an attribute inode for - * the bitmap attribute corresponding to the index allocation attribute and we - * can store this in the appropriate field of the inode, just like we do for - * normal directory inodes. - * - * Q: What locks are held when the function is called? - * A: i_state has I_NEW set, hence the inode is locked, also - * i_count is set to 1, so it is not going to go away - * - * Return 0 on success and -errno on error. In the error case, the inode will - * have had make_bad_inode() executed on it. - */ -static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) -{ - loff_t bvi_size; - ntfs_volume *vol = NTFS_SB(vi->i_sb); - ntfs_inode *ni, *base_ni, *bni; - struct inode *bvi; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - INDEX_ROOT *ir; - u8 *ir_end, *index_end; - int err = 0; - - ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); - ntfs_init_big_inode(vi); - ni = NTFS_I(vi); - base_ni = NTFS_I(base_vi); - /* Just mirror the values from the base inode. */ - vi->i_uid = base_vi->i_uid; - vi->i_gid = base_vi->i_gid; - set_nlink(vi, base_vi->i_nlink); - inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi)); - inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); - inode_set_atime_to_ts(vi, inode_get_atime(base_vi)); - vi->i_generation = ni->seq_no = base_ni->seq_no; - /* Set inode type to zero but preserve permissions. */ - vi->i_mode = base_vi->i_mode & ~S_IFMT; - /* Map the mft record for the base inode. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (!ctx) { - err = -ENOMEM; - goto unm_err_out; - } - /* Find the index root attribute. 
*/ - err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " - "missing."); - goto unm_err_out; - } - a = ctx->attr; - /* Set up the state. */ - if (unlikely(a->non_resident)) { - ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident."); - goto unm_err_out; - } - /* Ensure the attribute name is placed before the value. */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu(a->data.resident.value_offset)))) { - ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed " - "after the attribute value."); - goto unm_err_out; - } - /* - * Compressed/encrypted/sparse index root is not allowed, except for - * directories of course but those are not dealt with here. - */ - if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED | - ATTR_IS_SPARSE)) { - ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index " - "root attribute."); - goto unm_err_out; - } - ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset)); - ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); - if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { - ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt."); - goto unm_err_out; - } - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - if (index_end > ir_end) { - ntfs_error(vi->i_sb, "Index is corrupt."); - goto unm_err_out; - } - if (ir->type) { - ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).", - le32_to_cpu(ir->type)); - goto unm_err_out; - } - ni->itype.index.collation_rule = ir->collation_rule; - ntfs_debug("Index collation rule is 0x%x.", - le32_to_cpu(ir->collation_rule)); - ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); - if (!is_power_of_2(ni->itype.index.block_size)) { - ntfs_error(vi->i_sb, "Index block size (%u) is not a power of " - "two.", ni->itype.index.block_size); - goto unm_err_out; - } - 
if (ni->itype.index.block_size > PAGE_SIZE) { - ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_SIZE " - "(%ld) is not supported. Sorry.", - ni->itype.index.block_size, PAGE_SIZE); - err = -EOPNOTSUPP; - goto unm_err_out; - } - if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { - ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE " - "(%i) is not supported. Sorry.", - ni->itype.index.block_size, NTFS_BLOCK_SIZE); - err = -EOPNOTSUPP; - goto unm_err_out; - } - ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1; - /* Determine the size of a vcn in the index. */ - if (vol->cluster_size <= ni->itype.index.block_size) { - ni->itype.index.vcn_size = vol->cluster_size; - ni->itype.index.vcn_size_bits = vol->cluster_size_bits; - } else { - ni->itype.index.vcn_size = vol->sector_size; - ni->itype.index.vcn_size_bits = vol->sector_size_bits; - } - /* Check for presence of index allocation attribute. */ - if (!(ir->index.flags & LARGE_INDEX)) { - /* No index allocation. */ - vi->i_size = ni->initialized_size = ni->allocated_size = 0; - /* We are done with the mft record, so we release it. */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - m = NULL; - ctx = NULL; - goto skip_large_index_stuff; - } /* LARGE_INDEX: Index allocation present. Setup state. */ - NInoSetIndexAllocPresent(ni); - /* Find index allocation attribute. 
*/ - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " - "not present but $INDEX_ROOT " - "indicated it is."); - else - ntfs_error(vi->i_sb, "Failed to lookup " - "$INDEX_ALLOCATION attribute."); - goto unm_err_out; - } - a = ctx->attr; - if (!a->non_resident) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " - "resident."); - goto unm_err_out; - } - /* - * Ensure the attribute name is placed before the mapping pairs array. - */ - if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset)))) { - ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is " - "placed after the mapping pairs array."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_ENCRYPTED) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " - "encrypted."); - goto unm_err_out; - } - if (a->flags & ATTR_IS_SPARSE) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse."); - goto unm_err_out; - } - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " - "compressed."); - goto unm_err_out; - } - if (a->data.non_resident.lowest_vcn) { - ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION " - "attribute has non zero lowest_vcn."); - goto unm_err_out; - } - vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size); - /* - * We are done with the mft record, so we release it. Otherwise - * we would deadlock in ntfs_attr_iget(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - m = NULL; - ctx = NULL; - /* Get the index bitmap attribute inode. 
*/ - bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len); - if (IS_ERR(bvi)) { - ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); - err = PTR_ERR(bvi); - goto unm_err_out; - } - bni = NTFS_I(bvi); - if (NInoCompressed(bni) || NInoEncrypted(bni) || - NInoSparse(bni)) { - ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or " - "encrypted and/or sparse."); - goto iput_unm_err_out; - } - /* Consistency check bitmap size vs. index allocation size. */ - bvi_size = i_size_read(bvi); - if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) { - ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for " - "index allocation (0x%llx).", bvi_size << 3, - vi->i_size); - goto iput_unm_err_out; - } - iput(bvi); -skip_large_index_stuff: - /* Setup the operations for this index inode. */ - vi->i_mapping->a_ops = &ntfs_mst_aops; - vi->i_blocks = ni->allocated_size >> 9; - /* - * Make sure the base inode doesn't go away and attach it to the - * index inode. - */ - igrab(base_vi); - ni->ext.base_ntfs_ino = base_ni; - ni->nr_extents = -1; - - ntfs_debug("Done."); - return 0; -iput_unm_err_out: - iput(bvi); -unm_err_out: - if (!err) - err = -EIO; - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); -err_out: - ntfs_error(vi->i_sb, "Failed with error code %i while reading index " - "inode (mft_no 0x%lx, name_len %i.", err, vi->i_ino, - ni->name_len); - make_bad_inode(vi); - if (err != -EOPNOTSUPP && err != -ENOMEM) - NVolSetErrors(vol); - return err; -} - -/* - * The MFT inode has special locking, so teach the lock validator - * about this by splitting off the locking rules of the MFT from - * the locking rules of other inodes. The MFT inode can never be - * accessed from the VFS side (or even internally), only by the - * map_mft functions. 
- */ -static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key; - -/** - * ntfs_read_inode_mount - special read_inode for mount time use only - * @vi: inode to read - * - * Read inode FILE_MFT at mount time, only called with super_block lock - * held from within the read_super() code path. - * - * This function exists because when it is called the page cache for $MFT/$DATA - * is not initialized and hence we cannot get at the contents of mft records - * by calling map_mft_record*(). - * - * Further it needs to cope with the circular references problem, i.e. cannot - * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because - * we do not know where the other extent mft records are yet and again, because - * we cannot call map_mft_record*() yet. Obviously this applies only when an - * attribute list is actually present in $MFT inode. - * - * We solve these problems by starting with the $DATA attribute before anything - * else and iterating using ntfs_attr_lookup($DATA) over all extents. As each - * extent is found, we ntfs_mapping_pairs_decompress() including the implied - * ntfs_runlists_merge(). Each step of the iteration necessarily provides - * sufficient information for the next step to complete. - * - * This should work but there are two possible pit falls (see inline comments - * below), but only time will tell if they are real pits or just smoke... - */ -int ntfs_read_inode_mount(struct inode *vi) -{ - VCN next_vcn, last_vcn, highest_vcn; - s64 block; - struct super_block *sb = vi->i_sb; - ntfs_volume *vol = NTFS_SB(sb); - struct buffer_head *bh; - ntfs_inode *ni; - MFT_RECORD *m = NULL; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - unsigned int i, nr_blocks; - int err; - - ntfs_debug("Entering."); - - /* Initialize the ntfs specific part of @vi. */ - ntfs_init_big_inode(vi); - - ni = NTFS_I(vi); - - /* Setup the data attribute. It is special as it is mst protected. 
*/ - NInoSetNonResident(ni); - NInoSetMstProtected(ni); - NInoSetSparseDisabled(ni); - ni->type = AT_DATA; - ni->name = NULL; - ni->name_len = 0; - /* - * This sets up our little cheat allowing us to reuse the async read io - * completion handler for directories. - */ - ni->itype.index.block_size = vol->mft_record_size; - ni->itype.index.block_size_bits = vol->mft_record_size_bits; - - /* Very important! Needed to be able to call map_mft_record*(). */ - vol->mft_ino = vi; - - /* Allocate enough memory to read the first mft record. */ - if (vol->mft_record_size > 64 * 1024) { - ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).", - vol->mft_record_size); - goto err_out; - } - i = vol->mft_record_size; - if (i < sb->s_blocksize) - i = sb->s_blocksize; - m = (MFT_RECORD*)ntfs_malloc_nofs(i); - if (!m) { - ntfs_error(sb, "Failed to allocate buffer for $MFT record 0."); - goto err_out; - } - - /* Determine the first block of the $MFT/$DATA attribute. */ - block = vol->mft_lcn << vol->cluster_size_bits >> - sb->s_blocksize_bits; - nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits; - if (!nr_blocks) - nr_blocks = 1; - - /* Load $MFT/$DATA's first mft record. */ - for (i = 0; i < nr_blocks; i++) { - bh = sb_bread(sb, block++); - if (!bh) { - ntfs_error(sb, "Device read failed."); - goto err_out; - } - memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data, - sb->s_blocksize); - brelse(bh); - } - - if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) { - ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.", - le32_to_cpu(m->bytes_allocated), vol->mft_record_size); - goto err_out; - } - - /* Apply the mst fixups. */ - if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) { - /* FIXME: Try to use the $MFTMirr now. */ - ntfs_error(sb, "MST fixup failed. 
$MFT is corrupt."); - goto err_out; - } - - /* Sanity check offset to the first attribute */ - if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) { - ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.", - le16_to_cpu(m->attrs_offset)); - goto err_out; - } - - /* Need this to sanity check attribute list references to $MFT. */ - vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); - - /* Provides read_folio() for map_mft_record(). */ - vi->i_mapping->a_ops = &ntfs_mst_aops; - - ctx = ntfs_attr_get_search_ctx(ni, m); - if (!ctx) { - err = -ENOMEM; - goto err_out; - } - - /* Find the attribute list attribute if present. */ - err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); - if (err) { - if (unlikely(err != -ENOENT)) { - ntfs_error(sb, "Failed to lookup attribute list " - "attribute. You should run chkdsk."); - goto put_err_out; - } - } else /* if (!err) */ { - ATTR_LIST_ENTRY *al_entry, *next_al_entry; - u8 *al_end; - static const char *es = " Not allowed. $MFT is corrupt. " - "You should run chkdsk."; - - ntfs_debug("Attribute list attribute found in $MFT."); - NInoSetAttrList(ni); - a = ctx->attr; - if (a->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(sb, "Attribute list attribute is " - "compressed.%s", es); - goto put_err_out; - } - if (a->flags & ATTR_IS_ENCRYPTED || - a->flags & ATTR_IS_SPARSE) { - if (a->non_resident) { - ntfs_error(sb, "Non-resident attribute list " - "attribute is encrypted/" - "sparse.%s", es); - goto put_err_out; - } - ntfs_warning(sb, "Resident attribute list attribute " - "in $MFT system file is marked " - "encrypted/sparse which is not true. " - "However, Windows allows this and " - "chkdsk does not detect or correct it " - "so we will just ignore the invalid " - "flags and pretend they are not set."); - } - /* Now allocate memory for the attribute list. 
*/ - ni->attr_list_size = (u32)ntfs_attr_size(a); - if (!ni->attr_list_size) { - ntfs_error(sb, "Attr_list_size is zero"); - goto put_err_out; - } - ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); - if (!ni->attr_list) { - ntfs_error(sb, "Not enough memory to allocate buffer " - "for attribute list."); - goto put_err_out; - } - if (a->non_resident) { - NInoSetAttrListNonResident(ni); - if (a->data.non_resident.lowest_vcn) { - ntfs_error(sb, "Attribute list has non zero " - "lowest_vcn. $MFT is corrupt. " - "You should run chkdsk."); - goto put_err_out; - } - /* Setup the runlist. */ - ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, - a, NULL); - if (IS_ERR(ni->attr_list_rl.rl)) { - err = PTR_ERR(ni->attr_list_rl.rl); - ni->attr_list_rl.rl = NULL; - ntfs_error(sb, "Mapping pairs decompression " - "failed with error code %i.", - -err); - goto put_err_out; - } - /* Now load the attribute list. */ - if ((err = load_attribute_list(vol, &ni->attr_list_rl, - ni->attr_list, ni->attr_list_size, - sle64_to_cpu(a->data. - non_resident.initialized_size)))) { - ntfs_error(sb, "Failed to load attribute list " - "attribute with error code %i.", - -err); - goto put_err_out; - } - } else /* if (!ctx.attr->non_resident) */ { - if ((u8*)a + le16_to_cpu( - a->data.resident.value_offset) + - le32_to_cpu( - a->data.resident.value_length) > - (u8*)ctx->mrec + vol->mft_record_size) { - ntfs_error(sb, "Corrupt attribute list " - "attribute."); - goto put_err_out; - } - /* Now copy the attribute list. */ - memcpy(ni->attr_list, (u8*)a + le16_to_cpu( - a->data.resident.value_offset), - le32_to_cpu( - a->data.resident.value_length)); - } - /* The attribute list is now setup in memory. */ - /* - * FIXME: I don't know if this case is actually possible. - * According to logic it is not possible but I have seen too - * many weird things in MS software to rely on logic... Thus we - * perform a manual search and make sure the first $MFT/$DATA - * extent is in the base inode. 
If it is not we abort with an - * error and if we ever see a report of this error we will need - * to do some magic in order to have the necessary mft record - * loaded and in the right place in the page cache. But - * hopefully logic will prevail and this never happens... - */ - al_entry = (ATTR_LIST_ENTRY*)ni->attr_list; - al_end = (u8*)al_entry + ni->attr_list_size; - for (;; al_entry = next_al_entry) { - /* Out of bounds check. */ - if ((u8*)al_entry < ni->attr_list || - (u8*)al_entry > al_end) - goto em_put_err_out; - /* Catch the end of the attribute list. */ - if ((u8*)al_entry == al_end) - goto em_put_err_out; - if (!al_entry->length) - goto em_put_err_out; - if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + - le16_to_cpu(al_entry->length) > al_end) - goto em_put_err_out; - next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + - le16_to_cpu(al_entry->length)); - if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA)) - goto em_put_err_out; - if (AT_DATA != al_entry->type) - continue; - /* We want an unnamed attribute. */ - if (al_entry->name_length) - goto em_put_err_out; - /* Want the first entry, i.e. lowest_vcn == 0. */ - if (al_entry->lowest_vcn) - goto em_put_err_out; - /* First entry has to be in the base mft record. */ - if (MREF_LE(al_entry->mft_reference) != vi->i_ino) { - /* MFT references do not match, logic fails. */ - ntfs_error(sb, "BUG: The first $DATA extent " - "of $MFT is not in the base " - "mft record. Please report " - "you saw this message to " - "linux-ntfs-dev@lists." - "sourceforge.net"); - goto put_err_out; - } else { - /* Sequence numbers must match. */ - if (MSEQNO_LE(al_entry->mft_reference) != - ni->seq_no) - goto em_put_err_out; - /* Got it. All is ok. We can stop now. */ - break; - } - } - } - - ntfs_attr_reinit_search_ctx(ctx); - - /* Now load all attribute extents. 
*/ - a = NULL; - next_vcn = last_vcn = highest_vcn = 0; - while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0, - ctx))) { - runlist_element *nrl; - - /* Cache the current attribute. */ - a = ctx->attr; - /* $MFT must be non-resident. */ - if (!a->non_resident) { - ntfs_error(sb, "$MFT must be non-resident but a " - "resident extent was found. $MFT is " - "corrupt. Run chkdsk."); - goto put_err_out; - } - /* $MFT must be uncompressed and unencrypted. */ - if (a->flags & ATTR_COMPRESSION_MASK || - a->flags & ATTR_IS_ENCRYPTED || - a->flags & ATTR_IS_SPARSE) { - ntfs_error(sb, "$MFT must be uncompressed, " - "non-sparse, and unencrypted but a " - "compressed/sparse/encrypted extent " - "was found. $MFT is corrupt. Run " - "chkdsk."); - goto put_err_out; - } - /* - * Decompress the mapping pairs array of this extent and merge - * the result into the existing runlist. No need for locking - * as we have exclusive access to the inode at this time and we - * are a mount in progress task, too. - */ - nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); - if (IS_ERR(nrl)) { - ntfs_error(sb, "ntfs_mapping_pairs_decompress() " - "failed with error code %ld. $MFT is " - "corrupt.", PTR_ERR(nrl)); - goto put_err_out; - } - ni->runlist.rl = nrl; - - /* Are we in the first extent? */ - if (!next_vcn) { - if (a->data.non_resident.lowest_vcn) { - ntfs_error(sb, "First extent of $DATA " - "attribute has non zero " - "lowest_vcn. $MFT is corrupt. " - "You should run chkdsk."); - goto put_err_out; - } - /* Get the last vcn in the $DATA attribute. */ - last_vcn = sle64_to_cpu( - a->data.non_resident.allocated_size) - >> vol->cluster_size_bits; - /* Fill in the inode size. 
*/ - vi->i_size = sle64_to_cpu( - a->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu( - a->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu( - a->data.non_resident.allocated_size); - /* - * Verify the number of mft records does not exceed - * 2^32 - 1. - */ - if ((vi->i_size >> vol->mft_record_size_bits) >= - (1ULL << 32)) { - ntfs_error(sb, "$MFT is too big! Aborting."); - goto put_err_out; - } - /* - * We have got the first extent of the runlist for - * $MFT which means it is now relatively safe to call - * the normal ntfs_read_inode() function. - * Complete reading the inode, this will actually - * re-read the mft record for $MFT, this time entering - * it into the page cache with which we complete the - * kick start of the volume. It should be safe to do - * this now as the first extent of $MFT/$DATA is - * already known and we would hope that we don't need - * further extents in order to find the other - * attributes belonging to $MFT. Only time will tell if - * this is really the case. If not we will have to play - * magic at this point, possibly duplicating a lot of - * ntfs_read_inode() at this point. We will need to - * ensure we do enough of its work to be able to call - * ntfs_read_inode() on extents of $MFT/$DATA. But lets - * hope this never happens... - */ - ntfs_read_locked_inode(vi); - if (is_bad_inode(vi)) { - ntfs_error(sb, "ntfs_read_inode() of $MFT " - "failed. BUG or corrupt $MFT. " - "Run chkdsk and if no errors " - "are found, please report you " - "saw this message to " - "linux-ntfs-dev@lists." - "sourceforge.net"); - ntfs_attr_put_search_ctx(ctx); - /* Revert to the safe super operations. */ - ntfs_free(m); - return -1; - } - /* - * Re-initialize some specifics about $MFT's inode as - * ntfs_read_inode() will have set up the default ones. - */ - /* Set uid and gid to root. */ - vi->i_uid = GLOBAL_ROOT_UID; - vi->i_gid = GLOBAL_ROOT_GID; - /* Regular file. No access for anyone. 
*/ - vi->i_mode = S_IFREG; - /* No VFS initiated operations allowed for $MFT. */ - vi->i_op = &ntfs_empty_inode_ops; - vi->i_fop = &ntfs_empty_file_ops; - } - - /* Get the lowest vcn for the next extent. */ - highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); - next_vcn = highest_vcn + 1; - - /* Only one extent or error, which we catch below. */ - if (next_vcn <= 0) - break; - - /* Avoid endless loops due to corruption. */ - if (next_vcn < sle64_to_cpu( - a->data.non_resident.lowest_vcn)) { - ntfs_error(sb, "$MFT has corrupt attribute list " - "attribute. Run chkdsk."); - goto put_err_out; - } - } - if (err != -ENOENT) { - ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. " - "$MFT is corrupt. Run chkdsk."); - goto put_err_out; - } - if (!a) { - ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is " - "corrupt. Run chkdsk."); - goto put_err_out; - } - if (highest_vcn && highest_vcn != last_vcn - 1) { - ntfs_error(sb, "Failed to load the complete runlist for " - "$MFT/$DATA. Driver bug or corrupt $MFT. " - "Run chkdsk."); - ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx", - (unsigned long long)highest_vcn, - (unsigned long long)last_vcn - 1); - goto put_err_out; - } - ntfs_attr_put_search_ctx(ctx); - ntfs_debug("Done."); - ntfs_free(m); - - /* - * Split the locking rules of the MFT inode from the - * locking rules of other inodes: - */ - lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key); - lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key); - - return 0; - -em_put_err_out: - ntfs_error(sb, "Couldn't find first extent of $DATA attribute in " - "attribute list. $MFT is corrupt. Run chkdsk."); -put_err_out: - ntfs_attr_put_search_ctx(ctx); -err_out: - ntfs_error(sb, "Failed. Marking inode as bad."); - make_bad_inode(vi); - ntfs_free(m); - return -1; -} - -static void __ntfs_clear_inode(ntfs_inode *ni) -{ - /* Free all alocated memory. 
*/ - down_write(&ni->runlist.lock); - if (ni->runlist.rl) { - ntfs_free(ni->runlist.rl); - ni->runlist.rl = NULL; - } - up_write(&ni->runlist.lock); - - if (ni->attr_list) { - ntfs_free(ni->attr_list); - ni->attr_list = NULL; - } - - down_write(&ni->attr_list_rl.lock); - if (ni->attr_list_rl.rl) { - ntfs_free(ni->attr_list_rl.rl); - ni->attr_list_rl.rl = NULL; - } - up_write(&ni->attr_list_rl.lock); - - if (ni->name_len && ni->name != I30) { - /* Catch bugs... */ - BUG_ON(!ni->name); - kfree(ni->name); - } -} - -void ntfs_clear_extent_inode(ntfs_inode *ni) -{ - ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); - - BUG_ON(NInoAttr(ni)); - BUG_ON(ni->nr_extents != -1); - -#ifdef NTFS_RW - if (NInoDirty(ni)) { - if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino))) - ntfs_error(ni->vol->sb, "Clearing dirty extent inode! " - "Losing data! This is a BUG!!!"); - // FIXME: Do something!!! - } -#endif /* NTFS_RW */ - - __ntfs_clear_inode(ni); - - /* Bye, bye... */ - ntfs_destroy_extent_inode(ni); -} - -/** - * ntfs_evict_big_inode - clean up the ntfs specific part of an inode - * @vi: vfs inode pending annihilation - * - * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode() - * is called, which deallocates all memory belonging to the NTFS specific part - * of the inode and returns. - * - * If the MFT record is dirty, we commit it before doing anything else. - */ -void ntfs_evict_big_inode(struct inode *vi) -{ - ntfs_inode *ni = NTFS_I(vi); - - truncate_inode_pages_final(&vi->i_data); - clear_inode(vi); - -#ifdef NTFS_RW - if (NInoDirty(ni)) { - bool was_bad = (is_bad_inode(vi)); - - /* Committing the inode also commits all extent inodes. */ - ntfs_commit_inode(vi); - - if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) { - ntfs_error(vi->i_sb, "Failed to commit dirty inode " - "0x%lx. Losing data!", vi->i_ino); - // FIXME: Do something!!! - } - } -#endif /* NTFS_RW */ - - /* No need to lock at this stage as no one else has a reference. 
*/ - if (ni->nr_extents > 0) { - int i; - - for (i = 0; i < ni->nr_extents; i++) - ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]); - kfree(ni->ext.extent_ntfs_inos); - } - - __ntfs_clear_inode(ni); - - if (NInoAttr(ni)) { - /* Release the base inode if we are holding it. */ - if (ni->nr_extents == -1) { - iput(VFS_I(ni->ext.base_ntfs_ino)); - ni->nr_extents = 0; - ni->ext.base_ntfs_ino = NULL; - } - } - BUG_ON(ni->page); - if (!atomic_dec_and_test(&ni->count)) - BUG(); - return; -} - -/** - * ntfs_show_options - show mount options in /proc/mounts - * @sf: seq_file in which to write our mount options - * @root: root of the mounted tree whose mount options to display - * - * Called by the VFS once for each mounted ntfs volume when someone reads - * /proc/mounts in order to display the NTFS specific mount options of each - * mount. The mount options of fs specified by @root are written to the seq file - * @sf and success is returned. - */ -int ntfs_show_options(struct seq_file *sf, struct dentry *root) -{ - ntfs_volume *vol = NTFS_SB(root->d_sb); - int i; - - seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid)); - seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid)); - if (vol->fmask == vol->dmask) - seq_printf(sf, ",umask=0%o", vol->fmask); - else { - seq_printf(sf, ",fmask=0%o", vol->fmask); - seq_printf(sf, ",dmask=0%o", vol->dmask); - } - seq_printf(sf, ",nls=%s", vol->nls_map->charset); - if (NVolCaseSensitive(vol)) - seq_printf(sf, ",case_sensitive"); - if (NVolShowSystemFiles(vol)) - seq_printf(sf, ",show_sys_files"); - if (!NVolSparseEnabled(vol)) - seq_printf(sf, ",disable_sparse"); - for (i = 0; on_errors_arr[i].val; i++) { - if (on_errors_arr[i].val & vol->on_errors) - seq_printf(sf, ",errors=%s", on_errors_arr[i].str); - } - seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier); - return 0; -} - -#ifdef NTFS_RW - -static const char *es = " Leaving inconsistent metadata. 
Unmount and run " - "chkdsk."; - -/** - * ntfs_truncate - called when the i_size of an ntfs inode is changed - * @vi: inode for which the i_size was changed - * - * We only support i_size changes for normal files at present, i.e. not - * compressed and not encrypted. This is enforced in ntfs_setattr(), see - * below. - * - * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and - * that the change is allowed. - * - * This implies for us that @vi is a file inode rather than a directory, index, - * or attribute inode as well as that @vi is a base inode. - * - * Returns 0 on success or -errno on error. - * - * Called with ->i_mutex held. - */ -int ntfs_truncate(struct inode *vi) -{ - s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size; - VCN highest_vcn; - unsigned long flags; - ntfs_inode *base_ni, *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - const char *te = " Leaving file length out of sync with i_size."; - int err, mp_size, size_change, alloc_change; - - ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - BUG_ON(NInoAttr(ni)); - BUG_ON(S_ISDIR(vi->i_mode)); - BUG_ON(NInoMstProtected(ni)); - BUG_ON(ni->nr_extents < 0); -retry_truncate: - /* - * Lock the runlist for writing and map the mft record to ensure it is - * safe to mess with the attribute runlist and sizes. 
- */ - down_write(&ni->runlist.lock); - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx " - "(error code %d).%s", vi->i_ino, err, te); - ctx = NULL; - m = NULL; - goto old_bad_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - ntfs_error(vi->i_sb, "Failed to allocate a search context for " - "inode 0x%lx (not enough memory).%s", - vi->i_ino, te); - err = -ENOMEM; - goto old_bad_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(vi->i_sb, "Open attribute is missing from " - "mft record. Inode 0x%lx is corrupt. " - "Run chkdsk.%s", vi->i_ino, te); - err = -EIO; - } else - ntfs_error(vi->i_sb, "Failed to lookup attribute in " - "inode 0x%lx (error code %d).%s", - vi->i_ino, err, te); - goto old_bad_out; - } - m = ctx->mrec; - a = ctx->attr; - /* - * The i_size of the vfs inode is the new size for the attribute value. - */ - new_size = i_size_read(vi); - /* The current size of the attribute value is the old size. */ - old_size = ntfs_attr_size(a); - /* Calculate the new allocated size. */ - if (NInoNonResident(ni)) - new_alloc_size = (new_size + vol->cluster_size - 1) & - ~(s64)vol->cluster_size_mask; - else - new_alloc_size = (new_size + 7) & ~7; - /* The current allocated size is the old allocated size. */ - read_lock_irqsave(&ni->size_lock, flags); - old_alloc_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - /* - * The change in the file size. This will be 0 if no change, >0 if the - * size is growing, and <0 if the size is shrinking. - */ - size_change = -1; - if (new_size - old_size >= 0) { - size_change = 1; - if (new_size == old_size) - size_change = 0; - } - /* As above for the allocated size. 
*/ - alloc_change = -1; - if (new_alloc_size - old_alloc_size >= 0) { - alloc_change = 1; - if (new_alloc_size == old_alloc_size) - alloc_change = 0; - } - /* - * If neither the size nor the allocation are being changed there is - * nothing to do. - */ - if (!size_change && !alloc_change) - goto unm_done; - /* If the size is changing, check if new size is allowed in $AttrDef. */ - if (size_change) { - err = ntfs_attr_size_bounds_check(vol, ni->type, new_size); - if (unlikely(err)) { - if (err == -ERANGE) { - ntfs_error(vol->sb, "Truncate would cause the " - "inode 0x%lx to %simum size " - "for its attribute type " - "(0x%x). Aborting truncate.", - vi->i_ino, - new_size > old_size ? "exceed " - "the max" : "go under the min", - le32_to_cpu(ni->type)); - err = -EFBIG; - } else { - ntfs_error(vol->sb, "Inode 0x%lx has unknown " - "attribute type 0x%x. " - "Aborting truncate.", - vi->i_ino, - le32_to_cpu(ni->type)); - err = -EIO; - } - /* Reset the vfs inode size to the old size. */ - i_size_write(vi, old_size); - goto err_out; - } - } - if (NInoCompressed(ni) || NInoEncrypted(ni)) { - ntfs_warning(vi->i_sb, "Changes in inode size are not " - "supported yet for %s files, ignoring.", - NInoCompressed(ni) ? "compressed" : - "encrypted"); - err = -EOPNOTSUPP; - goto bad_out; - } - if (a->non_resident) - goto do_non_resident_truncate; - BUG_ON(NInoNonResident(ni)); - /* Resize the attribute record to best fit the new attribute size. */ - if (new_size < vol->mft_record_size && - !ntfs_resident_attr_value_resize(m, a, new_size)) { - /* The resize succeeded! */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - write_lock_irqsave(&ni->size_lock, flags); - /* Update the sizes in the ntfs inode and all is done. */ - ni->allocated_size = le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset); - /* - * Note ntfs_resident_attr_value_resize() has already done any - * necessary data clearing in the attribute record. 
When the - * file is being shrunk vmtruncate() will already have cleared - * the top part of the last partial page, i.e. since this is - * the resident case this is the page with index 0. However, - * when the file is being expanded, the page cache page data - * between the old data_size, i.e. old_size, and the new_size - * has not been zeroed. Fortunately, we do not need to zero it - * either since on one hand it will either already be zero due - * to both read_folio and writepage clearing partial page data - * beyond i_size in which case there is nothing to do or in the - * case of the file being mmap()ped at the same time, POSIX - * specifies that the behaviour is unspecified thus we do not - * have to do anything. This means that in our implementation - * in the rare case that the file is mmap()ped and a write - * occurred into the mmap()ped region just beyond the file size - * and writepage has not yet been called to write out the page - * (which would clear the area beyond the file size) and we now - * extend the file size to incorporate this dirty region - * outside the file size, a write of the page would result in - * this data being written to disk instead of being cleared. - * Given both POSIX and the Linux mmap(2) man page specify that - * this corner case is undefined, we choose to leave it like - * that as this is much simpler for us as we cannot lock the - * relevant page now since we are holding too many ntfs locks - * which would result in a lock reversal deadlock. - */ - ni->initialized_size = new_size; - write_unlock_irqrestore(&ni->size_lock, flags); - goto unm_done; - } - /* If the above resize failed, this must be an attribute extension. */ - BUG_ON(size_change < 0); - /* - * We have to drop all the locks so we can call - * ntfs_attr_make_non_resident(). This could be optimised by try- - * locking the first page cache page and only if that fails dropping - * the locks, locking the page, and redoing all the locking and - * lookups. 
While this would be a huge optimisation, it is not worth - * it as this is definitely a slow code path as it only ever can happen - * once for any given file. - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - /* - * Not enough space in the mft record, try to make the attribute - * non-resident and if successful restart the truncation process. - */ - err = ntfs_attr_make_non_resident(ni, old_size); - if (likely(!err)) - goto retry_truncate; - /* - * Could not make non-resident. If this is due to this not being - * permitted for this attribute type or there not being enough space, - * try to make other attributes non-resident. Otherwise fail. - */ - if (unlikely(err != -EPERM && err != -ENOSPC)) { - ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute " - "type 0x%x, because the conversion from " - "resident to non-resident attribute failed " - "with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err != -ENOMEM) - err = -EIO; - goto conv_err_out; - } - /* TODO: Not implemented from here, abort. */ - if (err == -ENOSPC) - ntfs_error(vol->sb, "Not enough space in the mft record/on " - "disk for the non-resident attribute value. " - "This case is not implemented yet."); - else /* if (err == -EPERM) */ - ntfs_error(vol->sb, "This attribute type may not be " - "non-resident. This case is not implemented " - "yet."); - err = -EOPNOTSUPP; - goto conv_err_out; -#if 0 - // TODO: Attempt to make other attributes non-resident. - if (!err) - goto do_resident_extend; - /* - * Both the attribute list attribute and the standard information - * attribute must remain in the base inode. Thus, if this is one of - * these attributes, we have to try to move other attributes out into - * extent mft records instead. - */ - if (ni->type == AT_ATTRIBUTE_LIST || - ni->type == AT_STANDARD_INFORMATION) { - // TODO: Attempt to move other attributes into extent mft - // records. 
- err = -EOPNOTSUPP; - if (!err) - goto do_resident_extend; - goto err_out; - } - // TODO: Attempt to move this attribute to an extent mft record, but - // only if it is not already the only attribute in an mft record in - // which case there would be nothing to gain. - err = -EOPNOTSUPP; - if (!err) - goto do_resident_extend; - /* There is nothing we can do to make enough space. )-: */ - goto err_out; -#endif -do_non_resident_truncate: - BUG_ON(!NInoNonResident(ni)); - if (alloc_change < 0) { - highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); - if (highest_vcn > 0 && - old_alloc_size >> vol->cluster_size_bits > - highest_vcn + 1) { - /* - * This attribute has multiple extents. Not yet - * supported. - */ - ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, " - "attribute type 0x%x, because the " - "attribute is highly fragmented (it " - "consists of multiple extents) and " - "this case is not implemented yet.", - vi->i_ino, - (unsigned)le32_to_cpu(ni->type)); - err = -EOPNOTSUPP; - goto bad_out; - } - } - /* - * If the size is shrinking, need to reduce the initialized_size and - * the data_size before reducing the allocation. - */ - if (size_change < 0) { - /* - * Make the valid size smaller (i_size is already up-to-date). - */ - write_lock_irqsave(&ni->size_lock, flags); - if (new_size < ni->initialized_size) { - ni->initialized_size = new_size; - a->data.non_resident.initialized_size = - cpu_to_sle64(new_size); - } - a->data.non_resident.data_size = cpu_to_sle64(new_size); - write_unlock_irqrestore(&ni->size_lock, flags); - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - /* If the allocated size is not changing, we are done. */ - if (!alloc_change) - goto unm_done; - /* - * If the size is shrinking it makes no sense for the - * allocation to be growing. 
- */ - BUG_ON(alloc_change > 0); - } else /* if (size_change >= 0) */ { - /* - * The file size is growing or staying the same but the - * allocation can be shrinking, growing or staying the same. - */ - if (alloc_change > 0) { - /* - * We need to extend the allocation and possibly update - * the data size. If we are updating the data size, - * since we are not touching the initialized_size we do - * not need to worry about the actual data on disk. - * And as far as the page cache is concerned, there - * will be no pages beyond the old data size and any - * partial region in the last page between the old and - * new data size (or the end of the page if the new - * data size is outside the page) does not need to be - * modified as explained above for the resident - * attribute truncate case. To do this, we simply drop - * the locks we hold and leave all the work to our - * friendly helper ntfs_attr_extend_allocation(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - err = ntfs_attr_extend_allocation(ni, new_size, - size_change > 0 ? new_size : -1, -1); - /* - * ntfs_attr_extend_allocation() will have done error - * output already. - */ - goto done; - } - if (!alloc_change) - goto alloc_done; - } - /* alloc_change < 0 */ - /* Free the clusters. */ - nr_freed = ntfs_cluster_free(ni, new_alloc_size >> - vol->cluster_size_bits, -1, ctx); - m = ctx->mrec; - a = ctx->attr; - if (unlikely(nr_freed < 0)) { - ntfs_error(vol->sb, "Failed to release cluster(s) (error code " - "%lli). Unmount and run chkdsk to recover " - "the lost cluster(s).", (long long)nr_freed); - NVolSetErrors(vol); - nr_freed = 0; - } - /* Truncate the runlist. 
*/ - err = ntfs_rl_truncate_nolock(vol, &ni->runlist, - new_alloc_size >> vol->cluster_size_bits); - /* - * If the runlist truncation failed and/or the search context is no - * longer valid, we cannot resize the attribute record or build the - * mapping pairs array thus we mark the inode bad so that no access to - * the freed clusters can happen. - */ - if (unlikely(err || IS_ERR(m))) { - ntfs_error(vol->sb, "Failed to %s (error code %li).%s", - IS_ERR(m) ? - "restore attribute search context" : - "truncate attribute runlist", - IS_ERR(m) ? PTR_ERR(m) : err, es); - err = -EIO; - goto bad_out; - } - /* Get the size for the shrunk mapping pairs array for the runlist. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1); - if (unlikely(mp_size <= 0)) { - ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " - "attribute type 0x%x, because determining the " - "size for the mapping pairs failed with error " - "code %i.%s", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), mp_size, es); - err = -EIO; - goto bad_out; - } - /* - * Shrink the attribute record for the new mapping pairs array. Note, - * this cannot fail since we are making the attribute smaller thus by - * definition there is enough space to do so. - */ - err = ntfs_attr_record_resize(m, a, mp_size + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); - BUG_ON(err); - /* - * Generate the mapping pairs array directly into the attribute record. - */ - err = ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset), - mp_size, ni->runlist.rl, 0, -1, NULL); - if (unlikely(err)) { - ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " - "attribute type 0x%x, because building the " - "mapping pairs failed with error code %i.%s", - vi->i_ino, (unsigned)le32_to_cpu(ni->type), - err, es); - err = -EIO; - goto bad_out; - } - /* Update the allocated/compressed size as well as the highest vcn. 
*/ - a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> - vol->cluster_size_bits) - 1); - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = new_alloc_size; - a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); - if (NInoSparse(ni) || NInoCompressed(ni)) { - if (nr_freed) { - ni->itype.compressed.size -= nr_freed << - vol->cluster_size_bits; - BUG_ON(ni->itype.compressed.size < 0); - a->data.non_resident.compressed_size = cpu_to_sle64( - ni->itype.compressed.size); - vi->i_blocks = ni->itype.compressed.size >> 9; - } - } else - vi->i_blocks = new_alloc_size >> 9; - write_unlock_irqrestore(&ni->size_lock, flags); - /* - * We have shrunk the allocation. If this is a shrinking truncate we - * have already dealt with the initialized_size and the data_size above - * and we are done. If the truncate is only changing the allocation - * and not the data_size, we are also done. If this is an extending - * truncate, need to extend the data_size now which is ensured by the - * fact that @size_change is positive. - */ -alloc_done: - /* - * If the size is growing, need to update it now. If it is shrinking, - * we have already updated it above (before the allocation change). - */ - if (size_change > 0) - a->data.non_resident.data_size = cpu_to_sle64(new_size); - /* Ensure the modified mft record is written out. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); -unm_done: - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); -done: - /* Update the mtime and ctime on the base inode. */ - /* normally ->truncate shouldn't update ctime or mtime, - * but ntfs did before so it got a copy & paste version - * of file_update_time. one day someone should fix this - * for real. 
- */ - if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { - struct timespec64 now = current_time(VFS_I(base_ni)); - struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni)); - struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni)); - int sync_it = 0; - - if (!timespec64_equal(&mtime, &now) || - !timespec64_equal(&ctime, &now)) - sync_it = 1; - inode_set_ctime_to_ts(VFS_I(base_ni), now); - inode_set_mtime_to_ts(VFS_I(base_ni), now); - - if (sync_it) - mark_inode_dirty_sync(VFS_I(base_ni)); - } - - if (likely(!err)) { - NInoClearTruncateFailed(ni); - ntfs_debug("Done."); - } - return err; -old_bad_out: - old_size = -1; -bad_out: - if (err != -ENOMEM && err != -EOPNOTSUPP) - NVolSetErrors(vol); - if (err != -EOPNOTSUPP) - NInoSetTruncateFailed(ni); - else if (old_size >= 0) - i_size_write(vi, old_size); -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); -out: - ntfs_debug("Failed. Returning error code %i.", err); - return err; -conv_err_out: - if (err != -ENOMEM && err != -EOPNOTSUPP) - NVolSetErrors(vol); - if (err != -EOPNOTSUPP) - NInoSetTruncateFailed(ni); - else - i_size_write(vi, old_size); - goto out; -} - -/** - * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value - * @vi: inode for which the i_size was changed - * - * Wrapper for ntfs_truncate() that has no return value. - * - * See ntfs_truncate() description above for details. - */ -#ifdef NTFS_RW -void ntfs_truncate_vfs(struct inode *vi) { - ntfs_truncate(vi); -} -#endif - -/** - * ntfs_setattr - called from notify_change() when an attribute is being changed - * @idmap: idmap of the mount the inode was found from - * @dentry: dentry whose attributes to change - * @attr: structure describing the attributes and the changes - * - * We have to trap VFS attempts to truncate the file described by @dentry as - * soon as possible, because we do not implement changes in i_size yet. 
So we - * abort all i_size changes here. - * - * We also abort all changes of user, group, and mode as we do not implement - * the NTFS ACLs yet. - * - * Called with ->i_mutex held. - */ -int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct iattr *attr) -{ - struct inode *vi = d_inode(dentry); - int err; - unsigned int ia_valid = attr->ia_valid; - - err = setattr_prepare(&nop_mnt_idmap, dentry, attr); - if (err) - goto out; - /* We do not support NTFS ACLs yet. */ - if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) { - ntfs_warning(vi->i_sb, "Changes in user/group/mode are not " - "supported yet, ignoring."); - err = -EOPNOTSUPP; - goto out; - } - if (ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(vi)) { - ntfs_inode *ni = NTFS_I(vi); - /* - * FIXME: For now we do not support resizing of - * compressed or encrypted files yet. - */ - if (NInoCompressed(ni) || NInoEncrypted(ni)) { - ntfs_warning(vi->i_sb, "Changes in inode size " - "are not supported yet for " - "%s files, ignoring.", - NInoCompressed(ni) ? - "compressed" : "encrypted"); - err = -EOPNOTSUPP; - } else { - truncate_setsize(vi, attr->ia_size); - ntfs_truncate_vfs(vi); - } - if (err || ia_valid == ATTR_SIZE) - goto out; - } else { - /* - * We skipped the truncate but must still update - * timestamps. - */ - ia_valid |= ATTR_MTIME | ATTR_CTIME; - } - } - if (ia_valid & ATTR_ATIME) - inode_set_atime_to_ts(vi, attr->ia_atime); - if (ia_valid & ATTR_MTIME) - inode_set_mtime_to_ts(vi, attr->ia_mtime); - if (ia_valid & ATTR_CTIME) - inode_set_ctime_to_ts(vi, attr->ia_ctime); - mark_inode_dirty(vi); -out: - return err; -} - -/** - * __ntfs_write_inode - write out a dirty inode - * @vi: inode to write out - * @sync: if true, write out synchronously - * - * Write out a dirty inode to disk including any extent inodes if present. - * - * If @sync is true, commit the inode to disk and wait for io completion. This - * is done using write_mft_record(). 
- * - * If @sync is false, just schedule the write to happen but do not wait for i/o - * completion. In 2.6 kernels, scheduling usually happens just by virtue of - * marking the page (and in this case mft record) dirty but we do not implement - * this yet as write_mft_record() largely ignores the @sync parameter and - * always performs synchronous writes. - * - * Return 0 on success and -errno on error. - */ -int __ntfs_write_inode(struct inode *vi, int sync) -{ - sle64 nt; - ntfs_inode *ni = NTFS_I(vi); - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - STANDARD_INFORMATION *si; - int err = 0; - bool modified = false; - - ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "", - vi->i_ino); - /* - * Dirty attribute inodes are written via their real inodes so just - * clean them here. Access time updates are taken care off when the - * real inode is written. - */ - if (NInoAttr(ni)) { - NInoClearDirty(ni); - ntfs_debug("Done."); - return 0; - } - /* Map, pin, and lock the mft record belonging to the inode. */ - m = map_mft_record(ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - /* Update the access times in the standard information attribute. */ - ctx = ntfs_attr_get_search_ctx(ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto unm_err_out; - } - err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - ntfs_attr_put_search_ctx(ctx); - goto unm_err_out; - } - si = (STANDARD_INFORMATION*)((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - /* Update the access times if they have changed. 
*/ - nt = utc2ntfs(inode_get_mtime(vi)); - if (si->last_data_change_time != nt) { - ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " - "new = 0x%llx", vi->i_ino, (long long) - sle64_to_cpu(si->last_data_change_time), - (long long)sle64_to_cpu(nt)); - si->last_data_change_time = nt; - modified = true; - } - nt = utc2ntfs(inode_get_ctime(vi)); - if (si->last_mft_change_time != nt) { - ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " - "new = 0x%llx", vi->i_ino, (long long) - sle64_to_cpu(si->last_mft_change_time), - (long long)sle64_to_cpu(nt)); - si->last_mft_change_time = nt; - modified = true; - } - nt = utc2ntfs(inode_get_atime(vi)); - if (si->last_access_time != nt) { - ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " - "new = 0x%llx", vi->i_ino, - (long long)sle64_to_cpu(si->last_access_time), - (long long)sle64_to_cpu(nt)); - si->last_access_time = nt; - modified = true; - } - /* - * If we just modified the standard information attribute we need to - * mark the mft record it is in dirty. We do this manually so that - * mark_inode_dirty() is not called which would redirty the inode and - * hence result in an infinite loop of trying to write the inode. - * There is no need to mark the base inode nor the base mft record - * dirty, since we are going to write this mft record below in any case - * and the base mft record may actually not have been modified so it - * might not need to be written out. - * NOTE: It is not a problem when the inode for $MFT itself is being - * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES - * on the $MFT inode and hence __ntfs_write_inode() will not be - * re-invoked because of it which in turn is ok since the dirtied mft - * record will be cleaned and written out to disk below, i.e. before - * this function returns. 
- */ - if (modified) { - flush_dcache_mft_record_page(ctx->ntfs_ino); - if (!NInoTestSetDirty(ctx->ntfs_ino)) - mark_ntfs_record_dirty(ctx->ntfs_ino->page, - ctx->ntfs_ino->page_ofs); - } - ntfs_attr_put_search_ctx(ctx); - /* Now the access times are updated, write the base mft record. */ - if (NInoDirty(ni)) - err = write_mft_record(ni, m, sync); - /* Write all attached extent mft records. */ - mutex_lock(&ni->extent_lock); - if (ni->nr_extents > 0) { - ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos; - int i; - - ntfs_debug("Writing %i extent inodes.", ni->nr_extents); - for (i = 0; i < ni->nr_extents; i++) { - ntfs_inode *tni = extent_nis[i]; - - if (NInoDirty(tni)) { - MFT_RECORD *tm = map_mft_record(tni); - int ret; - - if (IS_ERR(tm)) { - if (!err || err == -ENOMEM) - err = PTR_ERR(tm); - continue; - } - ret = write_mft_record(tni, tm, sync); - unmap_mft_record(tni); - if (unlikely(ret)) { - if (!err || err == -ENOMEM) - err = ret; - } - } - } - } - mutex_unlock(&ni->extent_lock); - unmap_mft_record(ni); - if (unlikely(err)) - goto err_out; - ntfs_debug("Done."); - return 0; -unm_err_out: - unmap_mft_record(ni); -err_out: - if (err == -ENOMEM) { - ntfs_warning(vi->i_sb, "Not enough memory to write inode. " - "Marking the inode dirty again, so the VFS " - "retries later."); - mark_inode_dirty(vi); - } else { - ntfs_error(vi->i_sb, "Failed (error %i): Run chkdsk.", -err); - NVolSetErrors(ni->vol); - } - return err; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h deleted file mode 100644 index 147ef4ddb691..000000000000 --- a/fs/ntfs/inode.h +++ /dev/null @@ -1,310 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of - * the Linux-NTFS project. 
- * - * Copyright (c) 2001-2007 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_INODE_H -#define _LINUX_NTFS_INODE_H - -#include <linux/atomic.h> - -#include <linux/fs.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <linux/mutex.h> -#include <linux/seq_file.h> - -#include "layout.h" -#include "volume.h" -#include "types.h" -#include "runlist.h" -#include "debug.h" - -typedef struct _ntfs_inode ntfs_inode; - -/* - * The NTFS in-memory inode structure. It is just used as an extension to the - * fields already provided in the VFS inode. - */ -struct _ntfs_inode { - rwlock_t size_lock; /* Lock serializing access to inode sizes. */ - s64 initialized_size; /* Copy from the attribute record. */ - s64 allocated_size; /* Copy from the attribute record. */ - unsigned long state; /* NTFS specific flags describing this inode. - See ntfs_inode_state_bits below. */ - unsigned long mft_no; /* Number of the mft record / inode. */ - u16 seq_no; /* Sequence number of the mft record. */ - atomic_t count; /* Inode reference count for book keeping. */ - ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */ - /* - * If NInoAttr() is true, the below fields describe the attribute which - * this fake inode belongs to. The actual inode of this attribute is - * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see - * below). For real inodes, we also set the type (AT_DATA for files and - * AT_INDEX_ALLOCATION for directories), with the name = NULL and - * name_len = 0 for files and name = I30 (global constant) and - * name_len = 4 for directories. - */ - ATTR_TYPE type; /* Attribute type of this fake inode. */ - ntfschar *name; /* Attribute name of this fake inode. */ - u32 name_len; /* Attribute name length of this fake inode. 
*/ - runlist runlist; /* If state has the NI_NonResident bit set, - the runlist of the unnamed data attribute - (if a file) or of the index allocation - attribute (directory) or of the attribute - described by the fake inode (if NInoAttr()). - If runlist.rl is NULL, the runlist has not - been read in yet or has been unmapped. If - NI_NonResident is clear, the attribute is - resident (file and fake inode) or there is - no $I30 index allocation attribute - (small directory). In the latter case - runlist.rl is always NULL.*/ - /* - * The following fields are only valid for real inodes and extent - * inodes. - */ - struct mutex mrec_lock; /* Lock for serializing access to the - mft record belonging to this inode. */ - struct page *page; /* The page containing the mft record of the - inode. This should only be touched by the - (un)map_mft_record*() functions. */ - int page_ofs; /* Offset into the page at which the mft record - begins. This should only be touched by the - (un)map_mft_record*() functions. */ - /* - * Attribute list support (only for use by the attribute lookup - * functions). Setup during read_inode for all inodes with attribute - * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is - * further only valid if NI_AttrListNonResident is set. - */ - u32 attr_list_size; /* Length of attribute list value in bytes. */ - u8 *attr_list; /* Attribute list value itself. */ - runlist attr_list_rl; /* Run list for the attribute list value. */ - union { - struct { /* It is a directory, $MFT, or an index inode. */ - u32 block_size; /* Size of an index block. */ - u32 vcn_size; /* Size of a vcn in this - index. */ - COLLATION_RULE collation_rule; /* The collation rule - for the index. */ - u8 block_size_bits; /* Log2 of the above. */ - u8 vcn_size_bits; /* Log2 of the above. */ - } index; - struct { /* It is a compressed/sparse file/attribute inode. */ - s64 size; /* Copy of compressed_size from - $DATA. 
*/ - u32 block_size; /* Size of a compression block - (cb). */ - u8 block_size_bits; /* Log2 of the size of a cb. */ - u8 block_clusters; /* Number of clusters per cb. */ - } compressed; - } itype; - struct mutex extent_lock; /* Lock for accessing/modifying the - below . */ - s32 nr_extents; /* For a base mft record, the number of attached extent - inodes (0 if none), for extent records and for fake - inodes describing an attribute this is -1. */ - union { /* This union is only used if nr_extents != 0. */ - ntfs_inode **extent_ntfs_inos; /* For nr_extents > 0, array of - the ntfs inodes of the extent - mft records belonging to - this base inode which have - been loaded. */ - ntfs_inode *base_ntfs_ino; /* For nr_extents == -1, the - ntfs inode of the base mft - record. For fake inodes, the - real (base) inode to which - the attribute belongs. */ - } ext; -}; - -/* - * Defined bits for the state field in the ntfs_inode structure. - * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only - */ -typedef enum { - NI_Dirty, /* 1: Mft record needs to be written to disk. */ - NI_AttrList, /* 1: Mft record contains an attribute list. */ - NI_AttrListNonResident, /* 1: Attribute list is non-resident. Implies - NI_AttrList is set. */ - - NI_Attr, /* 1: Fake inode for attribute i/o. - 0: Real inode or extent inode. */ - - NI_MstProtected, /* 1: Attribute is protected by MST fixups. - 0: Attribute is not protected by fixups. */ - NI_NonResident, /* 1: Unnamed data attr is non-resident (f). - 1: Attribute is non-resident (a). */ - NI_IndexAllocPresent = NI_NonResident, /* 1: $I30 index alloc attr is - present (d). */ - NI_Compressed, /* 1: Unnamed data attr is compressed (f). - 1: Create compressed files by default (d). - 1: Attribute is compressed (a). */ - NI_Encrypted, /* 1: Unnamed data attr is encrypted (f). - 1: Create encrypted files by default (d). - 1: Attribute is encrypted (a). */ - NI_Sparse, /* 1: Unnamed data attr is sparse (f). 
- 1: Create sparse files by default (d). - 1: Attribute is sparse (a). */ - NI_SparseDisabled, /* 1: May not create sparse regions. */ - NI_TruncateFailed, /* 1: Last ntfs_truncate() call failed. */ -} ntfs_inode_state_bits; - -/* - * NOTE: We should be adding dirty mft records to a list somewhere and they - * should be independent of the (ntfs/vfs) inode structure so that an inode can - * be removed but the record can be left dirty for syncing later. - */ - -/* - * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo() - * functions. - */ -#define NINO_FNS(flag) \ -static inline int NIno##flag(ntfs_inode *ni) \ -{ \ - return test_bit(NI_##flag, &(ni)->state); \ -} \ -static inline void NInoSet##flag(ntfs_inode *ni) \ -{ \ - set_bit(NI_##flag, &(ni)->state); \ -} \ -static inline void NInoClear##flag(ntfs_inode *ni) \ -{ \ - clear_bit(NI_##flag, &(ni)->state); \ -} - -/* - * As above for NInoTestSetFoo() and NInoTestClearFoo(). - */ -#define TAS_NINO_FNS(flag) \ -static inline int NInoTestSet##flag(ntfs_inode *ni) \ -{ \ - return test_and_set_bit(NI_##flag, &(ni)->state); \ -} \ -static inline int NInoTestClear##flag(ntfs_inode *ni) \ -{ \ - return test_and_clear_bit(NI_##flag, &(ni)->state); \ -} - -/* Emit the ntfs inode bitops functions. */ -NINO_FNS(Dirty) -TAS_NINO_FNS(Dirty) -NINO_FNS(AttrList) -NINO_FNS(AttrListNonResident) -NINO_FNS(Attr) -NINO_FNS(MstProtected) -NINO_FNS(NonResident) -NINO_FNS(IndexAllocPresent) -NINO_FNS(Compressed) -NINO_FNS(Encrypted) -NINO_FNS(Sparse) -NINO_FNS(SparseDisabled) -NINO_FNS(TruncateFailed) - -/* - * The full structure containing a ntfs_inode and a vfs struct inode. Used for - * all real and fake inodes but not for extent inodes which lack the vfs struct - * inode. - */ -typedef struct { - ntfs_inode ntfs_inode; - struct inode vfs_inode; /* The vfs inode structure. 
*/ -} big_ntfs_inode; - -/** - * NTFS_I - return the ntfs inode given a vfs inode - * @inode: VFS inode - * - * NTFS_I() returns the ntfs inode associated with the VFS @inode. - */ -static inline ntfs_inode *NTFS_I(struct inode *inode) -{ - return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode); -} - -static inline struct inode *VFS_I(ntfs_inode *ni) -{ - return &((big_ntfs_inode *)ni)->vfs_inode; -} - -/** - * ntfs_attr - ntfs in memory attribute structure - * @mft_no: mft record number of the base mft record of this attribute - * @name: Unicode name of the attribute (NULL if unnamed) - * @name_len: length of @name in Unicode characters (0 if unnamed) - * @type: attribute type (see layout.h) - * - * This structure exists only to provide a small structure for the - * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism. - * - * NOTE: Elements are ordered by size to make the structure as compact as - * possible on all architectures. - */ -typedef struct { - unsigned long mft_no; - ntfschar *name; - u32 name_len; - ATTR_TYPE type; -} ntfs_attr; - -extern int ntfs_test_inode(struct inode *vi, void *data); - -extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no); -extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, - ntfschar *name, u32 name_len); -extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, - u32 name_len); - -extern struct inode *ntfs_alloc_big_inode(struct super_block *sb); -extern void ntfs_free_big_inode(struct inode *inode); -extern void ntfs_evict_big_inode(struct inode *vi); - -extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni); - -static inline void ntfs_init_big_inode(struct inode *vi) -{ - ntfs_inode *ni = NTFS_I(vi); - - ntfs_debug("Entering."); - __ntfs_init_inode(vi->i_sb, ni); - ni->mft_no = vi->i_ino; -} - -extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, - unsigned long mft_no); -extern void 
ntfs_clear_extent_inode(ntfs_inode *ni); - -extern int ntfs_read_inode_mount(struct inode *vi); - -extern int ntfs_show_options(struct seq_file *sf, struct dentry *root); - -#ifdef NTFS_RW - -extern int ntfs_truncate(struct inode *vi); -extern void ntfs_truncate_vfs(struct inode *vi); - -extern int ntfs_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, struct iattr *attr); - -extern int __ntfs_write_inode(struct inode *vi, int sync); - -static inline void ntfs_commit_inode(struct inode *vi) -{ - if (!is_bad_inode(vi)) - __ntfs_write_inode(vi, 1); - return; -} - -#else - -static inline void ntfs_truncate_vfs(struct inode *vi) {} - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_INODE_H */ diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h deleted file mode 100644 index 5d4bf7a3259f..000000000000 --- a/fs/ntfs/layout.h +++ /dev/null @@ -1,2421 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS - * project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_LAYOUT_H -#define _LINUX_NTFS_LAYOUT_H - -#include <linux/types.h> -#include <linux/bitops.h> -#include <linux/list.h> -#include <asm/byteorder.h> - -#include "types.h" - -/* The NTFS oem_id "NTFS " */ -#define magicNTFS cpu_to_le64(0x202020205346544eULL) - -/* - * Location of bootsector on partition: - * The standard NTFS_BOOT_SECTOR is on sector 0 of the partition. - * On NT4 and above there is one backup copy of the boot sector to - * be found on the last sector of the partition (not normally accessible - * from within Windows as the bootsector contained number of sectors - * value is one less than the actual value!). - * On versions of NT 3.51 and earlier, the backup copy was located at - * number of sectors/2 (integer divide), i.e. in the middle of the volume. - */ - -/* - * BIOS parameter block (bpb) structure. 
- */ -typedef struct { - le16 bytes_per_sector; /* Size of a sector in bytes. */ - u8 sectors_per_cluster; /* Size of a cluster in sectors. */ - le16 reserved_sectors; /* zero */ - u8 fats; /* zero */ - le16 root_entries; /* zero */ - le16 sectors; /* zero */ - u8 media_type; /* 0xf8 = hard disk */ - le16 sectors_per_fat; /* zero */ - le16 sectors_per_track; /* irrelevant */ - le16 heads; /* irrelevant */ - le32 hidden_sectors; /* zero */ - le32 large_sectors; /* zero */ -} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK; - -/* - * NTFS boot sector structure. - */ -typedef struct { - u8 jump[3]; /* Irrelevant (jump to boot up code).*/ - le64 oem_id; /* Magic "NTFS ". */ - BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */ - u8 unused[4]; /* zero, NTFS diskedit.exe states that - this is actually: - __u8 physical_drive; // 0x80 - __u8 current_head; // zero - __u8 extended_boot_signature; - // 0x80 - __u8 unused; // zero - */ -/*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives - maximum volume size of 2^63 sectors. - Assuming standard sector size of 512 - bytes, the maximum byte size is - approx. 4.7x10^21 bytes. (-; */ - sle64 mft_lcn; /* Cluster location of mft data. */ - sle64 mftmirr_lcn; /* Cluster location of copy of mft. */ - s8 clusters_per_mft_record; /* Mft record size in clusters. */ - u8 reserved0[3]; /* zero */ - s8 clusters_per_index_record; /* Index block size in clusters. */ - u8 reserved1[3]; /* zero */ - le64 volume_serial_number; /* Irrelevant (serial number). */ - le32 checksum; /* Boot sector checksum. */ -/*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */ - le16 end_of_sector_marker; /* End of bootsector magic. Always is - 0xaa55 in little endian. */ -/* sizeof() = 512 (0x200) bytes */ -} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR; - -/* - * Magic identifiers present at the beginning of all ntfs record containing - * records (like mft records for example). - */ -enum { - /* Found in $MFT/$DATA. 
*/ - magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */ - magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */ - magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ - - /* Found in $LogFile/$DATA. */ - magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */ - magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */ - - /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ - magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ - - /* Found in all ntfs record containing records. */ - magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector - transfer was detected. */ - /* - * Found in $LogFile/$DATA when a page is full of 0xff bytes and is - * thus not initialized. Page must be initialized before using it. - */ - magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */ -}; - -typedef le32 NTFS_RECORD_TYPE; - -/* - * Generic magic comparison macros. Finally found a use for the ## preprocessor - * operator! (-8 - */ - -static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r) -{ - return (x == r); -} -#define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m) - -static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r) -{ - return (*p == r); -} -#define ntfs_is_magicp(p, m) __ntfs_is_magicp(p, magic_##m) - -/* - * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above. 
- */ -#define ntfs_is_file_record(x) ( ntfs_is_magic (x, FILE) ) -#define ntfs_is_file_recordp(p) ( ntfs_is_magicp(p, FILE) ) -#define ntfs_is_mft_record(x) ( ntfs_is_file_record (x) ) -#define ntfs_is_mft_recordp(p) ( ntfs_is_file_recordp(p) ) -#define ntfs_is_indx_record(x) ( ntfs_is_magic (x, INDX) ) -#define ntfs_is_indx_recordp(p) ( ntfs_is_magicp(p, INDX) ) -#define ntfs_is_hole_record(x) ( ntfs_is_magic (x, HOLE) ) -#define ntfs_is_hole_recordp(p) ( ntfs_is_magicp(p, HOLE) ) - -#define ntfs_is_rstr_record(x) ( ntfs_is_magic (x, RSTR) ) -#define ntfs_is_rstr_recordp(p) ( ntfs_is_magicp(p, RSTR) ) -#define ntfs_is_rcrd_record(x) ( ntfs_is_magic (x, RCRD) ) -#define ntfs_is_rcrd_recordp(p) ( ntfs_is_magicp(p, RCRD) ) - -#define ntfs_is_chkd_record(x) ( ntfs_is_magic (x, CHKD) ) -#define ntfs_is_chkd_recordp(p) ( ntfs_is_magicp(p, CHKD) ) - -#define ntfs_is_baad_record(x) ( ntfs_is_magic (x, BAAD) ) -#define ntfs_is_baad_recordp(p) ( ntfs_is_magicp(p, BAAD) ) - -#define ntfs_is_empty_record(x) ( ntfs_is_magic (x, empty) ) -#define ntfs_is_empty_recordp(p) ( ntfs_is_magicp(p, empty) ) - -/* - * The Update Sequence Array (usa) is an array of the le16 values which belong - * to the end of each sector protected by the update sequence record in which - * this array is contained. Note that the first entry is the Update Sequence - * Number (usn), a cyclic counter of how many times the protected record has - * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All - * last le16's of each sector have to be equal to the usn (during reading) or - * are set to it (during writing). If they are not, an incomplete multi sector - * transfer has occurred when the data was written. - * The maximum size for the update sequence array is fixed to: - * maximum size = usa_ofs + (usa_count * 2) = 510 bytes - * The 510 bytes comes from the fact that the last le16 in the array has to - * (obviously) finish before the last le16 of the first 512-byte sector. 
- * This formula can be used as a consistency check in that usa_ofs + - * (usa_count * 2) has to be less than or equal to 510. - */ -typedef struct { - NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record - type and/or status. */ - le16 usa_ofs; /* Offset to the Update Sequence Array (usa) - from the start of the ntfs record. */ - le16 usa_count; /* Number of le16 sized entries in the usa - including the Update Sequence Number (usn), - thus the number of fixups is the usa_count - minus 1. */ -} __attribute__ ((__packed__)) NTFS_RECORD; - -/* - * System files mft record numbers. All these files are always marked as used - * in the bitmap attribute of the mft; presumably in order to avoid accidental - * allocation for random other mft records. Also, the sequence number for each - * of the system files is always equal to their mft record number and it is - * never modified. - */ -typedef enum { - FILE_MFT = 0, /* Master file table (mft). Data attribute - contains the entries and bitmap attribute - records which ones are in use (bit==1). */ - FILE_MFTMirr = 1, /* Mft mirror: copy of first four mft records - in data attribute. If cluster size > 4kiB, - copy of first N mft records, with - N = cluster_size / mft_record_size. */ - FILE_LogFile = 2, /* Journalling log in data attribute. */ - FILE_Volume = 3, /* Volume name attribute and volume information - attribute (flags and ntfs version). Windows - refers to this file as volume DASD (Direct - Access Storage Device). */ - FILE_AttrDef = 4, /* Array of attribute definitions in data - attribute. */ - FILE_root = 5, /* Root directory. */ - FILE_Bitmap = 6, /* Allocation bitmap of all clusters (lcns) in - data attribute. */ - FILE_Boot = 7, /* Boot sector (always at cluster 0) in data - attribute. */ - FILE_BadClus = 8, /* Contains all bad clusters in the non-resident - data attribute. */ - FILE_Secure = 9, /* Shared security descriptors in data attribute - and two indexes into the descriptors. 
- Appeared in Windows 2000. Before that, this - file was named $Quota but was unused. */ - FILE_UpCase = 10, /* Uppercase equivalents of all 65536 Unicode - characters in data attribute. */ - FILE_Extend = 11, /* Directory containing other system files (eg. - $ObjId, $Quota, $Reparse and $UsnJrnl). This - is new to NTFS3.0. */ - FILE_reserved12 = 12, /* Reserved for future use (records 12-15). */ - FILE_reserved13 = 13, - FILE_reserved14 = 14, - FILE_reserved15 = 15, - FILE_first_user = 16, /* First user file, used as test limit for - whether to allow opening a file or not. */ -} NTFS_SYSTEM_FILES; - -/* - * These are the so far known MFT_RECORD_* flags (16-bit) which contain - * information about the mft record in which they are present. - */ -enum { - MFT_RECORD_IN_USE = cpu_to_le16(0x0001), - MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002), -} __attribute__ ((__packed__)); - -typedef le16 MFT_RECORD_FLAGS; - -/* - * mft references (aka file references or file record segment references) are - * used whenever a structure needs to refer to a record in the mft. - * - * A reference consists of a 48-bit index into the mft and a 16-bit sequence - * number used to detect stale references. - * - * For error reporting purposes we treat the 48-bit index as a signed quantity. - * - * The sequence number is a circular counter (skipping 0) describing how many - * times the referenced mft record has been (re)used. This has to match the - * sequence number of the mft record being referenced, otherwise the reference - * is considered stale and removed (FIXME: only ntfsck or the driver itself?). - * - * If the sequence number is zero it is assumed that no sequence number - * consistency checking should be performed. - * - * FIXME: Since inodes are 32-bit as of now, the driver needs to always check - * for high_part being 0 and if not either BUG(), cause a panic() or handle - * the situation in some other way. 
This shouldn't be a problem as a volume has - * to become HUGE in order to need more than 32-bits worth of mft records. - * Assuming the standard mft record size of 1kb only the records (never mind - * the non-resident attributes, etc.) would require 4Tb of space on their own - * for the first 32 bits worth of records. This is only if some strange person - * doesn't decide to foul play and make the mft sparse which would be a really - * horrible thing to do as it would trash our current driver implementation. )-: - * Do I hear screams "we want 64-bit inodes!" ?!? (-; - * - * FIXME: The mft zone is defined as the first 12% of the volume. This space is - * reserved so that the mft can grow contiguously and hence doesn't become - * fragmented. Volume free space includes the empty part of the mft zone and - * when the volume's free 88% are used up, the mft zone is shrunk by a factor - * of 2, thus making more space available for more files/data. This process is - * repeated every time there is no more free space except for the mft zone until - * there really is no more free space. - */ - -/* - * Typedef the MFT_REF as a 64-bit value for easier handling. - * Also define two unpacking macros to get to the reference (MREF) and - * sequence number (MSEQNO) respectively. - * The _LE versions are to be applied on little endian MFT_REFs. - * Note: The _LE versions will return a CPU endian formatted value! 
- */ -#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL -#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU) - -typedef u64 MFT_REF; -typedef le64 leMFT_REF; - -#define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \ - ((MFT_REF)(m) & MFT_REF_MASK_CPU))) -#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s)) - -#define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU)) -#define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff)) -#define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU)) -#define MSEQNO_LE(x) ((u16)((le64_to_cpu(x) >> 48) & 0xffff)) - -#define IS_ERR_MREF(x) (((x) & 0x0000800000000000ULL) ? true : false) -#define ERR_MREF(x) ((u64)((s64)(x))) -#define MREF_ERR(x) ((int)((s64)(x))) - -/* - * The mft record header present at the beginning of every record in the mft. - * This is followed by a sequence of variable length attribute records which - * is terminated by an attribute of type AT_END which is a truncated attribute - * in that it only consists of the attribute type code AT_END and none of the - * other members of the attribute structure are present. - */ -typedef struct { -/*Ofs*/ -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ - NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ - le16 usa_ofs; /* See NTFS_RECORD definition above. */ - le16 usa_count; /* See NTFS_RECORD definition above. */ - -/* 8*/ le64 lsn; /* $LogFile sequence number for this record. - Changed every time the record is modified. */ -/* 16*/ le16 sequence_number; /* Number of times this mft record has been - reused. (See description for MFT_REF - above.) NOTE: The increment (skipping zero) - is done when the file is deleted. NOTE: If - this is zero it is left zero. */ -/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of - directory entries referencing this record. - NOTE: Only used in mft base records. - NOTE: When deleting a directory entry we - check the link_count and if it is 1 we - delete the file. 
Otherwise we delete the - FILE_NAME_ATTR being referenced by the - directory entry from the mft record and - decrement the link_count. - FIXME: Careful with Win32 + DOS names! */ -/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this - mft record from the start of the mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file - is deleted, the MFT_RECORD_IN_USE flag is - set to zero. */ -/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft - record. This should be equal to the mft - record size. */ -/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. - When it is not zero it is a mft reference - pointing to the base mft record to which - this record belongs (this is then used to - locate the attribute list attribute present - in the base record which describes this - extension record and hence might need - modification when the extension record - itself is modified, also locating the - attribute list also means finding the other - potential extents, belonging to the non-base - mft record). */ -/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to - the next attribute added to this mft record. - NOTE: Incremented each time after it is used. - NOTE: Every time the mft record is reused - this number is set to zero. NOTE: The first - instance number is always 0. */ -/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */ -/* 42*/ le16 reserved; /* Reserved/alignment. */ -/* 44*/ le32 mft_record_number; /* Number of this mft record. */ -/* sizeof() = 48 bytes */ -/* - * When (re)using the mft record, we place the update sequence array at this - * offset, i.e. before we start with the attributes. 
This also makes sense, - * otherwise we could run into problems with the update sequence array - * containing in itself the last two bytes of a sector which would mean that - * multi sector transfer protection wouldn't work. As you can't protect data - * by overwriting it since you then can't get it back... - * When reading we obviously use the data from the ntfs record header. - */ -} __attribute__ ((__packed__)) MFT_RECORD; - -/* This is the version without the NTFS 3.1+ specific fields. */ -typedef struct { -/*Ofs*/ -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ - NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ - le16 usa_ofs; /* See NTFS_RECORD definition above. */ - le16 usa_count; /* See NTFS_RECORD definition above. */ - -/* 8*/ le64 lsn; /* $LogFile sequence number for this record. - Changed every time the record is modified. */ -/* 16*/ le16 sequence_number; /* Number of times this mft record has been - reused. (See description for MFT_REF - above.) NOTE: The increment (skipping zero) - is done when the file is deleted. NOTE: If - this is zero it is left zero. */ -/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of - directory entries referencing this record. - NOTE: Only used in mft base records. - NOTE: When deleting a directory entry we - check the link_count and if it is 1 we - delete the file. Otherwise we delete the - FILE_NAME_ATTR being referenced by the - directory entry from the mft record and - decrement the link_count. - FIXME: Careful with Win32 + DOS names! */ -/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this - mft record from the start of the mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file - is deleted, the MFT_RECORD_IN_USE flag is - set to zero. */ -/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. - NOTE: Must be aligned to 8-byte boundary. 
*/ -/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft - record. This should be equal to the mft - record size. */ -/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. - When it is not zero it is a mft reference - pointing to the base mft record to which - this record belongs (this is then used to - locate the attribute list attribute present - in the base record which describes this - extension record and hence might need - modification when the extension record - itself is modified, also locating the - attribute list also means finding the other - potential extents, belonging to the non-base - mft record). */ -/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to - the next attribute added to this mft record. - NOTE: Incremented each time after it is used. - NOTE: Every time the mft record is reused - this number is set to zero. NOTE: The first - instance number is always 0. */ -/* sizeof() = 42 bytes */ -/* - * When (re)using the mft record, we place the update sequence array at this - * offset, i.e. before we start with the attributes. This also makes sense, - * otherwise we could run into problems with the update sequence array - * containing in itself the last two bytes of a sector which would mean that - * multi sector transfer protection wouldn't work. As you can't protect data - * by overwriting it since you then can't get it back... - * When reading we obviously use the data from the ntfs record header. - */ -} __attribute__ ((__packed__)) MFT_RECORD_OLD; - -/* - * System defined attributes (32-bit). Each attribute type has a corresponding - * attribute name (Unicode string of maximum 64 character length) as described - * by the attribute definitions present in the data attribute of the $AttrDef - * system file. On NTFS 3.0 volumes the names are just as the types are named - * in the below defines exchanging AT_ for the dollar sign ($). 
If that is not - * a revealing choice of symbol I do not know what is... (-; - */ -enum { - AT_UNUSED = cpu_to_le32( 0), - AT_STANDARD_INFORMATION = cpu_to_le32( 0x10), - AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20), - AT_FILE_NAME = cpu_to_le32( 0x30), - AT_OBJECT_ID = cpu_to_le32( 0x40), - AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50), - AT_VOLUME_NAME = cpu_to_le32( 0x60), - AT_VOLUME_INFORMATION = cpu_to_le32( 0x70), - AT_DATA = cpu_to_le32( 0x80), - AT_INDEX_ROOT = cpu_to_le32( 0x90), - AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0), - AT_BITMAP = cpu_to_le32( 0xb0), - AT_REPARSE_POINT = cpu_to_le32( 0xc0), - AT_EA_INFORMATION = cpu_to_le32( 0xd0), - AT_EA = cpu_to_le32( 0xe0), - AT_PROPERTY_SET = cpu_to_le32( 0xf0), - AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100), - AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000), - AT_END = cpu_to_le32(0xffffffff) -}; - -typedef le32 ATTR_TYPE; - -/* - * The collation rules for sorting views/indexes/etc (32-bit). - * - * COLLATION_BINARY - Collate by binary compare where the first byte is most - * significant. - * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary - * Unicode values, except that when a character can be uppercased, the - * upper case value collates before the lower case one. - * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation - * is done very much like COLLATION_UNICODE_STRING. In fact I have no idea - * what the difference is. Perhaps the difference is that file names - * would treat some special characters in an odd way (see - * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[] - * for what I mean but COLLATION_UNICODE_STRING would not give any special - * treatment to any characters at all, but this is speculation. - * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key - * values. E.g. used for $SII index in FILE_Secure, which sorts by - * security_id (le32). 
- * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values. - * E.g. used for $O index in FILE_Extend/$Quota. - * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash - * values and second by ascending security_id values. E.g. used for $SDH - * index in FILE_Secure. - * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending - * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which - * sorts by object_id (16-byte), by splitting up the object_id in four - * le32 values and using them as individual keys. E.g. take the following - * two security_ids, stored as follows on disk: - * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59 - * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45 - * To compare them, they are split into four le32 values each, like so: - * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081 - * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179 - * Now, it is apparent why the 2nd object_id collates after the 1st: the - * first le32 value of the 1st object_id is less than the first le32 of - * the 2nd object_id. If the first le32 values of both object_ids were - * equal then the second le32 values would be compared, etc. - */ -enum { - COLLATION_BINARY = cpu_to_le32(0x00), - COLLATION_FILE_NAME = cpu_to_le32(0x01), - COLLATION_UNICODE_STRING = cpu_to_le32(0x02), - COLLATION_NTOFS_ULONG = cpu_to_le32(0x10), - COLLATION_NTOFS_SID = cpu_to_le32(0x11), - COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12), - COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13), -}; - -typedef le32 COLLATION_RULE; - -/* - * The flags (32-bit) describing attribute properties in the attribute - * definition structure. FIXME: This information is based on Regis's - * information and, according to him, it is not certain and probably - * incomplete. The INDEXABLE flag is fairly certainly correct as only the file - * name attribute has this flag set and this is the only attribute indexed in - * NT4. 
- */ -enum { - ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be - indexed. */ - ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type - can be present multiple times in the - mft records of an inode. */ - ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value - must contain at least one non-zero - byte. */ - ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be - indexed and the attribute value must be - unique for the attribute type in all of - the mft records of an inode. */ - ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be - named and the name must be unique for - the attribute type in all of the mft - records of an inode. */ - ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be - resident. */ - ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log - modifications to this attribute, - regardless of whether it is resident or - non-resident. Without this, only log - modifications if the attribute is - resident. */ -}; - -typedef le32 ATTR_DEF_FLAGS; - -/* - * The data attribute of FILE_AttrDef contains a sequence of attribute - * definitions for the NTFS volume. With this, it is supposed to be safe for an - * older NTFS driver to mount a volume containing a newer NTFS version without - * damaging it (that's the theory. In practice it's: not damaging it too much). - * Entries are sorted by attribute type. The flags describe whether the - * attribute can be resident/non-resident and possibly other things, but the - * actual bits are unknown. - */ -typedef struct { -/*hex ofs*/ -/* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero - terminated. */ -/* 80*/ ATTR_TYPE type; /* Type of the attribute. */ -/* 84*/ le32 display_rule; /* Default display rule. - FIXME: What does it mean? (AIA) */ -/* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */ -/* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */ -/* 90*/ sle64 min_size; /* Optional minimum attribute size. 
*/ -/* 98*/ sle64 max_size; /* Maximum size of attribute. */ -/* sizeof() = 0xa0 or 160 bytes */ -} __attribute__ ((__packed__)) ATTR_DEF; - -/* - * Attribute flags (16-bit). - */ -enum { - ATTR_IS_COMPRESSED = cpu_to_le16(0x0001), - ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method - mask. Also, first - illegal value. */ - ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000), - ATTR_IS_SPARSE = cpu_to_le16(0x8000), -} __attribute__ ((__packed__)); - -typedef le16 ATTR_FLAGS; - -/* - * Attribute compression. - * - * Only the data attribute is ever compressed in the current ntfs driver in - * Windows. Further, compression is only applied when the data attribute is - * non-resident. Finally, to use compression, the maximum allowed cluster size - * on a volume is 4kib. - * - * The compression method is based on independently compressing blocks of X - * clusters, where X is determined from the compression_unit value found in the - * non-resident attribute record header (more precisely: X = 2^compression_unit - * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4). - * - * There are three different cases of how a compression block of X clusters - * can be stored: - * - * 1) The data in the block is all zero (a sparse block): - * This is stored as a sparse block in the runlist, i.e. the runlist - * entry has length = X and lcn = -1. The mapping pairs array actually - * uses a delta_lcn value length of 0, i.e. delta_lcn is not present at - * all, which is then interpreted by the driver as lcn = -1. - * NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then - * the same principles apply as above, except that the length is not - * restricted to being any particular value. - * - * 2) The data in the block is not compressed: - * This happens when compression doesn't reduce the size of the block - * in clusters. I.e. 
if compression has a small effect so that the - * compressed data still occupies X clusters, then the uncompressed data - * is stored in the block. - * This case is recognised by the fact that the runlist entry has - * length = X and lcn >= 0. The mapping pairs array stores this as - * normal with a run length of X and some specific delta_lcn, i.e. - * delta_lcn has to be present. - * - * 3) The data in the block is compressed: - * The common case. This case is recognised by the fact that the run - * list entry has length L < X and lcn >= 0. The mapping pairs array - * stores this as normal with a run length of X and some specific - * delta_lcn, i.e. delta_lcn has to be present. This runlist entry is - * immediately followed by a sparse entry with length = X - L and - * lcn = -1. The latter entry is to make up the vcn counting to the - * full compression block size X. - * - * In fact, life is more complicated because adjacent entries of the same type - * can be coalesced. This means that one has to keep track of the number of - * clusters handled and work on a basis of X clusters at a time being one - * block. An example: if length L > X this means that this particular runlist - * entry contains a block of length X and part of one or more blocks of length - * L - X. Another example: if length L < X, this does not necessarily mean that - * the block is compressed as it might be that the lcn changes inside the block - * and hence the following runlist entry describes the continuation of the - * potentially compressed block. The block would be compressed if the - * following runlist entry describes at least X - L sparse clusters, thus - * making up the compression block length as described in point 3 above. (Of - * course, there can be several runlist entries with small lengths so that the - * sparse entry does not follow the first data containing entry with - * length < X.) 
- * - * NOTE: At the end of the compressed attribute value, there most likely is not - * just the right amount of data to make up a compression block, thus this data - * is not even attempted to be compressed. It is just stored as is, unless - * the number of clusters it occupies is reduced when compressed in which case - * it is stored as a compressed compression block, complete with sparse - * clusters at the end. - */ - -/* - * Flags of resident attributes (8-bit). - */ -enum { - RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index - (has implications for deleting and - modifying the attribute). */ -} __attribute__ ((__packed__)); - -typedef u8 RESIDENT_ATTR_FLAGS; - -/* - * Attribute record header. Always aligned to 8-byte boundary. - */ -typedef struct { -/*Ofs*/ -/* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */ -/* 4*/ le32 length; /* Byte size of the resident part of the - attribute (aligned to 8-byte boundary). - Used to get to the next attribute. */ -/* 8*/ u8 non_resident; /* If 0, attribute is resident. - If 1, attribute is non-resident. */ -/* 9*/ u8 name_length; /* Unicode character size of name of attribute. - 0 if unnamed. */ -/* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the - beginning of the name from the attribute - record. Note that the name is stored as a - Unicode string. When creating, place offset - just at the end of the record header. Then, - follow with attribute value or mapping pairs - array, resident and non-resident attributes - respectively, aligning to an 8-byte - boundary. */ -/* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */ -/* 14*/ le16 instance; /* The instance of this attribute record. This - number is unique within this mft record (see - MFT_RECORD/next_attribute_instance notes in - mft.h for more details). */ -/* 16*/ union { - /* Resident attributes. */ - struct { -/* 16 */ le32 value_length;/* Byte size of attribute value. 
*/ -/* 20 */ le16 value_offset;/* Byte offset of the attribute - value from the start of the - attribute record. When creating, - align to 8-byte boundary if we - have a name present as this might - not have a length of a multiple - of 8-bytes. */ -/* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */ -/* 23 */ s8 reserved; /* Reserved/alignment to 8-byte - boundary. */ - } __attribute__ ((__packed__)) resident; - /* Non-resident attributes. */ - struct { -/* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number - for this portion of the attribute value or - 0 if this is the only extent (usually the - case). - Only when an attribute list is used - does lowest_vcn != 0 ever occur. */ -/* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of - the attribute value. - Usually there is only one - portion, so this usually equals the attribute - value size in clusters minus 1. Can be -1 for - zero length files. Can be 0 for "single extent" - attributes. */ -/* 32*/ le16 mapping_pairs_offset; /* Byte offset from the - beginning of the structure to the mapping pairs - array which contains the mappings between the - vcns and the logical cluster numbers (lcns). - When creating, place this at the end of this - record header aligned to 8-byte boundary. */ -/* 34*/ u8 compression_unit; /* The compression unit expressed - as the log to the base 2 of the number of - clusters in a compression unit. 0 means not - compressed. (This effectively limits the - compression unit size to be a power of two - clusters.) WinNT4 only uses a value of 4. - Sparse files have this set to 0 on XPSP2. */ -/* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */ -/* The sizes below are only used when lowest_vcn is zero, as otherwise it would - be difficult to keep them up-to-date.*/ -/* 40*/ sle64 allocated_size; /* Byte size of disk space - allocated to hold the attribute value. Always - is a multiple of the cluster size. 
When a file - is compressed, this field is a multiple of the - compression block size (2^compression_unit) and - it represents the logically allocated space - rather than the actual on disk usage. For this - use the compressed_size (see below). */ -/* 48*/ sle64 data_size; /* Byte size of the attribute - value. Can be larger than allocated_size if - attribute value is compressed or sparse. */ -/* 56*/ sle64 initialized_size; /* Byte size of initialized - portion of the attribute value. Usually equals - data_size. */ -/* sizeof(uncompressed attr) = 64*/ -/* 64*/ sle64 compressed_size; /* Byte size of the attribute - value after compression. Only present when - compressed or sparse. Always is a multiple of - the cluster size. Represents the actual amount - of disk space being used on the disk. */ -/* sizeof(compressed attr) = 72*/ - } __attribute__ ((__packed__)) non_resident; - } __attribute__ ((__packed__)) data; -} __attribute__ ((__packed__)) ATTR_RECORD; - -typedef ATTR_RECORD ATTR_REC; - -/* - * File attribute flags (32-bit) appearing in the file_attributes fields of the - * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR - * attributes of MFT_RECORDs and directory index entries. - * - * All of the below flags appear in the directory index entries but only some - * appear in the STANDARD_INFORMATION attribute whilst only some others appear - * in the FILENAME_ATTR attribute of MFT_RECORDs. Unless otherwise stated the - * flags appear in all of the above. - */ -enum { - FILE_ATTR_READONLY = cpu_to_le32(0x00000001), - FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002), - FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004), - /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */ - - FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010), - /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is - reserved for the DOS SUBDIRECTORY flag. 
*/ - FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020), - FILE_ATTR_DEVICE = cpu_to_le32(0x00000040), - FILE_ATTR_NORMAL = cpu_to_le32(0x00000080), - - FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100), - FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200), - FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400), - FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800), - - FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000), - FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), - FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000), - - FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7), - /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the - FILE_ATTR_DEVICE and preserves everything else. This mask is used - to obtain all flags that are valid for reading. */ - FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7), - /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the - F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, - F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask - is used to obtain all flags that are valid for setting. */ - /* - * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all - * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION - * attribute of an mft record. - */ - FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000), - /* Note, this is a copy of the corresponding bit from the mft record, - telling us whether this is a directory or not, i.e. whether it has - an index root attribute or not. */ - FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000), - /* Note, this is a copy of the corresponding bit from the mft record, - telling us whether this file has a view index present (eg. object id - index, quota index, one of the security indexes or the encrypting - filesystem related indexes). */ -}; - -typedef le32 FILE_ATTR_FLAGS; - -/* - * NOTE on times in NTFS: All times are in MS standard time format, i.e. 
they - * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00 - * universal coordinated time (UTC). (In Linux time starts 1st January 1970, - * 00:00:00 UTC and is stored as the number of 1-second intervals since then.) - */ - -/* - * Attribute: Standard information (0x10). - * - * NOTE: Always resident. - * NOTE: Present in all base file records on a volume. - * NOTE: There is conflicting information about the meaning of each of the time - * fields but the meaning as defined below has been verified to be - * correct by practical experimentation on Windows NT4 SP6a and is hence - * assumed to be the one and only correct interpretation. - */ -typedef struct { -/*Ofs*/ -/* 0*/ sle64 creation_time; /* Time file was created. Updated when - a filename is changed(?). */ -/* 8*/ sle64 last_data_change_time; /* Time the data attribute was last - modified. */ -/* 16*/ sle64 last_mft_change_time; /* Time this mft record was last - modified. */ -/* 24*/ sle64 last_access_time; /* Approximate time when the file was - last accessed (obviously this is not - updated on read-only volumes). In - Windows this is only updated when - accessed if some time delta has - passed since the last update. Also, - last access time updates can be - disabled altogether for speed. */ -/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ -/* 36*/ union { - /* NTFS 1.2 */ - struct { - /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte - boundary. */ - } __attribute__ ((__packed__)) v1; - /* sizeof() = 48 bytes */ - /* NTFS 3.x */ - struct { -/* - * If a volume has been upgraded from a previous NTFS version, then these - * fields are present only if the file has been accessed since the upgrade. - * Recognize the difference by comparing the length of the resident attribute - * value. If it is 48, then the following fields are missing. If it is 72 then - * the fields are present. 
Maybe just check like this: - * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) { - * Assume NTFS 1.2- format. - * If (volume version is 3.x) - * Upgrade attribute to NTFS 3.x format. - * else - * Use NTFS 1.2- format for access. - * } else - * Use NTFS 3.x format for access. - * Only problem is that it might be legal to set the length of the value to - * arbitrarily large values thus spoiling this check. - But chkdsk probably - * views that as a corruption, assuming that it behaves like this for all - * attributes. - */ - /* 36*/ le32 maximum_versions; /* Maximum allowed versions for - file. Zero if version numbering is disabled. */ - /* 40*/ le32 version_number; /* This file's version (if any). - Set to zero if maximum_versions is zero. */ - /* 44*/ le32 class_id; /* Class id from bidirectional - class id index (?). */ - /* 48*/ le32 owner_id; /* Owner_id of the user owning - the file. Translate via $Q index in FILE_Extend - /$Quota to the quota control entry for the user - owning the file. Zero if quotas are disabled. */ - /* 52*/ le32 security_id; /* Security_id for the file. - Translate via $SII index and $SDS data stream - in FILE_Secure to the security descriptor. */ - /* 56*/ le64 quota_charged; /* Byte size of the charge to - the quota for all streams of the file. Note: Is - zero if quotas are disabled. */ - /* 64*/ leUSN usn; /* Last update sequence number - of the file. This is a direct index into the - transaction log file ($UsnJrnl). It is zero if - the usn journal is disabled or this file has - not been subject to logging yet. See usnjrnl.h - for details. */ - } __attribute__ ((__packed__)) v3; - /* sizeof() = 72 bytes (NTFS 3.x) */ - } __attribute__ ((__packed__)) ver; -} __attribute__ ((__packed__)) STANDARD_INFORMATION; - -/* - * Attribute: Attribute list (0x20). - * - * - Can be either resident or non-resident. - * - Value consists of a sequence of variable length, 8-byte aligned, - * ATTR_LIST_ENTRY records. 
- * - The list is not terminated by anything at all! The only way to know when - * the end is reached is to keep track of the current offset and compare it to - * the attribute value size. - * - The attribute list attribute contains one entry for each attribute of - * the file in which the list is located, except for the list attribute - * itself. The list is sorted: first by attribute type, second by attribute - * name (if present), third by instance number. The extents of one - * non-resident attribute (if present) immediately follow after the initial - * extent. They are ordered by lowest_vcn and have their instace set to zero. - * It is not allowed to have two attributes with all sorting keys equal. - * - Further restrictions: - * - If not resident, the vcn to lcn mapping array has to fit inside the - * base mft record. - * - The attribute list attribute value has a maximum size of 256kb. This - * is imposed by the Windows cache manager. - * - Attribute lists are only used when the attributes of mft record do not - * fit inside the mft record despite all attributes (that can be made - * non-resident) having been made non-resident. This can happen e.g. when: - * - File has a large number of hard links (lots of file name - * attributes present). - * - The mapping pairs array of some non-resident attribute becomes so - * large due to fragmentation that it overflows the mft record. - * - The security descriptor is very complex (not applicable to - * NTFS 3.0 volumes). - * - There are many named streams. - */ -typedef struct { -/*Ofs*/ -/* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */ -/* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */ -/* 6*/ u8 name_length; /* Size in Unicode chars of the name of the - attribute or 0 if unnamed. */ -/* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name - (always set this to where the name would - start even if unnamed). 
*/ -/* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion - of the attribute value. This is usually 0. It - is non-zero for the case where one attribute - does not fit into one mft record and thus - several mft records are allocated to hold - this attribute. In the latter case, each mft - record holds one extent of the attribute and - there is one attribute list entry for each - extent. NOTE: This is DEFINITELY a signed - value! The windows driver uses cmp, followed - by jg when comparing this, thus it treats it - as signed. */ -/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding - the ATTR_RECORD for this portion of the - attribute value. */ -/* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the - attribute being referenced; otherwise 0. */ -/* 26*/ ntfschar name[0]; /* Use when creating only. When reading use - name_offset to determine the location of the - name. */ -/* sizeof() = 26 + (attribute_name_length * 2) bytes */ -} __attribute__ ((__packed__)) ATTR_LIST_ENTRY; - -/* - * The maximum allowed length for a file name. - */ -#define MAXIMUM_FILE_NAME_LENGTH 255 - -/* - * Possible namespaces for filenames in ntfs (8-bit). - */ -enum { - FILE_NAME_POSIX = 0x00, - /* This is the largest namespace. It is case sensitive and allows all - Unicode characters except for: '\0' and '/'. Beware that in - WinNT/2k/2003 by default files which eg have the same name except - for their case will not be distinguished by the standard utilities - and thus a "del filename" will delete both "filename" and "fileName" - without warning. However if for example Services For Unix (SFU) are - installed and the case sensitive option was enabled at installation - time, then you can create/access/delete such files. - Note that even SFU places restrictions on the filenames beyond the - '\0' and '/' and in particular the following set of characters is - not allowed: '"', '/', '<', '>', '\'. 
All other characters, - including the ones no allowed in WIN32 namespace are allowed. - Tested with SFU 3.5 (this is now free) running on Windows XP. */ - FILE_NAME_WIN32 = 0x01, - /* The standard WinNT/2k NTFS long filenames. Case insensitive. All - Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\', - and '|'. Further, names cannot end with a '.' or a space. */ - FILE_NAME_DOS = 0x02, - /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit - characters greater space, except: '"', '*', '+', ',', '/', ':', ';', - '<', '=', '>', '?', and '\'. */ - FILE_NAME_WIN32_AND_DOS = 0x03, - /* 3 means that both the Win32 and the DOS filenames are identical and - hence have been saved in this single filename record. */ -} __attribute__ ((__packed__)); - -typedef u8 FILE_NAME_TYPE_FLAGS; - -/* - * Attribute: Filename (0x30). - * - * NOTE: Always resident. - * NOTE: All fields, except the parent_directory, are only updated when the - * filename is changed. Until then, they just become out of sync with - * reality and the more up to date values are present in the standard - * information attribute. - * NOTE: There is conflicting information about the meaning of each of the time - * fields but the meaning as defined below has been verified to be - * correct by practical experimentation on Windows NT4 SP6a and is hence - * assumed to be the one and only correct interpretation. - */ -typedef struct { -/*hex ofs*/ -/* 0*/ leMFT_REF parent_directory; /* Directory this filename is - referenced from. */ -/* 8*/ sle64 creation_time; /* Time file was created. */ -/* 10*/ sle64 last_data_change_time; /* Time the data attribute was last - modified. */ -/* 18*/ sle64 last_mft_change_time; /* Time this mft record was last - modified. */ -/* 20*/ sle64 last_access_time; /* Time this mft record was last - accessed. */ -/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space - for the unnamed data attribute. 
So - for normal $DATA, this is the - allocated_size from the unnamed - $DATA attribute and for compressed - and/or sparse $DATA, this is the - compressed_size from the unnamed - $DATA attribute. For a directory or - other inode without an unnamed $DATA - attribute, this is always 0. NOTE: - This is a multiple of the cluster - size. */ -/* 30*/ sle64 data_size; /* Byte size of actual data in unnamed - data attribute. For a directory or - other inode without an unnamed $DATA - attribute, this is always 0. */ -/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ -/* 3c*/ union { - /* 3c*/ struct { - /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to - pack the extended attributes - (EAs), if such are present.*/ - /* 3e*/ le16 reserved; /* Reserved for alignment. */ - } __attribute__ ((__packed__)) ea; - /* 3c*/ struct { - /* 3c*/ le32 reparse_point_tag; /* Type of reparse point, - present only in reparse - points and only if there are - no EAs. */ - } __attribute__ ((__packed__)) rp; - } __attribute__ ((__packed__)) type; -/* 40*/ u8 file_name_length; /* Length of file name in - (Unicode) characters. */ -/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/ -/* 42*/ ntfschar file_name[0]; /* File name in Unicode. */ -} __attribute__ ((__packed__)) FILE_NAME_ATTR; - -/* - * GUID structures store globally unique identifiers (GUID). A GUID is a - * 128-bit value consisting of one group of eight hexadecimal digits, followed - * by three groups of four hexadecimal digits each, followed by one group of - * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the - * distributed computing environment (DCE) universally unique identifier (UUID). - * Example of a GUID: - * 1F010768-5A73-BC91-0010A52216A7 - */ -typedef struct { - le32 data1; /* The first eight hexadecimal digits of the GUID. */ - le16 data2; /* The first group of four hexadecimal digits. 
*/ - le16 data3; /* The second group of four hexadecimal digits. */ - u8 data4[8]; /* The first two bytes are the third group of four - hexadecimal digits. The remaining six bytes are the - final 12 hexadecimal digits. */ -} __attribute__ ((__packed__)) GUID; - -/* - * FILE_Extend/$ObjId contains an index named $O. This index contains all - * object_ids present on the volume as the index keys and the corresponding - * mft_record numbers as the index entry data parts. The data part (defined - * below) also contains three other object_ids: - * birth_volume_id - object_id of FILE_Volume on which the file was first - * created. Optional (i.e. can be zero). - * birth_object_id - object_id of file when it was first created. Usually - * equals the object_id. Optional (i.e. can be zero). - * domain_id - Reserved (always zero). - */ -typedef struct { - leMFT_REF mft_reference;/* Mft record containing the object_id in - the index entry key. */ - union { - struct { - GUID birth_volume_id; - GUID birth_object_id; - GUID domain_id; - } __attribute__ ((__packed__)) origin; - u8 extended_info[48]; - } __attribute__ ((__packed__)) opt; -} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA; - -/* - * Attribute: Object id (NTFS 3.0+) (0x40). - * - * NOTE: Always resident. - */ -typedef struct { - GUID object_id; /* Unique id assigned to the - file.*/ - /* The following fields are optional. The attribute value size is 16 - bytes, i.e. sizeof(GUID), if these are not present at all. Note, - the entries can be present but one or more (or all) can be zero - meaning that that particular value(s) is(are) not defined. */ - union { - struct { - GUID birth_volume_id; /* Unique id of volume on which - the file was first created.*/ - GUID birth_object_id; /* Unique id of file when it was - first created. */ - GUID domain_id; /* Reserved, zero. 
*/ - } __attribute__ ((__packed__)) origin; - u8 extended_info[48]; - } __attribute__ ((__packed__)) opt; -} __attribute__ ((__packed__)) OBJECT_ID_ATTR; - -/* - * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in - * the SID structure (see below). - */ -//typedef enum { /* SID string prefix. */ -// SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */ -// SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */ -// SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */ -// SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */ -// SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */ -// SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */ -//} IDENTIFIER_AUTHORITIES; - -/* - * These relative identifiers (RIDs) are used with the above identifier - * authorities to make up universal well-known SIDs. - * - * Note: The relative identifier (RID) refers to the portion of a SID, which - * identifies a user or group in relation to the authority that issued the SID. - * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is - * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and - * the relative identifier SECURITY_CREATOR_OWNER_RID (0). - */ -typedef enum { /* Identifier authority. 
*/ - SECURITY_NULL_RID = 0, /* S-1-0 */ - SECURITY_WORLD_RID = 0, /* S-1-1 */ - SECURITY_LOCAL_RID = 0, /* S-1-2 */ - - SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */ - SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */ - - SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */ - SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */ - - SECURITY_DIALUP_RID = 1, - SECURITY_NETWORK_RID = 2, - SECURITY_BATCH_RID = 3, - SECURITY_INTERACTIVE_RID = 4, - SECURITY_SERVICE_RID = 6, - SECURITY_ANONYMOUS_LOGON_RID = 7, - SECURITY_PROXY_RID = 8, - SECURITY_ENTERPRISE_CONTROLLERS_RID=9, - SECURITY_SERVER_LOGON_RID = 9, - SECURITY_PRINCIPAL_SELF_RID = 0xa, - SECURITY_AUTHENTICATED_USER_RID = 0xb, - SECURITY_RESTRICTED_CODE_RID = 0xc, - SECURITY_TERMINAL_SERVER_RID = 0xd, - - SECURITY_LOGON_IDS_RID = 5, - SECURITY_LOGON_IDS_RID_COUNT = 3, - - SECURITY_LOCAL_SYSTEM_RID = 0x12, - - SECURITY_NT_NON_UNIQUE = 0x15, - - SECURITY_BUILTIN_DOMAIN_RID = 0x20, - - /* - * Well-known domain relative sub-authority values (RIDs). - */ - - /* Users. */ - DOMAIN_USER_RID_ADMIN = 0x1f4, - DOMAIN_USER_RID_GUEST = 0x1f5, - DOMAIN_USER_RID_KRBTGT = 0x1f6, - - /* Groups. */ - DOMAIN_GROUP_RID_ADMINS = 0x200, - DOMAIN_GROUP_RID_USERS = 0x201, - DOMAIN_GROUP_RID_GUESTS = 0x202, - DOMAIN_GROUP_RID_COMPUTERS = 0x203, - DOMAIN_GROUP_RID_CONTROLLERS = 0x204, - DOMAIN_GROUP_RID_CERT_ADMINS = 0x205, - DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206, - DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207, - DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208, - - /* Aliases. 
*/ - DOMAIN_ALIAS_RID_ADMINS = 0x220, - DOMAIN_ALIAS_RID_USERS = 0x221, - DOMAIN_ALIAS_RID_GUESTS = 0x222, - DOMAIN_ALIAS_RID_POWER_USERS = 0x223, - - DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224, - DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225, - DOMAIN_ALIAS_RID_PRINT_OPS = 0x226, - DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227, - - DOMAIN_ALIAS_RID_REPLICATOR = 0x228, - DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229, - DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a, -} RELATIVE_IDENTIFIERS; - -/* - * The universal well-known SIDs: - * - * NULL_SID S-1-0-0 - * WORLD_SID S-1-1-0 - * LOCAL_SID S-1-2-0 - * CREATOR_OWNER_SID S-1-3-0 - * CREATOR_GROUP_SID S-1-3-1 - * CREATOR_OWNER_SERVER_SID S-1-3-2 - * CREATOR_GROUP_SERVER_SID S-1-3-3 - * - * (Non-unique IDs) S-1-4 - * - * NT well-known SIDs: - * - * NT_AUTHORITY_SID S-1-5 - * DIALUP_SID S-1-5-1 - * - * NETWORD_SID S-1-5-2 - * BATCH_SID S-1-5-3 - * INTERACTIVE_SID S-1-5-4 - * SERVICE_SID S-1-5-6 - * ANONYMOUS_LOGON_SID S-1-5-7 (aka null logon session) - * PROXY_SID S-1-5-8 - * SERVER_LOGON_SID S-1-5-9 (aka domain controller account) - * SELF_SID S-1-5-10 (self RID) - * AUTHENTICATED_USER_SID S-1-5-11 - * RESTRICTED_CODE_SID S-1-5-12 (running restricted code) - * TERMINAL_SERVER_SID S-1-5-13 (running on terminal server) - * - * (Logon IDs) S-1-5-5-X-Y - * - * (NT non-unique IDs) S-1-5-0x15-... - * - * (Built-in domain) S-1-5-0x20 - */ - -/* - * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure. - * - * NOTE: This is stored as a big endian number, hence the high_part comes - * before the low_part. - */ -typedef union { - struct { - u16 high_part; /* High 16-bits. */ - u32 low_part; /* Low 32-bits. */ - } __attribute__ ((__packed__)) parts; - u8 value[6]; /* Value as individual bytes. */ -} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY; - -/* - * The SID structure is a variable-length structure used to uniquely identify - * users or groups. SID stands for security identifier. 
- * - * The standard textual representation of the SID is of the form: - * S-R-I-S-S... - * Where: - * - The first "S" is the literal character 'S' identifying the following - * digits as a SID. - * - R is the revision level of the SID expressed as a sequence of digits - * either in decimal or hexadecimal (if the later, prefixed by "0x"). - * - I is the 48-bit identifier_authority, expressed as digits as R above. - * - S... is one or more sub_authority values, expressed as digits as above. - * - * Example SID; the domain-relative SID of the local Administrators group on - * Windows NT/2k: - * S-1-5-32-544 - * This translates to a SID with: - * revision = 1, - * sub_authority_count = 2, - * identifier_authority = {0,0,0,0,0,5}, // SECURITY_NT_AUTHORITY - * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID - * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS - */ -typedef struct { - u8 revision; - u8 sub_authority_count; - SID_IDENTIFIER_AUTHORITY identifier_authority; - le32 sub_authority[1]; /* At least one sub_authority. */ -} __attribute__ ((__packed__)) SID; - -/* - * Current constants for SIDs. - */ -typedef enum { - SID_REVISION = 1, /* Current revision level. */ - SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */ - SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in - a future revision. */ -} SID_CONSTANTS; - -/* - * The predefined ACE types (8-bit, see below). - */ -enum { - ACCESS_MIN_MS_ACE_TYPE = 0, - ACCESS_ALLOWED_ACE_TYPE = 0, - ACCESS_DENIED_ACE_TYPE = 1, - SYSTEM_AUDIT_ACE_TYPE = 2, - SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */ - ACCESS_MAX_MS_V2_ACE_TYPE = 3, - - ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4, - ACCESS_MAX_MS_V3_ACE_TYPE = 4, - - /* The following are Win2k only. 
*/ - ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5, - ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5, - ACCESS_DENIED_OBJECT_ACE_TYPE = 6, - SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7, - SYSTEM_ALARM_OBJECT_ACE_TYPE = 8, - ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8, - - ACCESS_MAX_MS_V4_ACE_TYPE = 8, - - /* This one is for WinNT/2k. */ - ACCESS_MAX_MS_ACE_TYPE = 8, -} __attribute__ ((__packed__)); - -typedef u8 ACE_TYPES; - -/* - * The ACE flags (8-bit) for audit and inheritance (see below). - * - * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE - * types to indicate that a message is generated (in Windows!) for successful - * accesses. - * - * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types - * to indicate that a message is generated (in Windows!) for failed accesses. - */ -enum { - /* The inheritance flags. */ - OBJECT_INHERIT_ACE = 0x01, - CONTAINER_INHERIT_ACE = 0x02, - NO_PROPAGATE_INHERIT_ACE = 0x04, - INHERIT_ONLY_ACE = 0x08, - INHERITED_ACE = 0x10, /* Win2k only. */ - VALID_INHERIT_FLAGS = 0x1f, - - /* The audit flags. */ - SUCCESSFUL_ACCESS_ACE_FLAG = 0x40, - FAILED_ACCESS_ACE_FLAG = 0x80, -} __attribute__ ((__packed__)); - -typedef u8 ACE_FLAGS; - -/* - * An ACE is an access-control entry in an access-control list (ACL). - * An ACE defines access to an object for a specific user or group or defines - * the types of access that generate system-administration messages or alarms - * for a specific user or group. The user or group is identified by a security - * identifier (SID). - * - * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary), - * which specifies the type and size of the ACE. The format of the subsequent - * data depends on the ACE type. - */ -typedef struct { -/*Ofs*/ -/* 0*/ ACE_TYPES type; /* Type of the ACE. */ -/* 1*/ ACE_FLAGS flags; /* Flags describing the ACE. */ -/* 2*/ le16 size; /* Size in bytes of the ACE. */ -} __attribute__ ((__packed__)) ACE_HEADER; - -/* - * The access mask (32-bit). 
Defines the access rights. - * - * The specific rights (bits 0 to 15). These depend on the type of the object - * being secured by the ACE. - */ -enum { - /* Specific rights for files and directories are as follows: */ - - /* Right to read data from the file. (FILE) */ - FILE_READ_DATA = cpu_to_le32(0x00000001), - /* Right to list contents of a directory. (DIRECTORY) */ - FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001), - - /* Right to write data to the file. (FILE) */ - FILE_WRITE_DATA = cpu_to_le32(0x00000002), - /* Right to create a file in the directory. (DIRECTORY) */ - FILE_ADD_FILE = cpu_to_le32(0x00000002), - - /* Right to append data to the file. (FILE) */ - FILE_APPEND_DATA = cpu_to_le32(0x00000004), - /* Right to create a subdirectory. (DIRECTORY) */ - FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004), - - /* Right to read extended attributes. (FILE/DIRECTORY) */ - FILE_READ_EA = cpu_to_le32(0x00000008), - - /* Right to write extended attributes. (FILE/DIRECTORY) */ - FILE_WRITE_EA = cpu_to_le32(0x00000010), - - /* Right to execute a file. (FILE) */ - FILE_EXECUTE = cpu_to_le32(0x00000020), - /* Right to traverse the directory. (DIRECTORY) */ - FILE_TRAVERSE = cpu_to_le32(0x00000020), - - /* - * Right to delete a directory and all the files it contains (its - * children), even if the files are read-only. (DIRECTORY) - */ - FILE_DELETE_CHILD = cpu_to_le32(0x00000040), - - /* Right to read file attributes. (FILE/DIRECTORY) */ - FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080), - - /* Right to change file attributes. (FILE/DIRECTORY) */ - FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100), - - /* - * The standard rights (bits 16 to 23). These are independent of the - * type of object being secured. - */ - - /* Right to delete the object. */ - DELETE = cpu_to_le32(0x00010000), - - /* - * Right to read the information in the object's security descriptor, - * not including the information in the SACL, i.e. right to read the - * security descriptor and owner. 
- */ - READ_CONTROL = cpu_to_le32(0x00020000), - - /* Right to modify the DACL in the object's security descriptor. */ - WRITE_DAC = cpu_to_le32(0x00040000), - - /* Right to change the owner in the object's security descriptor. */ - WRITE_OWNER = cpu_to_le32(0x00080000), - - /* - * Right to use the object for synchronization. Enables a process to - * wait until the object is in the signalled state. Some object types - * do not support this access right. - */ - SYNCHRONIZE = cpu_to_le32(0x00100000), - - /* - * The following STANDARD_RIGHTS_* are combinations of the above for - * convenience and are defined by the Win32 API. - */ - - /* These are currently defined to READ_CONTROL. */ - STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000), - STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000), - STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000), - - /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */ - STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000), - - /* - * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and - * SYNCHRONIZE access. - */ - STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000), - - /* - * The access system ACL and maximum allowed access types (bits 24 to - * 25, bits 26 to 27 are reserved). - */ - ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000), - MAXIMUM_ALLOWED = cpu_to_le32(0x02000000), - - /* - * The generic rights (bits 28 to 31). These map onto the standard and - * specific rights. - */ - - /* Read, write, and execute access. */ - GENERIC_ALL = cpu_to_le32(0x10000000), - - /* Execute access. */ - GENERIC_EXECUTE = cpu_to_le32(0x20000000), - - /* - * Write access. For files, this maps onto: - * FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA | - * FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE - * For directories, the mapping has the same numerical value. See - * above for the descriptions of the rights granted. - */ - GENERIC_WRITE = cpu_to_le32(0x40000000), - - /* - * Read access. 
For files, this maps onto: - * FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA | - * STANDARD_RIGHTS_READ | SYNCHRONIZE - * For directories, the mapping has the same numberical value. See - * above for the descriptions of the rights granted. - */ - GENERIC_READ = cpu_to_le32(0x80000000), -}; - -typedef le32 ACCESS_MASK; - -/* - * The generic mapping array. Used to denote the mapping of each generic - * access right to a specific access mask. - * - * FIXME: What exactly is this and what is it for? (AIA) - */ -typedef struct { - ACCESS_MASK generic_read; - ACCESS_MASK generic_write; - ACCESS_MASK generic_execute; - ACCESS_MASK generic_all; -} __attribute__ ((__packed__)) GENERIC_MAPPING; - -/* - * The predefined ACE type structures are as defined below. - */ - -/* - * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE - */ -typedef struct { -/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ - ACE_TYPES type; /* Type of the ACE. */ - ACE_FLAGS flags; /* Flags describing the ACE. */ - le16 size; /* Size in bytes of the ACE. */ -/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ - -/* 8*/ SID sid; /* The SID associated with the ACE. */ -} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, - SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE; - -/* - * The object ACE flags (32-bit). - */ -enum { - ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1), - ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2), -}; - -typedef le32 OBJECT_ACE_FLAGS; - -typedef struct { -/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ - ACE_TYPES type; /* Type of the ACE. */ - ACE_FLAGS flags; /* Flags describing the ACE. */ - le16 size; /* Size in bytes of the ACE. */ -/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ - -/* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. 
*/ -/* 12*/ GUID object_type; -/* 28*/ GUID inherited_object_type; - -/* 44*/ SID sid; /* The SID associated with the ACE. */ -} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE, - ACCESS_DENIED_OBJECT_ACE, - SYSTEM_AUDIT_OBJECT_ACE, - SYSTEM_ALARM_OBJECT_ACE; - -/* - * An ACL is an access-control list (ACL). - * An ACL starts with an ACL header structure, which specifies the size of - * the ACL and the number of ACEs it contains. The ACL header is followed by - * zero or more access control entries (ACEs). The ACL as well as each ACE - * are aligned on 4-byte boundaries. - */ -typedef struct { - u8 revision; /* Revision of this ACL. */ - u8 alignment1; - le16 size; /* Allocated space in bytes for ACL. Includes this - header, the ACEs and the remaining free space. */ - le16 ace_count; /* Number of ACEs in the ACL. */ - le16 alignment2; -/* sizeof() = 8 bytes */ -} __attribute__ ((__packed__)) ACL; - -/* - * Current constants for ACLs. - */ -typedef enum { - /* Current revision. */ - ACL_REVISION = 2, - ACL_REVISION_DS = 4, - - /* History of revisions. */ - ACL_REVISION1 = 1, - MIN_ACL_REVISION = 2, - ACL_REVISION2 = 2, - ACL_REVISION3 = 3, - ACL_REVISION4 = 4, - MAX_ACL_REVISION = 4, -} ACL_CONSTANTS; - -/* - * The security descriptor control flags (16-bit). - * - * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID - * pointed to by the Owner field was provided by a defaulting mechanism - * rather than explicitly provided by the original provider of the - * security descriptor. This may affect the treatment of the SID with - * respect to inheritance of an owner. - * - * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in - * the Group field was provided by a defaulting mechanism rather than - * explicitly provided by the original provider of the security - * descriptor. This may affect the treatment of the SID with respect to - * inheritance of a primary group. 
- * - * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security - * descriptor contains a discretionary ACL. If this flag is set and the - * Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is - * explicitly being specified. - * - * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL - * pointed to by the Dacl field was provided by a defaulting mechanism - * rather than explicitly provided by the original provider of the - * security descriptor. This may affect the treatment of the ACL with - * respect to inheritance of an ACL. This flag is ignored if the - * DaclPresent flag is not set. - * - * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security - * descriptor contains a system ACL pointed to by the Sacl field. If this - * flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then - * an empty (but present) ACL is being specified. - * - * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL - * pointed to by the Sacl field was provided by a defaulting mechanism - * rather than explicitly provided by the original provider of the - * security descriptor. This may affect the treatment of the ACL with - * respect to inheritance of an ACL. This flag is ignored if the - * SaclPresent flag is not set. - * - * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security - * descriptor is in self-relative form. In this form, all fields of the - * security descriptor are contiguous in memory and all pointer fields are - * expressed as offsets from the beginning of the security descriptor. 
- */ -enum { - SE_OWNER_DEFAULTED = cpu_to_le16(0x0001), - SE_GROUP_DEFAULTED = cpu_to_le16(0x0002), - SE_DACL_PRESENT = cpu_to_le16(0x0004), - SE_DACL_DEFAULTED = cpu_to_le16(0x0008), - - SE_SACL_PRESENT = cpu_to_le16(0x0010), - SE_SACL_DEFAULTED = cpu_to_le16(0x0020), - - SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100), - SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200), - SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400), - SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800), - - SE_DACL_PROTECTED = cpu_to_le16(0x1000), - SE_SACL_PROTECTED = cpu_to_le16(0x2000), - SE_RM_CONTROL_VALID = cpu_to_le16(0x4000), - SE_SELF_RELATIVE = cpu_to_le16(0x8000) -} __attribute__ ((__packed__)); - -typedef le16 SECURITY_DESCRIPTOR_CONTROL; - -/* - * Self-relative security descriptor. Contains the owner and group SIDs as well - * as the sacl and dacl ACLs inside the security descriptor itself. - */ -typedef struct { - u8 revision; /* Revision level of the security descriptor. */ - u8 alignment; - SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of - the descriptor as well as the following fields. */ - le32 owner; /* Byte offset to a SID representing an object's - owner. If this is NULL, no owner SID is present in - the descriptor. */ - le32 group; /* Byte offset to a SID representing an object's - primary group. If this is NULL, no primary group - SID is present in the descriptor. */ - le32 sacl; /* Byte offset to a system ACL. Only valid, if - SE_SACL_PRESENT is set in the control field. If - SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL - is specified. */ - le32 dacl; /* Byte offset to a discretionary ACL. Only valid, if - SE_DACL_PRESENT is set in the control field. If - SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL - (unconditionally granting access) is specified. */ -/* sizeof() = 0x14 bytes */ -} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE; - -/* - * Absolute security descriptor. 
Does not contain the owner and group SIDs, nor - * the sacl and dacl ACLs inside the security descriptor. Instead, it contains - * pointers to these structures in memory. Obviously, absolute security - * descriptors are only useful for in memory representations of security - * descriptors. On disk, a self-relative security descriptor is used. - */ -typedef struct { - u8 revision; /* Revision level of the security descriptor. */ - u8 alignment; - SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of - the descriptor as well as the following fields. */ - SID *owner; /* Points to a SID representing an object's owner. If - this is NULL, no owner SID is present in the - descriptor. */ - SID *group; /* Points to a SID representing an object's primary - group. If this is NULL, no primary group SID is - present in the descriptor. */ - ACL *sacl; /* Points to a system ACL. Only valid, if - SE_SACL_PRESENT is set in the control field. If - SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL - is specified. */ - ACL *dacl; /* Points to a discretionary ACL. Only valid, if - SE_DACL_PRESENT is set in the control field. If - SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL - (unconditionally granting access) is specified. */ -} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR; - -/* - * Current constants for security descriptors. - */ -typedef enum { - /* Current revision. */ - SECURITY_DESCRIPTOR_REVISION = 1, - SECURITY_DESCRIPTOR_REVISION1 = 1, - - /* The sizes of both the absolute and relative security descriptors is - the same as pointers, at least on ia32 architecture are 32-bit. */ - SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR), -} SECURITY_DESCRIPTOR_CONSTANTS; - -/* - * Attribute: Security descriptor (0x50). A standard self-relative security - * descriptor. - * - * NOTE: Can be resident or non-resident. 
- * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally - * in FILE_Secure and the correct descriptor is found using the security_id - * from the standard information attribute. - */ -typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR; - -/* - * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one - * referenced instance of each unique security descriptor is stored. - * - * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It - * does, however, contain two indexes ($SDH and $SII) as well as a named data - * stream ($SDS). - * - * Every unique security descriptor is assigned a unique security identifier - * (security_id, not to be confused with a SID). The security_id is unique for - * the NTFS volume and is used as an index into the $SII index, which maps - * security_ids to the security descriptor's storage location within the $SDS - * data attribute. The $SII index is sorted by ascending security_id. - * - * A simple hash is computed from each security descriptor. This hash is used - * as an index into the $SDH index, which maps security descriptor hashes to - * the security descriptor's storage location within the $SDS data attribute. - * The $SDH index is sorted by security descriptor hash and is stored in a B+ - * tree. When searching $SDH (with the intent of determining whether or not a - * new security descriptor is already present in the $SDS data stream), if a - * matching hash is found, but the security descriptors do not match, the - * search in the $SDH index is continued, searching for a next matching hash. - * - * When a precise match is found, the security_id coresponding to the security - * descriptor in the $SDS attribute is read from the found $SDH index entry and - * is stored in the $STANDARD_INFORMATION attribute of the file/directory to - * which the security descriptor is being applied. The $STANDARD_INFORMATION - * attribute is present in all base mft records (i.e. 
in all files and - * directories). - * - * If a match is not found, the security descriptor is assigned a new unique - * security_id and is added to the $SDS data attribute. Then, entries - * referencing the this security descriptor in the $SDS data attribute are - * added to the $SDH and $SII indexes. - * - * Note: Entries are never deleted from FILE_Secure, even if nothing - * references an entry any more. - */ - -/* - * This header precedes each security descriptor in the $SDS data stream. - * This is also the index entry data part of both the $SII and $SDH indexes. - */ -typedef struct { - le32 hash; /* Hash of the security descriptor. */ - le32 security_id; /* The security_id assigned to the descriptor. */ - le64 offset; /* Byte offset of this entry in the $SDS stream. */ - le32 length; /* Size in bytes of this entry in $SDS stream. */ -} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER; - -/* - * The $SDS data stream contains the security descriptors, aligned on 16-byte - * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot - * cross 256kib boundaries (this restriction is imposed by the Windows cache - * manager). Each security descriptor is contained in a SDS_ENTRY structure. - * Also, each security descriptor is stored twice in the $SDS stream with a - * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size) - * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the - * first copy of the security descriptor will be at offset 0x51d0 in the - * $SDS data stream and the second copy will be at offset 0x451d0. - */ -typedef struct { -/*Ofs*/ -/* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like - unnamed structs. */ - le32 hash; /* Hash of the security descriptor. */ - le32 security_id; /* The security_id assigned to the descriptor. */ - le64 offset; /* Byte offset of this entry in the $SDS stream. */ - le32 length; /* Size in bytes of this entry in $SDS stream. 
*/ -/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security - descriptor. */ -} __attribute__ ((__packed__)) SDS_ENTRY; - -/* - * The index entry key used in the $SII index. The collation type is - * COLLATION_NTOFS_ULONG. - */ -typedef struct { - le32 security_id; /* The security_id assigned to the descriptor. */ -} __attribute__ ((__packed__)) SII_INDEX_KEY; - -/* - * The index entry key used in the $SDH index. The keys are sorted first by - * hash and then by security_id. The collation rule is - * COLLATION_NTOFS_SECURITY_HASH. - */ -typedef struct { - le32 hash; /* Hash of the security descriptor. */ - le32 security_id; /* The security_id assigned to the descriptor. */ -} __attribute__ ((__packed__)) SDH_INDEX_KEY; - -/* - * Attribute: Volume name (0x60). - * - * NOTE: Always resident. - * NOTE: Present only in FILE_Volume. - */ -typedef struct { - ntfschar name[0]; /* The name of the volume in Unicode. */ -} __attribute__ ((__packed__)) VOLUME_NAME; - -/* - * Possible flags for the volume (16-bit). - */ -enum { - VOLUME_IS_DIRTY = cpu_to_le16(0x0001), - VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002), - VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004), - VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008), - - VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010), - VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020), - - VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000), - VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000), - - VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f), - - /* To make our life easier when checking if we must mount read-only. */ - VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027), -} __attribute__ ((__packed__)); - -typedef le16 VOLUME_FLAGS; - -/* - * Attribute: Volume information (0x70). - * - * NOTE: Always resident. - * NOTE: Present only in FILE_Volume. - * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses - * NTFS 1.2. I haven't personally seen other values yet. - */ -typedef struct { - le64 reserved; /* Not used (yet?). 
*/ - u8 major_ver; /* Major version of the ntfs format. */ - u8 minor_ver; /* Minor version of the ntfs format. */ - VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */ -} __attribute__ ((__packed__)) VOLUME_INFORMATION; - -/* - * Attribute: Data attribute (0x80). - * - * NOTE: Can be resident or non-resident. - * - * Data contents of a file (i.e. the unnamed stream) or of a named stream. - */ -typedef struct { - u8 data[0]; /* The file's data contents. */ -} __attribute__ ((__packed__)) DATA_ATTR; - -/* - * Index header flags (8-bit). - */ -enum { - /* - * When index header is in an index root attribute: - */ - SMALL_INDEX = 0, /* The index is small enough to fit inside the index - root attribute and there is no index allocation - attribute present. */ - LARGE_INDEX = 1, /* The index is too large to fit in the index root - attribute and/or an index allocation attribute is - present. */ - /* - * When index header is in an index block, i.e. is part of index - * allocation attribute: - */ - LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes - branching off it. */ - INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf - node. */ - NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */ -} __attribute__ ((__packed__)); - -typedef u8 INDEX_HEADER_FLAGS; - -/* - * This is the header for indexes, describing the INDEX_ENTRY records, which - * follow the INDEX_HEADER. Together the index header and the index entries - * make up a complete index. - * - * IMPORTANT NOTE: The offset, length and size structure members are counted - * relative to the start of the index header structure and not relative to the - * start of the index root or index allocation structures themselves. - */ -typedef struct { - le32 entries_offset; /* Byte offset to first INDEX_ENTRY - aligned to 8-byte boundary. */ - le32 index_length; /* Data size of the index in bytes, - i.e. bytes used from allocated - size, aligned to 8-byte boundary. 
*/ - le32 allocated_size; /* Byte size of this index (block), - multiple of 8 bytes. */ - /* NOTE: For the index root attribute, the above two numbers are always - equal, as the attribute is resident and it is resized as needed. In - the case of the index allocation attribute the attribute is not - resident and hence the allocated_size is a fixed value and must - equal the index_block_size specified by the INDEX_ROOT attribute - corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK - belongs to. */ - INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */ - u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ -} __attribute__ ((__packed__)) INDEX_HEADER; - -/* - * Attribute: Index root (0x90). - * - * NOTE: Always resident. - * - * This is followed by a sequence of index entries (INDEX_ENTRY structures) - * as described by the index header. - * - * When a directory is small enough to fit inside the index root then this - * is the only attribute describing the directory. When the directory is too - * large to fit in the index root, on the other hand, two additional attributes - * are present: an index allocation attribute, containing sub-nodes of the B+ - * directory tree (see below), and a bitmap attribute, describing which virtual - * cluster numbers (vcns) in the index allocation attribute are in use by an - * index block. - * - * NOTE: The root directory (FILE_root) contains an entry for itself. Other - * directories do not contain entries for themselves, though. - */ -typedef struct { - ATTR_TYPE type; /* Type of the indexed attribute. Is - $FILE_NAME for directories, zero - for view indexes. No other values - allowed. */ - COLLATION_RULE collation_rule; /* Collation rule used to sort the - index entries. If type is $FILE_NAME, - this must be COLLATION_FILE_NAME. */ - le32 index_block_size; /* Size of each index block in bytes (in - the index allocation attribute). 
*/ - u8 clusters_per_index_block; /* Cluster size of each index block (in - the index allocation attribute), when - an index block is >= than a cluster, - otherwise this will be the log of - the size (like how the encoding of - the mft record size and the index - record size found in the boot sector - work). Has to be a power of 2. */ - u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ - INDEX_HEADER index; /* Index header describing the - following index entries. */ -} __attribute__ ((__packed__)) INDEX_ROOT; - -/* - * Attribute: Index allocation (0xa0). - * - * NOTE: Always non-resident (doesn't make sense to be resident anyway!). - * - * This is an array of index blocks. Each index block starts with an - * INDEX_BLOCK structure containing an index header, followed by a sequence of - * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER. - */ -typedef struct { -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ - NTFS_RECORD_TYPE magic; /* Magic is "INDX". */ - le16 usa_ofs; /* See NTFS_RECORD definition. */ - le16 usa_count; /* See NTFS_RECORD definition. */ - -/* 8*/ sle64 lsn; /* $LogFile sequence number of the last - modification of this index block. */ -/* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block. - If the cluster_size on the volume is <= the - index_block_size of the directory, - index_block_vcn counts in units of clusters, - and in units of sectors otherwise. */ -/* 24*/ INDEX_HEADER index; /* Describes the following index entries. */ -/* sizeof()= 40 (0x28) bytes */ -/* - * When creating the index block, we place the update sequence array at this - * offset, i.e. before we start with the index entries. This also makes sense, - * otherwise we could run into problems with the update sequence array - * containing in itself the last two bytes of a sector which would mean that - * multi sector transfer protection wouldn't work. 
As you can't protect data - * by overwriting it since you then can't get it back... - * When reading use the data from the ntfs record header. - */ -} __attribute__ ((__packed__)) INDEX_BLOCK; - -typedef INDEX_BLOCK INDEX_ALLOCATION; - -/* - * The system file FILE_Extend/$Reparse contains an index named $R listing - * all reparse points on the volume. The index entry keys are as defined - * below. Note, that there is no index data associated with the index entries. - * - * The index entries are sorted by the index key file_id. The collation rule is - * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the - * primary key / is not a key at all. (AIA) - */ -typedef struct { - le32 reparse_tag; /* Reparse point type (inc. flags). */ - leMFT_REF file_id; /* Mft record of the file containing the - reparse point attribute. */ -} __attribute__ ((__packed__)) REPARSE_INDEX_KEY; - -/* - * Quota flags (32-bit). - * - * The user quota flags. Names explain meaning. - */ -enum { - QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001), - QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002), - QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004), - - QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007), - /* This is a bit mask for the user quota flags. */ - - /* - * These flags are only present in the quota defaults index entry, i.e. - * in the entry where owner_id = QUOTA_DEFAULTS_ID. - */ - QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010), - QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020), - QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040), - QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080), - - QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100), - QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200), - QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400), - QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800), -}; - -typedef le32 QUOTA_FLAGS; - -/* - * The system file FILE_Extend/$Quota contains two indexes $O and $Q. 
Quotas - * are on a per volume and per user basis. - * - * The $Q index contains one entry for each existing user_id on the volume. The - * index key is the user_id of the user/group owning this quota control entry, - * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the - * owner_id, is found in the standard information attribute. The collation rule - * for $Q is COLLATION_NTOFS_ULONG. - * - * The $O index contains one entry for each user/group who has been assigned - * a quota on that volume. The index key holds the SID of the user_id the - * entry belongs to, i.e. the owner_id. The collation rule for $O is - * COLLATION_NTOFS_SID. - * - * The $O index entry data is the user_id of the user corresponding to the SID. - * This user_id is used as an index into $Q to find the quota control entry - * associated with the SID. - * - * The $Q index entry data is the quota control entry and is defined below. - */ -typedef struct { - le32 version; /* Currently equals 2. */ - QUOTA_FLAGS flags; /* Flags describing this quota entry. */ - le64 bytes_used; /* How many bytes of the quota are in use. */ - sle64 change_time; /* Last time this quota entry was changed. */ - sle64 threshold; /* Soft quota (-1 if not limited). */ - sle64 limit; /* Hard quota (-1 if not limited). */ - sle64 exceeded_time; /* How long the soft quota has been exceeded. */ - SID sid; /* The SID of the user/object associated with - this quota entry. Equals zero for the quota - defaults entry (and in fact on a WinXP - volume, it is not present at all). */ -} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY; - -/* - * Predefined owner_id values (32-bit). - */ -enum { - QUOTA_INVALID_ID = cpu_to_le32(0x00000000), - QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001), - QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100), -}; - -/* - * Current constants for quota control entries. - */ -typedef enum { - /* Current version. 
*/ - QUOTA_VERSION = 2, -} QUOTA_CONTROL_ENTRY_CONSTANTS; - -/* - * Index entry flags (16-bit). - */ -enum { - INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a - sub-node, i.e. a reference to an index block in form of - a virtual cluster number (see below). */ - INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last - entry in an index block. The index entry does not - represent a file but it can point to a sub-node. */ - - INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force - enum bit width to 16-bit. */ -} __attribute__ ((__packed__)); - -typedef le16 INDEX_ENTRY_FLAGS; - -/* - * This the index entry header (see below). - */ -typedef struct { -/* 0*/ union { - struct { /* Only valid when INDEX_ENTRY_END is not set. */ - leMFT_REF indexed_file; /* The mft reference of the file - described by this index - entry. Used for directory - indexes. */ - } __attribute__ ((__packed__)) dir; - struct { /* Used for views/indexes to find the entry's data. */ - le16 data_offset; /* Data byte offset from this - INDEX_ENTRY. Follows the - index key. */ - le16 data_length; /* Data length in bytes. */ - le32 reservedV; /* Reserved (zero). */ - } __attribute__ ((__packed__)) vi; - } __attribute__ ((__packed__)) data; -/* 8*/ le16 length; /* Byte size of this index entry, multiple of - 8-bytes. */ -/* 10*/ le16 key_length; /* Byte size of the key value, which is in the - index entry. It follows field reserved. Not - multiple of 8-bytes. */ -/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ -/* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */ -/* sizeof() = 16 bytes */ -} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER; - -/* - * This is an index entry. A sequence of such entries follows each INDEX_HEADER - * structure. Together they make up a complete index. The index follows either - * an index root attribute or an index allocation attribute. - * - * NOTE: Before NTFS 3.0 only filename attributes were indexed. 
- */ -typedef struct { -/*Ofs*/ -/* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */ - union { - struct { /* Only valid when INDEX_ENTRY_END is not set. */ - leMFT_REF indexed_file; /* The mft reference of the file - described by this index - entry. Used for directory - indexes. */ - } __attribute__ ((__packed__)) dir; - struct { /* Used for views/indexes to find the entry's data. */ - le16 data_offset; /* Data byte offset from this - INDEX_ENTRY. Follows the - index key. */ - le16 data_length; /* Data length in bytes. */ - le32 reservedV; /* Reserved (zero). */ - } __attribute__ ((__packed__)) vi; - } __attribute__ ((__packed__)) data; - le16 length; /* Byte size of this index entry, multiple of - 8-bytes. */ - le16 key_length; /* Byte size of the key value, which is in the - index entry. It follows field reserved. Not - multiple of 8-bytes. */ - INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ - le16 reserved; /* Reserved/align to 8-byte boundary. */ - -/* 16*/ union { /* The key of the indexed attribute. NOTE: Only present - if INDEX_ENTRY_END bit in flags is not set. NOTE: On - NTFS versions before 3.0 the only valid key is the - FILE_NAME_ATTR. On NTFS 3.0+ the following - additional index keys are defined: */ - FILE_NAME_ATTR file_name;/* $I30 index in directories. */ - SII_INDEX_KEY sii; /* $SII index in $Secure. */ - SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */ - GUID object_id; /* $O index in FILE_Extend/$ObjId: The - object_id of the mft record found in - the data part of the index. */ - REPARSE_INDEX_KEY reparse; /* $R index in - FILE_Extend/$Reparse. */ - SID sid; /* $O index in FILE_Extend/$Quota: - SID of the owner of the user_id. */ - le32 owner_id; /* $Q index in FILE_Extend/$Quota: - user_id of the owner of the quota - control entry in the data part of - the index. */ - } __attribute__ ((__packed__)) key; - /* The (optional) index data is inserted here when creating. 
*/ - // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last - // eight bytes of this index entry contain the virtual - // cluster number of the index block that holds the - // entries immediately preceding the current entry (the - // vcn references the corresponding cluster in the data - // of the non-resident index allocation attribute). If - // the key_length is zero, then the vcn immediately - // follows the INDEX_ENTRY_HEADER. Regardless of - // key_length, the address of the 8-byte boundary - // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by - // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN), - // where sizeof(VCN) can be hardcoded as 8 if wanted. */ -} __attribute__ ((__packed__)) INDEX_ENTRY; - -/* - * Attribute: Bitmap (0xb0). - * - * Contains an array of bits (aka a bitfield). - * - * When used in conjunction with the index allocation attribute, each bit - * corresponds to one index block within the index allocation attribute. Thus - * the number of bits in the bitmap * index block size / cluster size is the - * number of clusters in the index allocation attribute. - */ -typedef struct { - u8 bitmap[0]; /* Array of bits. */ -} __attribute__ ((__packed__)) BITMAP_ATTR; - -/* - * The reparse point tag defines the type of the reparse point. It also - * includes several flags, which further describe the reparse point. - * - * The reparse point tag is an unsigned 32-bit value divided in three parts: - * - * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of - * the reparse point. - * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use. - * 3. The most significant three bits are flags describing the reparse point. - * They are defined as follows: - * bit 29: Name surrogate bit. If set, the filename is an alias for - * another object in the system. - * bit 30: High-latency bit. If set, accessing the first byte of data will - * be slow. (E.g. the data is stored on a tape drive.) 
- * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User - * defined tags have to use zero here. - * - * These are the predefined reparse point tags: - */ -enum { - IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000), - IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000), - IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000), - - IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000), - IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001), - IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001), - - IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005), - IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006), - IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007), - IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008), - - IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003), - - IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004), - - IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000), - - IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff), -}; - -/* - * Attribute: Reparse point (0xc0). - * - * NOTE: Can be resident or non-resident. - */ -typedef struct { - le32 reparse_tag; /* Reparse point type (inc. flags). */ - le16 reparse_data_length; /* Byte size of reparse data. */ - le16 reserved; /* Align to 8-byte boundary. */ - u8 reparse_data[0]; /* Meaning depends on reparse_tag. */ -} __attribute__ ((__packed__)) REPARSE_POINT; - -/* - * Attribute: Extended attribute (EA) information (0xd0). - * - * NOTE: Always resident. (Is this true???) - */ -typedef struct { - le16 ea_length; /* Byte size of the packed extended - attributes. */ - le16 need_ea_count; /* The number of extended attributes which have - the NEED_EA bit set. */ - le32 ea_query_length; /* Byte size of the buffer required to query - the extended attributes when calling - ZwQueryEaFile() in Windows NT/2k. I.e. the - byte size of the unpacked extended - attributes. */ -} __attribute__ ((__packed__)) EA_INFORMATION; - -/* - * Extended attribute flags (8-bit). 
- */ -enum { - NEED_EA = 0x80 /* If set the file to which the EA belongs - cannot be interpreted without understanding - the associates extended attributes. */ -} __attribute__ ((__packed__)); - -typedef u8 EA_FLAGS; - -/* - * Attribute: Extended attribute (EA) (0xe0). - * - * NOTE: Can be resident or non-resident. - * - * Like the attribute list and the index buffer list, the EA attribute value is - * a sequence of EA_ATTR variable length records. - */ -typedef struct { - le32 next_entry_offset; /* Offset to the next EA_ATTR. */ - EA_FLAGS flags; /* Flags describing the EA. */ - u8 ea_name_length; /* Length of the name of the EA in bytes - excluding the '\0' byte terminator. */ - le16 ea_value_length; /* Byte size of the EA's value. */ - u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not - Unicode and it is zero terminated. */ - u8 ea_value[0]; /* The value of the EA. Immediately follows - the name. */ -} __attribute__ ((__packed__)) EA_ATTR; - -/* - * Attribute: Property set (0xf0). - * - * Intended to support Native Structure Storage (NSS) - a feature removed from - * NTFS 3.0 during beta testing. - */ -typedef struct { - /* Irrelevant as feature unused. */ -} __attribute__ ((__packed__)) PROPERTY_SET; - -/* - * Attribute: Logged utility stream (0x100). - * - * NOTE: Can be resident or non-resident. - * - * Operations on this attribute are logged to the journal ($LogFile) like - * normal metadata changes. - * - * Used by the Encrypting File System (EFS). All encrypted files have this - * attribute with the name $EFS. - */ -typedef struct { - /* Can be anything the creator chooses. */ - /* EFS uses it as follows: */ - // FIXME: Type this info, verifying it along the way. 
(AIA) -} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR; - -#endif /* _LINUX_NTFS_LAYOUT_H */ diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c deleted file mode 100644 index eda9972e6159..000000000000 --- a/fs/ntfs/lcnalloc.c +++ /dev/null @@ -1,1000 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project. - * - * Copyright (c) 2004-2005 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include <linux/pagemap.h> - -#include "lcnalloc.h" -#include "debug.h" -#include "bitmap.h" -#include "inode.h" -#include "volume.h" -#include "attrib.h" -#include "malloc.h" -#include "aops.h" -#include "ntfs.h" - -/** - * ntfs_cluster_free_from_rl_nolock - free clusters from runlist - * @vol: mounted ntfs volume on which to free the clusters - * @rl: runlist describing the clusters to free - * - * Free all the clusters described by the runlist @rl on the volume @vol. In - * the case of an error being returned, at least some of the clusters were not - * freed. - * - * Return 0 on success and -errno on error. - * - * Locking: - The volume lcn bitmap must be locked for writing on entry and is - * left locked on return. 
- */ -int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, - const runlist_element *rl) -{ - struct inode *lcnbmp_vi = vol->lcnbmp_ino; - int ret = 0; - - ntfs_debug("Entering."); - if (!rl) - return 0; - for (; rl->length; rl++) { - int err; - - if (rl->lcn < 0) - continue; - err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length); - if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err)) - ret = err; - } - ntfs_debug("Done."); - return ret; -} - -/** - * ntfs_cluster_alloc - allocate clusters on an ntfs volume - * @vol: mounted ntfs volume on which to allocate the clusters - * @start_vcn: vcn to use for the first allocated cluster - * @count: number of clusters to allocate - * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none) - * @zone: zone from which to allocate the clusters - * @is_extension: if 'true', this is an attribute extension - * - * Allocate @count clusters preferably starting at cluster @start_lcn or at the - * current allocator position if @start_lcn is -1, on the mounted ntfs volume - * @vol. @zone is either DATA_ZONE for allocation of normal clusters or - * MFT_ZONE for allocation of clusters for the master file table, i.e. the - * $MFT/$DATA attribute. - * - * @start_vcn specifies the vcn of the first allocated cluster. This makes - * merging the resulting runlist with the old runlist easier. - * - * If @is_extension is 'true', the caller is allocating clusters to extend an - * attribute and if it is 'false', the caller is allocating clusters to fill a - * hole in an attribute. Practically the difference is that if @is_extension - * is 'true' the returned runlist will be terminated with LCN_ENOENT and if - * @is_extension is 'false' the runlist will be terminated with - * LCN_RL_NOT_MAPPED. - * - * You need to check the return value with IS_ERR(). If this is false, the - * function was successful and the return value is a runlist describing the - * allocated cluster(s). 
If IS_ERR() is true, the function failed and - * PTR_ERR() gives you the error code. - * - * Notes on the allocation algorithm - * ================================= - * - * There are two data zones. First is the area between the end of the mft zone - * and the end of the volume, and second is the area between the start of the - * volume and the start of the mft zone. On unmodified/standard NTFS 1.x - * volumes, the second data zone does not exist due to the mft zone being - * expanded to cover the start of the volume in order to reserve space for the - * mft bitmap attribute. - * - * This is not the prettiest function but the complexity stems from the need of - * implementing the mft vs data zoned approach and from the fact that we have - * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we - * need to cope with crossing over boundaries of two buffers. Further, the - * fact that the allocator allows for caller supplied hints as to the location - * of where allocation should begin and the fact that the allocator keeps track - * of where in the data zones the next natural allocation should occur, - * contribute to the complexity of the function. But it should all be - * worthwhile, because this allocator should: 1) be a full implementation of - * the MFT zone approach used by Windows NT, 2) cause reduction in - * fragmentation, and 3) be speedy in allocations (the code is not optimized - * for speed, but the algorithm is, so further speed improvements are probably - * possible). - * - * FIXME: We should be monitoring cluster allocation and increment the MFT zone - * size dynamically but this is something for the future. We will just cause - * heavier fragmentation by not doing it and I am not even sure Windows would - * grow the MFT zone dynamically, so it might even be correct not to do this. - * The overhead in doing dynamic MFT zone expansion would be very large and - * unlikely worth the effort. 
(AIA) - * - * TODO: I have added in double the required zone position pointer wrap around - * logic which can be optimized to having only one of the two logic sets. - * However, having the double logic will work fine, but if we have only one of - * the sets and we get it wrong somewhere, then we get into trouble, so - * removing the duplicate logic requires _very_ careful consideration of _all_ - * possible code paths. So at least for now, I am leaving the double logic - - * better safe than sorry... (AIA) - * - * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked - * on return. - * - This function takes the volume lcn bitmap lock for writing and - * modifies the bitmap contents. - */ -runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, - const s64 count, const LCN start_lcn, - const NTFS_CLUSTER_ALLOCATION_ZONES zone, - const bool is_extension) -{ - LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn; - LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size; - s64 clusters; - loff_t i_size; - struct inode *lcnbmp_vi; - runlist_element *rl = NULL; - struct address_space *mapping; - struct page *page = NULL; - u8 *buf, *byte; - int err = 0, rlpos, rlsize, buf_size; - u8 pass, done_zones, search_zone, need_writeback = 0, bit; - - ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn " - "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn, - (unsigned long long)count, - (unsigned long long)start_lcn, - zone == MFT_ZONE ? "MFT" : "DATA"); - BUG_ON(!vol); - lcnbmp_vi = vol->lcnbmp_ino; - BUG_ON(!lcnbmp_vi); - BUG_ON(start_vcn < 0); - BUG_ON(count < 0); - BUG_ON(start_lcn < -1); - BUG_ON(zone < FIRST_ZONE); - BUG_ON(zone > LAST_ZONE); - - /* Return NULL if @count is zero. */ - if (!count) - return NULL; - /* Take the lcnbmp lock for writing. 
*/ - down_write(&vol->lcnbmp_lock); - /* - * If no specific @start_lcn was requested, use the current data zone - * position, otherwise use the requested @start_lcn but make sure it - * lies outside the mft zone. Also set done_zones to 0 (no zones done) - * and pass depending on whether we are starting inside a zone (1) or - * at the beginning of a zone (2). If requesting from the MFT_ZONE, - * we either start at the current position within the mft zone or at - * the specified position. If the latter is out of bounds then we start - * at the beginning of the MFT_ZONE. - */ - done_zones = 0; - pass = 1; - /* - * zone_start and zone_end are the current search range. search_zone - * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of - * volume) and 4 for data zone 2 (start of volume till start of mft - * zone). - */ - zone_start = start_lcn; - if (zone_start < 0) { - if (zone == DATA_ZONE) - zone_start = vol->data1_zone_pos; - else - zone_start = vol->mft_zone_pos; - if (!zone_start) { - /* - * Zone starts at beginning of volume which means a - * single pass is sufficient. - */ - pass = 2; - } - } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start && - zone_start < vol->mft_zone_end) { - zone_start = vol->mft_zone_end; - /* - * Starting at beginning of data1_zone which means a single - * pass in this zone is sufficient. - */ - pass = 2; - } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start || - zone_start >= vol->mft_zone_end)) { - zone_start = vol->mft_lcn; - if (!vol->mft_zone_end) - zone_start = 0; - /* - * Starting at beginning of volume which means a single pass - * is sufficient. - */ - pass = 2; - } - if (zone == MFT_ZONE) { - zone_end = vol->mft_zone_end; - search_zone = 1; - } else /* if (zone == DATA_ZONE) */ { - /* Skip searching the mft zone. 
*/ - done_zones |= 1; - if (zone_start >= vol->mft_zone_end) { - zone_end = vol->nr_clusters; - search_zone = 2; - } else { - zone_end = vol->mft_zone_start; - search_zone = 4; - } - } - /* - * bmp_pos is the current bit position inside the bitmap. We use - * bmp_initial_pos to determine whether or not to do a zone switch. - */ - bmp_pos = bmp_initial_pos = zone_start; - - /* Loop until all clusters are allocated, i.e. clusters == 0. */ - clusters = count; - rlpos = rlsize = 0; - mapping = lcnbmp_vi->i_mapping; - i_size = i_size_read(lcnbmp_vi); - while (1) { - ntfs_debug("Start of outer while loop: done_zones 0x%x, " - "search_zone %i, pass %i, zone_start 0x%llx, " - "zone_end 0x%llx, bmp_initial_pos 0x%llx, " - "bmp_pos 0x%llx, rlpos %i, rlsize %i.", - done_zones, search_zone, pass, - (unsigned long long)zone_start, - (unsigned long long)zone_end, - (unsigned long long)bmp_initial_pos, - (unsigned long long)bmp_pos, rlpos, rlsize); - /* Loop until we run out of free clusters. */ - last_read_pos = bmp_pos >> 3; - ntfs_debug("last_read_pos 0x%llx.", - (unsigned long long)last_read_pos); - if (last_read_pos > i_size) { - ntfs_debug("End of attribute reached. 
" - "Skipping to zone_pass_done."); - goto zone_pass_done; - } - if (likely(page)) { - if (need_writeback) { - ntfs_debug("Marking page dirty."); - flush_dcache_page(page); - set_page_dirty(page); - need_writeback = 0; - } - ntfs_unmap_page(page); - } - page = ntfs_map_page(mapping, last_read_pos >> - PAGE_SHIFT); - if (IS_ERR(page)) { - err = PTR_ERR(page); - ntfs_error(vol->sb, "Failed to map page."); - goto out; - } - buf_size = last_read_pos & ~PAGE_MASK; - buf = page_address(page) + buf_size; - buf_size = PAGE_SIZE - buf_size; - if (unlikely(last_read_pos + buf_size > i_size)) - buf_size = i_size - last_read_pos; - buf_size <<= 3; - lcn = bmp_pos & 7; - bmp_pos &= ~(LCN)7; - ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, " - "bmp_pos 0x%llx, need_writeback %i.", buf_size, - (unsigned long long)lcn, - (unsigned long long)bmp_pos, need_writeback); - while (lcn < buf_size && lcn + bmp_pos < zone_end) { - byte = buf + (lcn >> 3); - ntfs_debug("In inner while loop: buf_size %i, " - "lcn 0x%llx, bmp_pos 0x%llx, " - "need_writeback %i, byte ofs 0x%x, " - "*byte 0x%x.", buf_size, - (unsigned long long)lcn, - (unsigned long long)bmp_pos, - need_writeback, - (unsigned int)(lcn >> 3), - (unsigned int)*byte); - /* Skip full bytes. */ - if (*byte == 0xff) { - lcn = (lcn + 8) & ~(LCN)7; - ntfs_debug("Continuing while loop 1."); - continue; - } - bit = 1 << (lcn & 7); - ntfs_debug("bit 0x%x.", bit); - /* If the bit is already set, go onto the next one. */ - if (*byte & bit) { - lcn++; - ntfs_debug("Continuing while loop 2."); - continue; - } - /* - * Allocate more memory if needed, including space for - * the terminator element. - * ntfs_malloc_nofs() operates on whole pages only. 
- */ - if ((rlpos + 2) * sizeof(*rl) > rlsize) { - runlist_element *rl2; - - ntfs_debug("Reallocating memory."); - if (!rl) - ntfs_debug("First free bit is at LCN " - "0x%llx.", - (unsigned long long) - (lcn + bmp_pos)); - rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); - if (unlikely(!rl2)) { - err = -ENOMEM; - ntfs_error(vol->sb, "Failed to " - "allocate memory."); - goto out; - } - memcpy(rl2, rl, rlsize); - ntfs_free(rl); - rl = rl2; - rlsize += PAGE_SIZE; - ntfs_debug("Reallocated memory, rlsize 0x%x.", - rlsize); - } - /* Allocate the bitmap bit. */ - *byte |= bit; - /* We need to write this bitmap page to disk. */ - need_writeback = 1; - ntfs_debug("*byte 0x%x, need_writeback is set.", - (unsigned int)*byte); - /* - * Coalesce with previous run if adjacent LCNs. - * Otherwise, append a new run. - */ - ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), " - "prev_lcn 0x%llx, lcn 0x%llx, " - "bmp_pos 0x%llx, prev_run_len 0x%llx, " - "rlpos %i.", - (unsigned long long)(lcn + bmp_pos), - 1ULL, (unsigned long long)prev_lcn, - (unsigned long long)lcn, - (unsigned long long)bmp_pos, - (unsigned long long)prev_run_len, - rlpos); - if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) { - ntfs_debug("Coalescing to run (lcn 0x%llx, " - "len 0x%llx).", - (unsigned long long) - rl[rlpos - 1].lcn, - (unsigned long long) - rl[rlpos - 1].length); - rl[rlpos - 1].length = ++prev_run_len; - ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), " - "prev_run_len 0x%llx.", - (unsigned long long) - rl[rlpos - 1].lcn, - (unsigned long long) - rl[rlpos - 1].length, - (unsigned long long) - prev_run_len); - } else { - if (likely(rlpos)) { - ntfs_debug("Adding new run, (previous " - "run lcn 0x%llx, " - "len 0x%llx).", - (unsigned long long) - rl[rlpos - 1].lcn, - (unsigned long long) - rl[rlpos - 1].length); - rl[rlpos].vcn = rl[rlpos - 1].vcn + - prev_run_len; - } else { - ntfs_debug("Adding new run, is first " - "run."); - rl[rlpos].vcn = start_vcn; - } - rl[rlpos].lcn = prev_lcn = lcn + 
bmp_pos; - rl[rlpos].length = prev_run_len = 1; - rlpos++; - } - /* Done? */ - if (!--clusters) { - LCN tc; - /* - * Update the current zone position. Positions - * of already scanned zones have been updated - * during the respective zone switches. - */ - tc = lcn + bmp_pos + 1; - ntfs_debug("Done. Updating current zone " - "position, tc 0x%llx, " - "search_zone %i.", - (unsigned long long)tc, - search_zone); - switch (search_zone) { - case 1: - ntfs_debug("Before checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) - vol->mft_zone_pos); - if (tc >= vol->mft_zone_end) { - vol->mft_zone_pos = - vol->mft_lcn; - if (!vol->mft_zone_end) - vol->mft_zone_pos = 0; - } else if ((bmp_initial_pos >= - vol->mft_zone_pos || - tc > vol->mft_zone_pos) - && tc >= vol->mft_lcn) - vol->mft_zone_pos = tc; - ntfs_debug("After checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) - vol->mft_zone_pos); - break; - case 2: - ntfs_debug("Before checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data1_zone_pos); - if (tc >= vol->nr_clusters) - vol->data1_zone_pos = - vol->mft_zone_end; - else if ((bmp_initial_pos >= - vol->data1_zone_pos || - tc > vol->data1_zone_pos) - && tc >= vol->mft_zone_end) - vol->data1_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data1_zone_pos); - break; - case 4: - ntfs_debug("Before checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data2_zone_pos); - if (tc >= vol->mft_zone_start) - vol->data2_zone_pos = 0; - else if (bmp_initial_pos >= - vol->data2_zone_pos || - tc > vol->data2_zone_pos) - vol->data2_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data2_zone_pos); - break; - default: - BUG(); - } - ntfs_debug("Finished. 
Going to out."); - goto out; - } - lcn++; - } - bmp_pos += buf_size; - ntfs_debug("After inner while loop: buf_size 0x%x, lcn " - "0x%llx, bmp_pos 0x%llx, need_writeback %i.", - buf_size, (unsigned long long)lcn, - (unsigned long long)bmp_pos, need_writeback); - if (bmp_pos < zone_end) { - ntfs_debug("Continuing outer while loop, " - "bmp_pos 0x%llx, zone_end 0x%llx.", - (unsigned long long)bmp_pos, - (unsigned long long)zone_end); - continue; - } -zone_pass_done: /* Finished with the current zone pass. */ - ntfs_debug("At zone_pass_done, pass %i.", pass); - if (pass == 1) { - /* - * Now do pass 2, scanning the first part of the zone - * we omitted in pass 1. - */ - pass = 2; - zone_end = zone_start; - switch (search_zone) { - case 1: /* mft_zone */ - zone_start = vol->mft_zone_start; - break; - case 2: /* data1_zone */ - zone_start = vol->mft_zone_end; - break; - case 4: /* data2_zone */ - zone_start = 0; - break; - default: - BUG(); - } - /* Sanity check. */ - if (zone_end < zone_start) - zone_end = zone_start; - bmp_pos = zone_start; - ntfs_debug("Continuing outer while loop, pass 2, " - "zone_start 0x%llx, zone_end 0x%llx, " - "bmp_pos 0x%llx.", - (unsigned long long)zone_start, - (unsigned long long)zone_end, - (unsigned long long)bmp_pos); - continue; - } /* pass == 2 */ -done_zones_check: - ntfs_debug("At done_zones_check, search_zone %i, done_zones " - "before 0x%x, done_zones after 0x%x.", - search_zone, done_zones, - done_zones | search_zone); - done_zones |= search_zone; - if (done_zones < 7) { - ntfs_debug("Switching zone."); - /* Now switch to the next zone we haven't done yet. */ - pass = 1; - switch (search_zone) { - case 1: - ntfs_debug("Switching from mft zone to data1 " - "zone."); - /* Update mft zone position. 
*/ - if (rlpos) { - LCN tc; - - ntfs_debug("Before checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) - vol->mft_zone_pos); - tc = rl[rlpos - 1].lcn + - rl[rlpos - 1].length; - if (tc >= vol->mft_zone_end) { - vol->mft_zone_pos = - vol->mft_lcn; - if (!vol->mft_zone_end) - vol->mft_zone_pos = 0; - } else if ((bmp_initial_pos >= - vol->mft_zone_pos || - tc > vol->mft_zone_pos) - && tc >= vol->mft_lcn) - vol->mft_zone_pos = tc; - ntfs_debug("After checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) - vol->mft_zone_pos); - } - /* Switch from mft zone to data1 zone. */ -switch_to_data1_zone: search_zone = 2; - zone_start = bmp_initial_pos = - vol->data1_zone_pos; - zone_end = vol->nr_clusters; - if (zone_start == vol->mft_zone_end) - pass = 2; - if (zone_start >= zone_end) { - vol->data1_zone_pos = zone_start = - vol->mft_zone_end; - pass = 2; - } - break; - case 2: - ntfs_debug("Switching from data1 zone to " - "data2 zone."); - /* Update data1 zone position. */ - if (rlpos) { - LCN tc; - - ntfs_debug("Before checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data1_zone_pos); - tc = rl[rlpos - 1].lcn + - rl[rlpos - 1].length; - if (tc >= vol->nr_clusters) - vol->data1_zone_pos = - vol->mft_zone_end; - else if ((bmp_initial_pos >= - vol->data1_zone_pos || - tc > vol->data1_zone_pos) - && tc >= vol->mft_zone_end) - vol->data1_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data1_zone_pos); - } - /* Switch from data1 zone to data2 zone. */ - search_zone = 4; - zone_start = bmp_initial_pos = - vol->data2_zone_pos; - zone_end = vol->mft_zone_start; - if (!zone_start) - pass = 2; - if (zone_start >= zone_end) { - vol->data2_zone_pos = zone_start = - bmp_initial_pos = 0; - pass = 2; - } - break; - case 4: - ntfs_debug("Switching from data2 zone to " - "data1 zone."); - /* Update data2 zone position. 
*/ - if (rlpos) { - LCN tc; - - ntfs_debug("Before checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data2_zone_pos); - tc = rl[rlpos - 1].lcn + - rl[rlpos - 1].length; - if (tc >= vol->mft_zone_start) - vol->data2_zone_pos = 0; - else if (bmp_initial_pos >= - vol->data2_zone_pos || - tc > vol->data2_zone_pos) - vol->data2_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) - vol->data2_zone_pos); - } - /* Switch from data2 zone to data1 zone. */ - goto switch_to_data1_zone; - default: - BUG(); - } - ntfs_debug("After zone switch, search_zone %i, " - "pass %i, bmp_initial_pos 0x%llx, " - "zone_start 0x%llx, zone_end 0x%llx.", - search_zone, pass, - (unsigned long long)bmp_initial_pos, - (unsigned long long)zone_start, - (unsigned long long)zone_end); - bmp_pos = zone_start; - if (zone_start == zone_end) { - ntfs_debug("Empty zone, going to " - "done_zones_check."); - /* Empty zone. Don't bother searching it. */ - goto done_zones_check; - } - ntfs_debug("Continuing outer while loop."); - continue; - } /* done_zones == 7 */ - ntfs_debug("All zones are finished."); - /* - * All zones are finished! If DATA_ZONE, shrink mft zone. If - * MFT_ZONE, we have really run out of space. - */ - mft_zone_size = vol->mft_zone_end - vol->mft_zone_start; - ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end " - "0x%llx, mft_zone_size 0x%llx.", - (unsigned long long)vol->mft_zone_start, - (unsigned long long)vol->mft_zone_end, - (unsigned long long)mft_zone_size); - if (zone == MFT_ZONE || mft_zone_size <= 0) { - ntfs_debug("No free clusters left, going to out."); - /* Really no more space left on device. 
*/ - err = -ENOSPC; - goto out; - } /* zone == DATA_ZONE && mft_zone_size > 0 */ - ntfs_debug("Shrinking mft zone."); - zone_end = vol->mft_zone_end; - mft_zone_size >>= 1; - if (mft_zone_size > 0) - vol->mft_zone_end = vol->mft_zone_start + mft_zone_size; - else /* mft zone and data2 zone no longer exist. */ - vol->data2_zone_pos = vol->mft_zone_start = - vol->mft_zone_end = 0; - if (vol->mft_zone_pos >= vol->mft_zone_end) { - vol->mft_zone_pos = vol->mft_lcn; - if (!vol->mft_zone_end) - vol->mft_zone_pos = 0; - } - bmp_pos = zone_start = bmp_initial_pos = - vol->data1_zone_pos = vol->mft_zone_end; - search_zone = 2; - pass = 2; - done_zones &= ~2; - ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, " - "vol->mft_zone_start 0x%llx, " - "vol->mft_zone_end 0x%llx, " - "vol->mft_zone_pos 0x%llx, search_zone 2, " - "pass 2, dones_zones 0x%x, zone_start 0x%llx, " - "zone_end 0x%llx, vol->data1_zone_pos 0x%llx, " - "continuing outer while loop.", - (unsigned long long)mft_zone_size, - (unsigned long long)vol->mft_zone_start, - (unsigned long long)vol->mft_zone_end, - (unsigned long long)vol->mft_zone_pos, - done_zones, (unsigned long long)zone_start, - (unsigned long long)zone_end, - (unsigned long long)vol->data1_zone_pos); - } - ntfs_debug("After outer while loop."); -out: - ntfs_debug("At out."); - /* Add runlist terminator element. */ - if (likely(rl)) { - rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length; - rl[rlpos].lcn = is_extension ? 
LCN_ENOENT : LCN_RL_NOT_MAPPED; - rl[rlpos].length = 0; - } - if (likely(page && !IS_ERR(page))) { - if (need_writeback) { - ntfs_debug("Marking page dirty."); - flush_dcache_page(page); - set_page_dirty(page); - need_writeback = 0; - } - ntfs_unmap_page(page); - } - if (likely(!err)) { - up_write(&vol->lcnbmp_lock); - ntfs_debug("Done."); - return rl; - } - ntfs_error(vol->sb, "Failed to allocate clusters, aborting " - "(error %i).", err); - if (rl) { - int err2; - - if (err == -ENOSPC) - ntfs_debug("Not enough space to complete allocation, " - "err -ENOSPC, first free lcn 0x%llx, " - "could allocate up to 0x%llx " - "clusters.", - (unsigned long long)rl[0].lcn, - (unsigned long long)(count - clusters)); - /* Deallocate all allocated clusters. */ - ntfs_debug("Attempting rollback..."); - err2 = ntfs_cluster_free_from_rl_nolock(vol, rl); - if (err2) { - ntfs_error(vol->sb, "Failed to rollback (error %i). " - "Leaving inconsistent metadata! " - "Unmount and run chkdsk.", err2); - NVolSetErrors(vol); - } - /* Free the runlist. */ - ntfs_free(rl); - } else if (err == -ENOSPC) - ntfs_debug("No space left at all, err = -ENOSPC, first free " - "lcn = 0x%llx.", - (long long)vol->data1_zone_pos); - up_write(&vol->lcnbmp_lock); - return ERR_PTR(err); -} - -/** - * __ntfs_cluster_free - free clusters on an ntfs volume - * @ni: ntfs inode whose runlist describes the clusters to free - * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters - * @count: number of clusters to free or -1 for all clusters - * @ctx: active attribute search context if present or NULL if not - * @is_rollback: true if this is a rollback operation - * - * Free @count clusters starting at the cluster @start_vcn in the runlist - * described by the vfs inode @ni. - * - * If @count is -1, all clusters from @start_vcn to the end of the runlist are - * deallocated. Thus, to completely free all clusters in a runlist, use - * @start_vcn = 0 and @count = -1. 
- * - * If @ctx is specified, it is an active search context of @ni and its base mft - * record. This is needed when __ntfs_cluster_free() encounters unmapped - * runlist fragments and allows their mapping. If you do not have the mft - * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will - * perform the necessary mapping and unmapping. - * - * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it - * before returning. Thus, @ctx will be left pointing to the same attribute on - * return as on entry. However, the actual pointers in @ctx may point to - * different memory locations on return, so you must remember to reset any - * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(), - * you will probably want to do: - * m = ctx->mrec; - * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. - * - * @is_rollback should always be 'false', it is for internal use to rollback - * errors. You probably want to use ntfs_cluster_free() instead. - * - * Note, __ntfs_cluster_free() does not modify the runlist, so you have to - * remove from the runlist or mark sparse the freed runs later. - * - * Return the number of deallocated clusters (not counting sparse ones) on - * success and -errno on error. - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. you need to either call - * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. - * In that case PTR_ERR(@ctx->mrec) will give you the error code for - * why the mapping of the old inode failed. - * - * Locking: - The runlist described by @ni must be locked for writing on entry - * and is locked on return. Note the runlist may be modified when - * needed runlist fragments need to be mapped. 
- * - The volume lcn bitmap must be unlocked on entry and is unlocked - * on return. - * - This function takes the volume lcn bitmap lock for writing and - * modifies the bitmap contents. - * - If @ctx is NULL, the base mft record of @ni must not be mapped on - * entry and it will be left unmapped on return. - * - If @ctx is not NULL, the base mft record must be mapped on entry - * and it will be left mapped on return. - */ -s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, - ntfs_attr_search_ctx *ctx, const bool is_rollback) -{ - s64 delta, to_free, total_freed, real_freed; - ntfs_volume *vol; - struct inode *lcnbmp_vi; - runlist_element *rl; - int err; - - BUG_ON(!ni); - ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count " - "0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn, - (unsigned long long)count, - is_rollback ? " (rollback)" : ""); - vol = ni->vol; - lcnbmp_vi = vol->lcnbmp_ino; - BUG_ON(!lcnbmp_vi); - BUG_ON(start_vcn < 0); - BUG_ON(count < -1); - /* - * Lock the lcn bitmap for writing but only if not rolling back. We - * must hold the lock all the way including through rollback otherwise - * rollback is not possible because once we have cleared a bit and - * dropped the lock, anyone could have set the bit again, thus - * allocating the cluster for another use. - */ - if (likely(!is_rollback)) - down_write(&vol->lcnbmp_lock); - - total_freed = real_freed = 0; - - rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx); - if (IS_ERR(rl)) { - if (!is_rollback) - ntfs_error(vol->sb, "Failed to find first runlist " - "element (error %li), aborting.", - PTR_ERR(rl)); - err = PTR_ERR(rl); - goto err_out; - } - if (unlikely(rl->lcn < LCN_HOLE)) { - if (!is_rollback) - ntfs_error(vol->sb, "First runlist element has " - "invalid lcn, aborting."); - err = -EIO; - goto err_out; - } - /* Find the starting cluster inside the run that needs freeing. 
*/ - delta = start_vcn - rl->vcn; - - /* The number of clusters in this run that need freeing. */ - to_free = rl->length - delta; - if (count >= 0 && to_free > count) - to_free = count; - - if (likely(rl->lcn >= 0)) { - /* Do the actual freeing of the clusters in this run. */ - err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta, - to_free, likely(!is_rollback) ? 0 : 1); - if (unlikely(err)) { - if (!is_rollback) - ntfs_error(vol->sb, "Failed to clear first run " - "(error %i), aborting.", err); - goto err_out; - } - /* We have freed @to_free real clusters. */ - real_freed = to_free; - }; - /* Go to the next run and adjust the number of clusters left to free. */ - ++rl; - if (count >= 0) - count -= to_free; - - /* Keep track of the total "freed" clusters, including sparse ones. */ - total_freed = to_free; - /* - * Loop over the remaining runs, using @count as a capping value, and - * free them. - */ - for (; rl->length && count != 0; ++rl) { - if (unlikely(rl->lcn < LCN_HOLE)) { - VCN vcn; - - /* Attempt to map runlist. */ - vcn = rl->vcn; - rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - if (!is_rollback) - ntfs_error(vol->sb, "Failed to map " - "runlist fragment or " - "failed to find " - "subsequent runlist " - "element."); - goto err_out; - } - if (unlikely(rl->lcn < LCN_HOLE)) { - if (!is_rollback) - ntfs_error(vol->sb, "Runlist element " - "has invalid lcn " - "(0x%llx).", - (unsigned long long) - rl->lcn); - err = -EIO; - goto err_out; - } - } - /* The number of clusters in this run that need freeing. */ - to_free = rl->length; - if (count >= 0 && to_free > count) - to_free = count; - - if (likely(rl->lcn >= 0)) { - /* Do the actual freeing of the clusters in the run. */ - err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn, - to_free, likely(!is_rollback) ? 
0 : 1); - if (unlikely(err)) { - if (!is_rollback) - ntfs_error(vol->sb, "Failed to clear " - "subsequent run."); - goto err_out; - } - /* We have freed @to_free real clusters. */ - real_freed += to_free; - } - /* Adjust the number of clusters left to free. */ - if (count >= 0) - count -= to_free; - - /* Update the total done clusters. */ - total_freed += to_free; - } - if (likely(!is_rollback)) - up_write(&vol->lcnbmp_lock); - - BUG_ON(count > 0); - - /* We are done. Return the number of actually freed clusters. */ - ntfs_debug("Done."); - return real_freed; -err_out: - if (is_rollback) - return err; - /* If no real clusters were freed, no need to rollback. */ - if (!real_freed) { - up_write(&vol->lcnbmp_lock); - return err; - } - /* - * Attempt to rollback and if that succeeds just return the error code. - * If rollback fails, set the volume errors flag, emit an error - * message, and return the error code. - */ - delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true); - if (delta < 0) { - ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving " - "inconsistent metadata! Unmount and run " - "chkdsk.", (int)delta); - NVolSetErrors(vol); - } - up_write(&vol->lcnbmp_lock); - ntfs_error(vol->sb, "Aborting (error %i).", err); - return err; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h deleted file mode 100644 index 1589a6d8434b..000000000000 --- a/fs/ntfs/lcnalloc.h +++ /dev/null @@ -1,131 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2004-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_LCNALLOC_H -#define _LINUX_NTFS_LCNALLOC_H - -#ifdef NTFS_RW - -#include <linux/fs.h> - -#include "attrib.h" -#include "types.h" -#include "inode.h" -#include "runlist.h" -#include "volume.h" - -typedef enum { - FIRST_ZONE = 0, /* For sanity checking. 
*/ - MFT_ZONE = 0, /* Allocate from $MFT zone. */ - DATA_ZONE = 1, /* Allocate from $DATA zone. */ - LAST_ZONE = 1, /* For sanity checking. */ -} NTFS_CLUSTER_ALLOCATION_ZONES; - -extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, - const VCN start_vcn, const s64 count, const LCN start_lcn, - const NTFS_CLUSTER_ALLOCATION_ZONES zone, - const bool is_extension); - -extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, - s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback); - -/** - * ntfs_cluster_free - free clusters on an ntfs volume - * @ni: ntfs inode whose runlist describes the clusters to free - * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters - * @count: number of clusters to free or -1 for all clusters - * @ctx: active attribute search context if present or NULL if not - * - * Free @count clusters starting at the cluster @start_vcn in the runlist - * described by the ntfs inode @ni. - * - * If @count is -1, all clusters from @start_vcn to the end of the runlist are - * deallocated. Thus, to completely free all clusters in a runlist, use - * @start_vcn = 0 and @count = -1. - * - * If @ctx is specified, it is an active search context of @ni and its base mft - * record. This is needed when ntfs_cluster_free() encounters unmapped runlist - * fragments and allows their mapping. If you do not have the mft record - * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform - * the necessary mapping and unmapping. - * - * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it - * before returning. Thus, @ctx will be left pointing to the same attribute on - * return as on entry. However, the actual pointers in @ctx may point to - * different memory locations on return, so you must remember to reset any - * cached pointers from the @ctx, i.e. 
after the call to ntfs_cluster_free(), - * you will probably want to do: - * m = ctx->mrec; - * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. - * - * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove - * from the runlist or mark sparse the freed runs later. - * - * Return the number of deallocated clusters (not counting sparse ones) on - * success and -errno on error. - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. you need to either call - * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. - * In that case PTR_ERR(@ctx->mrec) will give you the error code for - * why the mapping of the old inode failed. - * - * Locking: - The runlist described by @ni must be locked for writing on entry - * and is locked on return. Note the runlist may be modified when - * needed runlist fragments need to be mapped. - * - The volume lcn bitmap must be unlocked on entry and is unlocked - * on return. - * - This function takes the volume lcn bitmap lock for writing and - * modifies the bitmap contents. - * - If @ctx is NULL, the base mft record of @ni must not be mapped on - * entry and it will be left unmapped on return. - * - If @ctx is not NULL, the base mft record must be mapped on entry - * and it will be left mapped on return. 
- */ -static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, - s64 count, ntfs_attr_search_ctx *ctx) -{ - return __ntfs_cluster_free(ni, start_vcn, count, ctx, false); -} - -extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, - const runlist_element *rl); - -/** - * ntfs_cluster_free_from_rl - free clusters from runlist - * @vol: mounted ntfs volume on which to free the clusters - * @rl: runlist describing the clusters to free - * - * Free all the clusters described by the runlist @rl on the volume @vol. In - * the case of an error being returned, at least some of the clusters were not - * freed. - * - * Return 0 on success and -errno on error. - * - * Locking: - This function takes the volume lcn bitmap lock for writing and - * modifies the bitmap contents. - * - The caller must have locked the runlist @rl for reading or - * writing. - */ -static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol, - const runlist_element *rl) -{ - int ret; - - down_write(&vol->lcnbmp_lock); - ret = ntfs_cluster_free_from_rl_nolock(vol, rl); - up_write(&vol->lcnbmp_lock); - return ret; -} - -#endif /* NTFS_RW */ - -#endif /* defined _LINUX_NTFS_LCNALLOC_H */ diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c deleted file mode 100644 index 6ce60ffc6ac0..000000000000 --- a/fs/ntfs/logfile.c +++ /dev/null @@ -1,849 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project. 
- * - * Copyright (c) 2002-2007 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/highmem.h> -#include <linux/buffer_head.h> -#include <linux/bitops.h> -#include <linux/log2.h> -#include <linux/bio.h> - -#include "attrib.h" -#include "aops.h" -#include "debug.h" -#include "logfile.h" -#include "malloc.h" -#include "volume.h" -#include "ntfs.h" - -/** - * ntfs_check_restart_page_header - check the page header for consistency - * @vi: $LogFile inode to which the restart page header belongs - * @rp: restart page header to check - * @pos: position in @vi at which the restart page header resides - * - * Check the restart page header @rp for consistency and return 'true' if it is - * consistent and 'false' otherwise. - * - * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not - * require the full restart page. - */ -static bool ntfs_check_restart_page_header(struct inode *vi, - RESTART_PAGE_HEADER *rp, s64 pos) -{ - u32 logfile_system_page_size, logfile_log_page_size; - u16 ra_ofs, usa_count, usa_ofs, usa_end = 0; - bool have_usa = true; - - ntfs_debug("Entering."); - /* - * If the system or log page sizes are smaller than the ntfs block size - * or either is not a power of 2 we cannot handle this log file. - */ - logfile_system_page_size = le32_to_cpu(rp->system_page_size); - logfile_log_page_size = le32_to_cpu(rp->log_page_size); - if (logfile_system_page_size < NTFS_BLOCK_SIZE || - logfile_log_page_size < NTFS_BLOCK_SIZE || - logfile_system_page_size & - (logfile_system_page_size - 1) || - !is_power_of_2(logfile_log_page_size)) { - ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); - return false; - } - /* - * We must be either at !pos (1st restart page) or at pos = system page - * size (2nd restart page). 
- */ - if (pos && pos != logfile_system_page_size) { - ntfs_error(vi->i_sb, "Found restart area in incorrect " - "position in $LogFile."); - return false; - } - /* We only know how to handle version 1.1. */ - if (sle16_to_cpu(rp->major_ver) != 1 || - sle16_to_cpu(rp->minor_ver) != 1) { - ntfs_error(vi->i_sb, "$LogFile version %i.%i is not " - "supported. (This driver supports version " - "1.1 only.)", (int)sle16_to_cpu(rp->major_ver), - (int)sle16_to_cpu(rp->minor_ver)); - return false; - } - /* - * If chkdsk has been run the restart page may not be protected by an - * update sequence array. - */ - if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) { - have_usa = false; - goto skip_usa_checks; - } - /* Verify the size of the update sequence array. */ - usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS); - if (usa_count != le16_to_cpu(rp->usa_count)) { - ntfs_error(vi->i_sb, "$LogFile restart page specifies " - "inconsistent update sequence array count."); - return false; - } - /* Verify the position of the update sequence array. */ - usa_ofs = le16_to_cpu(rp->usa_ofs); - usa_end = usa_ofs + usa_count * sizeof(u16); - if (usa_ofs < sizeof(RESTART_PAGE_HEADER) || - usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "$LogFile restart page specifies " - "inconsistent update sequence array offset."); - return false; - } -skip_usa_checks: - /* - * Verify the position of the restart area. It must be: - * - aligned to 8-byte boundary, - * - after the update sequence array, and - * - within the system page size. - */ - ra_ofs = le16_to_cpu(rp->restart_area_offset); - if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end : - ra_ofs < sizeof(RESTART_PAGE_HEADER)) || - ra_ofs > logfile_system_page_size) { - ntfs_error(vi->i_sb, "$LogFile restart page specifies " - "inconsistent restart area offset."); - return false; - } - /* - * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn - * set. 
- */ - if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) { - ntfs_error(vi->i_sb, "$LogFile restart page is not modified " - "by chkdsk but a chkdsk LSN is specified."); - return false; - } - ntfs_debug("Done."); - return true; -} - -/** - * ntfs_check_restart_area - check the restart area for consistency - * @vi: $LogFile inode to which the restart page belongs - * @rp: restart page whose restart area to check - * - * Check the restart area of the restart page @rp for consistency and return - * 'true' if it is consistent and 'false' otherwise. - * - * This function assumes that the restart page header has already been - * consistency checked. - * - * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not - * require the full restart page. - */ -static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) -{ - u64 file_size; - RESTART_AREA *ra; - u16 ra_ofs, ra_len, ca_ofs; - u8 fs_bits; - - ntfs_debug("Entering."); - ra_ofs = le16_to_cpu(rp->restart_area_offset); - ra = (RESTART_AREA*)((u8*)rp + ra_ofs); - /* - * Everything before ra->file_size must be before the first word - * protected by an update sequence number. This ensures that it is - * safe to access ra->client_array_offset. - */ - if (ra_ofs + offsetof(RESTART_AREA, file_size) > - NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent file offset."); - return false; - } - /* - * Now that we can access ra->client_array_offset, make sure everything - * up to the log client array is before the first word protected by an - * update sequence number. This ensures we can access all of the - * restart area elements safely. Also, the client array offset must be - * aligned to an 8-byte boundary. 
- */ - ca_ofs = le16_to_cpu(ra->client_array_offset); - if (((ca_ofs + 7) & ~7) != ca_ofs || - ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent client array offset."); - return false; - } - /* - * The restart area must end within the system page size both when - * calculated manually and as specified by ra->restart_area_length. - * Also, the calculated length must not exceed the specified length. - */ - ra_len = ca_ofs + le16_to_cpu(ra->log_clients) * - sizeof(LOG_CLIENT_RECORD); - if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) || - ra_ofs + le16_to_cpu(ra->restart_area_length) > - le32_to_cpu(rp->system_page_size) || - ra_len > le16_to_cpu(ra->restart_area_length)) { - ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds " - "of the system page size specified by the " - "restart page header and/or the specified " - "restart area length is inconsistent."); - return false; - } - /* - * The ra->client_free_list and ra->client_in_use_list must be either - * LOGFILE_NO_CLIENT or less than ra->log_clients or they are - * overflowing the client array. - */ - if ((ra->client_free_list != LOGFILE_NO_CLIENT && - le16_to_cpu(ra->client_free_list) >= - le16_to_cpu(ra->log_clients)) || - (ra->client_in_use_list != LOGFILE_NO_CLIENT && - le16_to_cpu(ra->client_in_use_list) >= - le16_to_cpu(ra->log_clients))) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "overflowing client free and/or in use lists."); - return false; - } - /* - * Check ra->seq_number_bits against ra->file_size for consistency. - * We cannot just use ffs() because the file size is not a power of 2. 
- */ - file_size = (u64)sle64_to_cpu(ra->file_size); - fs_bits = 0; - while (file_size) { - file_size >>= 1; - fs_bits++; - } - if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent sequence number bits."); - return false; - } - /* The log record header length must be a multiple of 8. */ - if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) != - le16_to_cpu(ra->log_record_header_length)) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent log record header length."); - return false; - } - /* Dito for the log page data offset. */ - if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) != - le16_to_cpu(ra->log_page_data_offset)) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent log page data offset."); - return false; - } - ntfs_debug("Done."); - return true; -} - -/** - * ntfs_check_log_client_array - check the log client array for consistency - * @vi: $LogFile inode to which the restart page belongs - * @rp: restart page whose log client array to check - * - * Check the log client array of the restart page @rp for consistency and - * return 'true' if it is consistent and 'false' otherwise. - * - * This function assumes that the restart page header and the restart area have - * already been consistency checked. - * - * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this - * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full - * restart page and the page must be multi sector transfer deprotected. 
- */ -static bool ntfs_check_log_client_array(struct inode *vi, - RESTART_PAGE_HEADER *rp) -{ - RESTART_AREA *ra; - LOG_CLIENT_RECORD *ca, *cr; - u16 nr_clients, idx; - bool in_free_list, idx_is_first; - - ntfs_debug("Entering."); - ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); - ca = (LOG_CLIENT_RECORD*)((u8*)ra + - le16_to_cpu(ra->client_array_offset)); - /* - * Check the ra->client_free_list first and then check the - * ra->client_in_use_list. Check each of the log client records in - * each of the lists and check that the array does not overflow the - * ra->log_clients value. Also keep track of the number of records - * visited as there cannot be more than ra->log_clients records and - * that way we detect eventual loops in within a list. - */ - nr_clients = le16_to_cpu(ra->log_clients); - idx = le16_to_cpu(ra->client_free_list); - in_free_list = true; -check_list: - for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--, - idx = le16_to_cpu(cr->next_client)) { - if (!nr_clients || idx >= le16_to_cpu(ra->log_clients)) - goto err_out; - /* Set @cr to the current log client record. */ - cr = ca + idx; - /* The first log client record must not have a prev_client. */ - if (idx_is_first) { - if (cr->prev_client != LOGFILE_NO_CLIENT) - goto err_out; - idx_is_first = false; - } - } - /* Switch to and check the in use list if we just did the free list. 
*/ - if (in_free_list) { - in_free_list = false; - idx = le16_to_cpu(ra->client_in_use_list); - goto check_list; - } - ntfs_debug("Done."); - return true; -err_out: - ntfs_error(vi->i_sb, "$LogFile log client array is corrupt."); - return false; -} - -/** - * ntfs_check_and_load_restart_page - check the restart page for consistency - * @vi: $LogFile inode to which the restart page belongs - * @rp: restart page to check - * @pos: position in @vi at which the restart page resides - * @wrp: [OUT] copy of the multi sector transfer deprotected restart page - * @lsn: [OUT] set to the current logfile lsn on success - * - * Check the restart page @rp for consistency and return 0 if it is consistent - * and -errno otherwise. The restart page may have been modified by chkdsk in - * which case its magic is CHKD instead of RSTR. - * - * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not - * require the full restart page. - * - * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a - * copy of the complete multi sector transfer deprotected page. On failure, - * *@wrp is undefined. - * - * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current - * logfile lsn according to this restart page. On failure, *@lsn is undefined. - * - * The following error codes are defined: - * -EINVAL - The restart page is inconsistent. - * -ENOMEM - Not enough memory to load the restart page. - * -EIO - Failed to reading from $LogFile. - */ -static int ntfs_check_and_load_restart_page(struct inode *vi, - RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp, - LSN *lsn) -{ - RESTART_AREA *ra; - RESTART_PAGE_HEADER *trp; - int size, err; - - ntfs_debug("Entering."); - /* Check the restart page header for consistency. */ - if (!ntfs_check_restart_page_header(vi, rp, pos)) { - /* Error output already done inside the function. */ - return -EINVAL; - } - /* Check the restart area for consistency. 
*/ - if (!ntfs_check_restart_area(vi, rp)) { - /* Error output already done inside the function. */ - return -EINVAL; - } - ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); - /* - * Allocate a buffer to store the whole restart page so we can multi - * sector transfer deprotect it. - */ - trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size)); - if (!trp) { - ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile " - "restart page buffer."); - return -ENOMEM; - } - /* - * Read the whole of the restart page into the buffer. If it fits - * completely inside @rp, just copy it from there. Otherwise map all - * the required pages and copy the data from them. - */ - size = PAGE_SIZE - (pos & ~PAGE_MASK); - if (size >= le32_to_cpu(rp->system_page_size)) { - memcpy(trp, rp, le32_to_cpu(rp->system_page_size)); - } else { - pgoff_t idx; - struct page *page; - int have_read, to_read; - - /* First copy what we already have in @rp. */ - memcpy(trp, rp, size); - /* Copy the remaining data one page at a time. */ - have_read = size; - to_read = le32_to_cpu(rp->system_page_size) - size; - idx = (pos + size) >> PAGE_SHIFT; - BUG_ON((pos + size) & ~PAGE_MASK); - do { - page = ntfs_map_page(vi->i_mapping, idx); - if (IS_ERR(page)) { - ntfs_error(vi->i_sb, "Error mapping $LogFile " - "page (index %lu).", idx); - err = PTR_ERR(page); - if (err != -EIO && err != -ENOMEM) - err = -EIO; - goto err_out; - } - size = min_t(int, to_read, PAGE_SIZE); - memcpy((u8*)trp + have_read, page_address(page), size); - ntfs_unmap_page(page); - have_read += size; - to_read -= size; - idx++; - } while (to_read > 0); - } - /* - * Perform the multi sector transfer deprotection on the buffer if the - * restart page is protected. - */ - if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count)) - && post_read_mst_fixup((NTFS_RECORD*)trp, - le32_to_cpu(rp->system_page_size))) { - /* - * A multi sector tranfer error was detected. 
We only need to - * abort if the restart page contents exceed the multi sector - * transfer fixup of the first sector. - */ - if (le16_to_cpu(rp->restart_area_offset) + - le16_to_cpu(ra->restart_area_length) > - NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "Multi sector transfer error " - "detected in $LogFile restart page."); - err = -EINVAL; - goto err_out; - } - } - /* - * If the restart page is modified by chkdsk or there are no active - * logfile clients, the logfile is consistent. Otherwise, need to - * check the log client records for consistency, too. - */ - err = 0; - if (ntfs_is_rstr_record(rp->magic) && - ra->client_in_use_list != LOGFILE_NO_CLIENT) { - if (!ntfs_check_log_client_array(vi, trp)) { - err = -EINVAL; - goto err_out; - } - } - if (lsn) { - if (ntfs_is_rstr_record(rp->magic)) - *lsn = sle64_to_cpu(ra->current_lsn); - else /* if (ntfs_is_chkd_record(rp->magic)) */ - *lsn = sle64_to_cpu(rp->chkdsk_lsn); - } - ntfs_debug("Done."); - if (wrp) - *wrp = trp; - else { -err_out: - ntfs_free(trp); - } - return err; -} - -/** - * ntfs_check_logfile - check the journal for consistency - * @log_vi: struct inode of loaded journal $LogFile to check - * @rp: [OUT] on success this is a copy of the current restart page - * - * Check the $LogFile journal for consistency and return 'true' if it is - * consistent and 'false' if not. On success, the current restart page is - * returned in *@rp. Caller must call ntfs_free(*@rp) when finished with it. - * - * At present we only check the two restart pages and ignore the log record - * pages. - * - * Note that the MstProtected flag is not set on the $LogFile inode and hence - * when reading pages they are not deprotected. This is because we do not know - * if the $LogFile was created on a system with a different page size to ours - * yet and mst deprotection would fail if our page size is smaller. 
- */ -bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) -{ - s64 size, pos; - LSN rstr1_lsn, rstr2_lsn; - ntfs_volume *vol = NTFS_SB(log_vi->i_sb); - struct address_space *mapping = log_vi->i_mapping; - struct page *page = NULL; - u8 *kaddr = NULL; - RESTART_PAGE_HEADER *rstr1_ph = NULL; - RESTART_PAGE_HEADER *rstr2_ph = NULL; - int log_page_size, err; - bool logfile_is_empty = true; - u8 log_page_bits; - - ntfs_debug("Entering."); - /* An empty $LogFile must have been clean before it got emptied. */ - if (NVolLogFileEmpty(vol)) - goto is_empty; - size = i_size_read(log_vi); - /* Make sure the file doesn't exceed the maximum allowed size. */ - if (size > MaxLogFileSize) - size = MaxLogFileSize; - /* - * Truncate size to a multiple of the page cache size or the default - * log page size if the page cache size is between the default log page - * log page size if the page cache size is between the default log page - * size and twice that. - */ - if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= - DefaultLogPageSize * 2) - log_page_size = DefaultLogPageSize; - else - log_page_size = PAGE_SIZE; - /* - * Use ntfs_ffs() instead of ffs() to enable the compiler to - * optimize log_page_size and log_page_bits into constants. - */ - log_page_bits = ntfs_ffs(log_page_size) - 1; - size &= ~(s64)(log_page_size - 1); - /* - * Ensure the log file is big enough to store at least the two restart - * pages and the minimum number of log record pages. - */ - if (size < log_page_size * 2 || (size - log_page_size * 2) >> - log_page_bits < MinLogRecordPages) { - ntfs_error(vol->sb, "$LogFile is too small."); - return false; - } - /* - * Read through the file looking for a restart page. Since the restart - * page header is at the beginning of a page we only need to search at - * what could be the beginning of a page (for each page size) rather - * than scanning the whole file byte by byte. 
If all potential places - * contain empty and uninitialzed records, the log file can be assumed - * to be empty. - */ - for (pos = 0; pos < size; pos <<= 1) { - pgoff_t idx = pos >> PAGE_SHIFT; - if (!page || page->index != idx) { - if (page) - ntfs_unmap_page(page); - page = ntfs_map_page(mapping, idx); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Error mapping $LogFile " - "page (index %lu).", idx); - goto err_out; - } - } - kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK); - /* - * A non-empty block means the logfile is not empty while an - * empty block after a non-empty block has been encountered - * means we are done. - */ - if (!ntfs_is_empty_recordp((le32*)kaddr)) - logfile_is_empty = false; - else if (!logfile_is_empty) - break; - /* - * A log record page means there cannot be a restart page after - * this so no need to continue searching. - */ - if (ntfs_is_rcrd_recordp((le32*)kaddr)) - break; - /* If not a (modified by chkdsk) restart page, continue. */ - if (!ntfs_is_rstr_recordp((le32*)kaddr) && - !ntfs_is_chkd_recordp((le32*)kaddr)) { - if (!pos) - pos = NTFS_BLOCK_SIZE >> 1; - continue; - } - /* - * Check the (modified by chkdsk) restart page for consistency - * and get a copy of the complete multi sector transfer - * deprotected restart page. - */ - err = ntfs_check_and_load_restart_page(log_vi, - (RESTART_PAGE_HEADER*)kaddr, pos, - !rstr1_ph ? &rstr1_ph : &rstr2_ph, - !rstr1_ph ? &rstr1_lsn : &rstr2_lsn); - if (!err) { - /* - * If we have now found the first (modified by chkdsk) - * restart page, continue looking for the second one. - */ - if (!pos) { - pos = NTFS_BLOCK_SIZE >> 1; - continue; - } - /* - * We have now found the second (modified by chkdsk) - * restart page, so we can stop looking. - */ - break; - } - /* - * Error output already done inside the function. Note, we do - * not abort if the restart page was invalid as we might still - * find a valid one further in the file. 
- */ - if (err != -EINVAL) { - ntfs_unmap_page(page); - goto err_out; - } - /* Continue looking. */ - if (!pos) - pos = NTFS_BLOCK_SIZE >> 1; - } - if (page) - ntfs_unmap_page(page); - if (logfile_is_empty) { - NVolSetLogFileEmpty(vol); -is_empty: - ntfs_debug("Done. ($LogFile is empty.)"); - return true; - } - if (!rstr1_ph) { - BUG_ON(rstr2_ph); - ntfs_error(vol->sb, "Did not find any restart pages in " - "$LogFile and it was not empty."); - return false; - } - /* If both restart pages were found, use the more recent one. */ - if (rstr2_ph) { - /* - * If the second restart area is more recent, switch to it. - * Otherwise just throw it away. - */ - if (rstr2_lsn > rstr1_lsn) { - ntfs_debug("Using second restart page as it is more " - "recent."); - ntfs_free(rstr1_ph); - rstr1_ph = rstr2_ph; - /* rstr1_lsn = rstr2_lsn; */ - } else { - ntfs_debug("Using first restart page as it is more " - "recent."); - ntfs_free(rstr2_ph); - } - rstr2_ph = NULL; - } - /* All consistency checks passed. */ - if (rp) - *rp = rstr1_ph; - else - ntfs_free(rstr1_ph); - ntfs_debug("Done."); - return true; -err_out: - if (rstr1_ph) - ntfs_free(rstr1_ph); - return false; -} - -/** - * ntfs_is_logfile_clean - check in the journal if the volume is clean - * @log_vi: struct inode of loaded journal $LogFile to check - * @rp: copy of the current restart page - * - * Analyze the $LogFile journal and return 'true' if it indicates the volume was - * shutdown cleanly and 'false' if not. - * - * At present we only look at the two restart pages and ignore the log record - * pages. This is a little bit crude in that there will be a very small number - * of cases where we think that a volume is dirty when in fact it is clean. - * This should only affect volumes that have not been shutdown cleanly but did - * not have any pending, non-check-pointed i/o, i.e. they were completely idle - * at least for the five seconds preceding the unclean shutdown. 
- * - * This function assumes that the $LogFile journal has already been consistency - * checked by a call to ntfs_check_logfile() and in particular if the $LogFile - * is empty this function requires that NVolLogFileEmpty() is true otherwise an - * empty volume will be reported as dirty. - */ -bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp) -{ - ntfs_volume *vol = NTFS_SB(log_vi->i_sb); - RESTART_AREA *ra; - - ntfs_debug("Entering."); - /* An empty $LogFile must have been clean before it got emptied. */ - if (NVolLogFileEmpty(vol)) { - ntfs_debug("Done. ($LogFile is empty.)"); - return true; - } - BUG_ON(!rp); - if (!ntfs_is_rstr_record(rp->magic) && - !ntfs_is_chkd_record(rp->magic)) { - ntfs_error(vol->sb, "Restart page buffer is invalid. This is " - "probably a bug in that the $LogFile should " - "have been consistency checked before calling " - "this function."); - return false; - } - ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); - /* - * If the $LogFile has active clients, i.e. it is open, and we do not - * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags, - * we assume there was an unclean shutdown. - */ - if (ra->client_in_use_list != LOGFILE_NO_CLIENT && - !(ra->flags & RESTART_VOLUME_IS_CLEAN)) { - ntfs_debug("Done. $LogFile indicates a dirty shutdown."); - return false; - } - /* $LogFile indicates a clean shutdown. */ - ntfs_debug("Done. $LogFile indicates a clean shutdown."); - return true; -} - -/** - * ntfs_empty_logfile - empty the contents of the $LogFile journal - * @log_vi: struct inode of loaded journal $LogFile to empty - * - * Empty the contents of the $LogFile journal @log_vi and return 'true' on - * success and 'false' on error. - * - * This function assumes that the $LogFile journal has already been consistency - * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean() - * has been used to ensure that the $LogFile is clean. 
- */ -bool ntfs_empty_logfile(struct inode *log_vi) -{ - VCN vcn, end_vcn; - ntfs_inode *log_ni = NTFS_I(log_vi); - ntfs_volume *vol = log_ni->vol; - struct super_block *sb = vol->sb; - runlist_element *rl; - unsigned long flags; - unsigned block_size, block_size_bits; - int err; - bool should_wait = true; - - ntfs_debug("Entering."); - if (NVolLogFileEmpty(vol)) { - ntfs_debug("Done."); - return true; - } - /* - * We cannot use ntfs_attr_set() because we may be still in the middle - * of a mount operation. Thus we do the emptying by hand by first - * zapping the page cache pages for the $LogFile/$DATA attribute and - * then emptying each of the buffers in each of the clusters specified - * by the runlist by hand. - */ - block_size = sb->s_blocksize; - block_size_bits = sb->s_blocksize_bits; - vcn = 0; - read_lock_irqsave(&log_ni->size_lock, flags); - end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >> - vol->cluster_size_bits; - read_unlock_irqrestore(&log_ni->size_lock, flags); - truncate_inode_pages(log_vi->i_mapping, 0); - down_write(&log_ni->runlist.lock); - rl = log_ni->runlist.rl; - if (unlikely(!rl || vcn < rl->vcn || !rl->length)) { -map_vcn: - err = ntfs_map_runlist_nolock(log_ni, vcn, NULL); - if (err) { - ntfs_error(sb, "Failed to map runlist fragment (error " - "%d).", -err); - goto err; - } - rl = log_ni->runlist.rl; - BUG_ON(!rl || vcn < rl->vcn || !rl->length); - } - /* Seek to the runlist element containing @vcn. */ - while (rl->length && vcn >= rl[1].vcn) - rl++; - do { - LCN lcn; - sector_t block, end_block; - s64 len; - - /* - * If this run is not mapped map it now and start again as the - * runlist will have been updated. - */ - lcn = rl->lcn; - if (unlikely(lcn == LCN_RL_NOT_MAPPED)) { - vcn = rl->vcn; - goto map_vcn; - } - /* If this run is not valid abort with an error. */ - if (unlikely(!rl->length || lcn < LCN_HOLE)) - goto rl_err; - /* Skip holes. 
*/ - if (lcn == LCN_HOLE) - continue; - block = lcn << vol->cluster_size_bits >> block_size_bits; - len = rl->length; - if (rl[1].vcn > end_vcn) - len = end_vcn - rl->vcn; - end_block = (lcn + len) << vol->cluster_size_bits >> - block_size_bits; - /* Iterate over the blocks in the run and empty them. */ - do { - struct buffer_head *bh; - - /* Obtain the buffer, possibly not uptodate. */ - bh = sb_getblk(sb, block); - BUG_ON(!bh); - /* Setup buffer i/o submission. */ - lock_buffer(bh); - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - /* Set the entire contents of the buffer to 0xff. */ - memset(bh->b_data, -1, block_size); - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - if (buffer_dirty(bh)) - clear_buffer_dirty(bh); - /* - * Submit the buffer and wait for i/o to complete but - * only for the first buffer so we do not miss really - * serious i/o errors. Once the first buffer has - * completed ignore errors afterwards as we can assume - * that if one buffer worked all of them will work. - */ - submit_bh(REQ_OP_WRITE, bh); - if (should_wait) { - should_wait = false; - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - goto io_err; - } - brelse(bh); - } while (++block < end_block); - } while ((++rl)->vcn < end_vcn); - up_write(&log_ni->runlist.lock); - /* - * Zap the pages again just in case any got instantiated whilst we were - * emptying the blocks by hand. FIXME: We may not have completed - * writing to all the buffer heads yet so this may happen too early. - * We really should use a kernel thread to do the emptying - * asynchronously and then we can also set the volume dirty and output - * an error message if emptying should fail. - */ - truncate_inode_pages(log_vi->i_mapping, 0); - /* Set the flag so we do not have to do it again on remount. */ - NVolSetLogFileEmpty(vol); - ntfs_debug("Done."); - return true; -io_err: - ntfs_error(sb, "Failed to write buffer. 
Unmount and run chkdsk."); - goto dirty_err; -rl_err: - ntfs_error(sb, "Runlist is corrupt. Unmount and run chkdsk."); -dirty_err: - NVolSetErrors(vol); - err = -EIO; -err: - up_write(&log_ni->runlist.lock); - ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).", - -err); - return false; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h deleted file mode 100644 index 429d4909cc72..000000000000 --- a/fs/ntfs/logfile.h +++ /dev/null @@ -1,295 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * logfile.h - Defines for NTFS kernel journal ($LogFile) handling. Part of - * the Linux-NTFS project. - * - * Copyright (c) 2000-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_LOGFILE_H -#define _LINUX_NTFS_LOGFILE_H - -#ifdef NTFS_RW - -#include <linux/fs.h> - -#include "types.h" -#include "endian.h" -#include "layout.h" - -/* - * Journal ($LogFile) organization: - * - * Two restart areas present in the first two pages (restart pages, one restart - * area in each page). When the volume is dismounted they should be identical, - * except for the update sequence array which usually has a different update - * sequence number. - * - * These are followed by log records organized in pages headed by a log record - * header going up to log file size. Not all pages contain log records when a - * volume is first formatted, but as the volume ages, all records will be used. - * When the log file fills up, the records at the beginning are purged (by - * modifying the oldest_lsn to a higher value presumably) and writing begins - * at the beginning of the file. Effectively, the log file is viewed as a - * circular entity. - * - * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept - * versions <= 1.x, including 0.-1. (Yes, that is a minus one in there!) We - * probably only want to support 1.1 as this seems to be the current version - * and we don't know how that differs from the older versions. 
The only - * exception is if the journal is clean as marked by the two restart pages - * then it doesn't matter whether we are on an earlier version. We can just - * reinitialize the logfile and start again with version 1.1. - */ - -/* Some $LogFile related constants. */ -#define MaxLogFileSize 0x100000000ULL -#define DefaultLogPageSize 4096 -#define MinLogRecordPages 48 - -/* - * Log file restart page header (begins the restart area). - */ -typedef struct { -/*Ofs*/ -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ -/* 0*/ NTFS_RECORD_TYPE magic; /* The magic is "RSTR". */ -/* 4*/ le16 usa_ofs; /* See NTFS_RECORD definition in layout.h. - When creating, set this to be immediately - after this header structure (without any - alignment). */ -/* 6*/ le16 usa_count; /* See NTFS_RECORD definition in layout.h. */ - -/* 8*/ leLSN chkdsk_lsn; /* The last log file sequence number found by - chkdsk. Only used when the magic is changed - to "CHKD". Otherwise this is zero. */ -/* 16*/ le32 system_page_size; /* Byte size of system pages when the log file - was created, has to be >= 512 and a power of - 2. Use this to calculate the required size - of the usa (usa_count) and add it to usa_ofs. - Then verify that the result is less than the - value of the restart_area_offset. */ -/* 20*/ le32 log_page_size; /* Byte size of log file pages, has to be >= - 512 and a power of 2. The default is 4096 - and is used when the system page size is - between 4096 and 8192. Otherwise this is - set to the system page size instead. */ -/* 24*/ le16 restart_area_offset;/* Byte offset from the start of this header to - the RESTART_AREA. Value has to be aligned - to 8-byte boundary. When creating, set this - to be after the usa. */ -/* 26*/ sle16 minor_ver; /* Log file minor version. Only check if major - version is 1. */ -/* 28*/ sle16 major_ver; /* Log file major version. We only support - version 1.1. 
*/ -/* sizeof() = 30 (0x1e) bytes */ -} __attribute__ ((__packed__)) RESTART_PAGE_HEADER; - -/* - * Constant for the log client indices meaning that there are no client records - * in this particular client array. Also inside the client records themselves, - * this means that there are no client records preceding or following this one. - */ -#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff) -#define LOGFILE_NO_CLIENT_CPU 0xffff - -/* - * These are the so far known RESTART_AREA_* flags (16-bit) which contain - * information about the log file in which they are present. - */ -enum { - RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002), - RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ -} __attribute__ ((__packed__)); - -typedef le16 RESTART_AREA_FLAGS; - -/* - * Log file restart area record. The offset of this record is found by adding - * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found - * in it. See notes at restart_area_offset above. - */ -typedef struct { -/*Ofs*/ -/* 0*/ leLSN current_lsn; /* The current, i.e. last LSN inside the log - when the restart area was last written. - This happens often but what is the interval? - Is it just fixed time or is it every time a - check point is written or somethine else? - On create set to 0. */ -/* 8*/ le16 log_clients; /* Number of log client records in the array of - log client records which follows this - restart area. Must be 1. */ -/* 10*/ le16 client_free_list; /* The index of the first free log client record - in the array of log client records. - LOGFILE_NO_CLIENT means that there are no - free log client records in the array. - If != LOGFILE_NO_CLIENT, check that - log_clients > client_free_list. On Win2k - and presumably earlier, on a clean volume - this is != LOGFILE_NO_CLIENT, and it should - be 0, i.e. the first (and only) client - record is free and thus the logfile is - closed and hence clean. 
A dirty volume - would have left the logfile open and hence - this would be LOGFILE_NO_CLIENT. On WinXP - and presumably later, the logfile is always - open, even on clean shutdown so this should - always be LOGFILE_NO_CLIENT. */ -/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client - record in the array of log client records. - LOGFILE_NO_CLIENT means that there are no - in-use log client records in the array. If - != LOGFILE_NO_CLIENT check that log_clients - > client_in_use_list. On Win2k and - presumably earlier, on a clean volume this - is LOGFILE_NO_CLIENT, i.e. there are no - client records in use and thus the logfile - is closed and hence clean. A dirty volume - would have left the logfile open and hence - this would be != LOGFILE_NO_CLIENT, and it - should be 0, i.e. the first (and only) - client record is in use. On WinXP and - presumably later, the logfile is always - open, even on clean shutdown so this should - always be 0. */ -/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour. On Win2k - and presumably earlier this is always 0. On - WinXP and presumably later, if the logfile - was shutdown cleanly, the second bit, - RESTART_VOLUME_IS_CLEAN, is set. This bit - is cleared when the volume is mounted by - WinXP and set when the volume is dismounted, - thus if the logfile is dirty, this bit is - clear. Thus we don't need to check the - Windows version to determine if the logfile - is clean. Instead if the logfile is closed, - we know it must be clean. If it is open and - this bit is set, we also know it must be - clean. If on the other hand the logfile is - open and this bit is clear, we can be almost - certain that the logfile is dirty. */ -/* 16*/ le32 seq_number_bits; /* How many bits to use for the sequence - number. This is calculated as 67 - the - number of bits required to store the logfile - size in bytes and this can be used in with - the specified file_size as a consistency - check. 
*/ -/* 20*/ le16 restart_area_length;/* Length of the restart area including the - client array. Following checks required if - version matches. Otherwise, skip them. - restart_area_offset + restart_area_length - has to be <= system_page_size. Also, - restart_area_length has to be >= - client_array_offset + (log_clients * - sizeof(log client record)). */ -/* 22*/ le16 client_array_offset;/* Offset from the start of this record to - the first log client record if versions are - matched. When creating, set this to be - after this restart area structure, aligned - to 8-bytes boundary. If the versions do not - match, this is ignored and the offset is - assumed to be (sizeof(RESTART_AREA) + 7) & - ~7, i.e. rounded up to first 8-byte - boundary. Either way, client_array_offset - has to be aligned to an 8-byte boundary. - Also, restart_area_offset + - client_array_offset has to be <= 510. - Finally, client_array_offset + (log_clients - * sizeof(log client record)) has to be <= - system_page_size. On Win2k and presumably - earlier, this is 0x30, i.e. immediately - following this record. On WinXP and - presumably later, this is 0x40, i.e. there - are 16 extra bytes between this record and - the client array. This probably means that - the RESTART_AREA record is actually bigger - in WinXP and later. */ -/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the - restart_area_offset + the offset of the - file_size are > 510 then corruption has - occurred. This is the very first check when - starting with the restart_area as if it - fails it means that some of the above values - will be corrupted by the multi sector - transfer protection. The file_size has to - be rounded down to be a multiple of the - log_page_size in the RESTART_PAGE_HEADER and - then it has to be at least big enough to - store the two restart pages and 48 (0x30) - log record pages. */ -/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including - the log record header. 
On create set to - 0. */ -/* 36*/ le16 log_record_header_length;/* Byte size of the log record header. - If the version matches then check that the - value of log_record_header_length is a - multiple of 8, i.e. - (log_record_header_length + 7) & ~7 == - log_record_header_length. When creating set - it to sizeof(LOG_RECORD_HEADER), aligned to - 8 bytes. */ -/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record - page. Must be a multiple of 8. On create - set it to immediately after the update - sequence array of the log record page. */ -/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every - time the logfile is restarted which happens - at mount time when the logfile is opened. - When creating set to a random value. Win2k - sets it to the low 32 bits of the current - system time in NTFS format (see time.h). */ -/* 44*/ le32 reserved; /* Reserved/alignment to 8-byte boundary. */ -/* sizeof() = 48 (0x30) bytes */ -} __attribute__ ((__packed__)) RESTART_AREA; - -/* - * Log client record. The offset of this record is found by adding the offset - * of the RESTART_AREA to the client_array_offset value found in it. - */ -typedef struct { -/*Ofs*/ -/* 0*/ leLSN oldest_lsn; /* Oldest LSN needed by this client. On create - set to 0. */ -/* 8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart - the volume, i.e. the current position within - the log file. At present, if clean this - should = current_lsn in restart area but it - probably also = current_lsn when dirty most - of the time. At create set to 0. */ -/* 16*/ le16 prev_client; /* The offset to the previous log client record - in the array of log client records. - LOGFILE_NO_CLIENT means there is no previous - client record, i.e. this is the first one. - This is always LOGFILE_NO_CLIENT. */ -/* 18*/ le16 next_client; /* The offset to the next log client record in - the array of log client records. 
- LOGFILE_NO_CLIENT means there are no next - client records, i.e. this is the last one. - This is always LOGFILE_NO_CLIENT. */ -/* 20*/ le16 seq_number; /* On Win2k and presumably earlier, this is set - to zero every time the logfile is restarted - and it is incremented when the logfile is - closed at dismount time. Thus it is 0 when - dirty and 1 when clean. On WinXP and - presumably later, this is always 0. */ -/* 22*/ u8 reserved[6]; /* Reserved/alignment. */ -/* 28*/ le32 client_name_length;/* Length of client name in bytes. Should - always be 8. */ -/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode. Should - always be "NTFS" with the remaining bytes - set to 0. */ -/* sizeof() = 160 (0xa0) bytes */ -} __attribute__ ((__packed__)) LOG_CLIENT_RECORD; - -extern bool ntfs_check_logfile(struct inode *log_vi, - RESTART_PAGE_HEADER **rp); - -extern bool ntfs_is_logfile_clean(struct inode *log_vi, - const RESTART_PAGE_HEADER *rp); - -extern bool ntfs_empty_logfile(struct inode *log_vi); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_LOGFILE_H */ diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h deleted file mode 100644 index 7068425735f1..000000000000 --- a/fs/ntfs/malloc.h +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_MALLOC_H -#define _LINUX_NTFS_MALLOC_H - -#include <linux/vmalloc.h> -#include <linux/slab.h> -#include <linux/highmem.h> - -/** - * __ntfs_malloc - allocate memory in multiples of pages - * @size: number of bytes to allocate - * @gfp_mask: extra flags for the allocator - * - * Internal function. You probably want ntfs_malloc_nofs()... - * - * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and - * returns a pointer to the allocated memory. - * - * If there was insufficient memory to complete the request, return NULL. 
- * Depending on @gfp_mask the allocation may be guaranteed to succeed. - */ -static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) -{ - if (likely(size <= PAGE_SIZE)) { - BUG_ON(!size); - /* kmalloc() has per-CPU caches so is faster for now. */ - return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); - /* return (void *)__get_free_page(gfp_mask); */ - } - if (likely((size >> PAGE_SHIFT) < totalram_pages())) - return __vmalloc(size, gfp_mask); - return NULL; -} - -/** - * ntfs_malloc_nofs - allocate memory in multiples of pages - * @size: number of bytes to allocate - * - * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and - * returns a pointer to the allocated memory. - * - * If there was insufficient memory to complete the request, return NULL. - */ -static inline void *ntfs_malloc_nofs(unsigned long size) -{ - return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM); -} - -/** - * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages - * @size: number of bytes to allocate - * - * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and - * returns a pointer to the allocated memory. - * - * This function guarantees that the allocation will succeed. It will sleep - * for as long as it takes to complete the allocation. - * - * If there was insufficient memory to complete the request, return NULL. - */ -static inline void *ntfs_malloc_nofs_nofail(unsigned long size) -{ - return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL); -} - -static inline void ntfs_free(void *addr) -{ - kvfree(addr); -} - -#endif /* _LINUX_NTFS_MALLOC_H */ diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c deleted file mode 100644 index 6fd1dc4b08c8..000000000000 --- a/fs/ntfs/mft.c +++ /dev/null @@ -1,2907 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. 
- * Copyright (c) 2002 Richard Russon - */ - -#include <linux/buffer_head.h> -#include <linux/slab.h> -#include <linux/swap.h> -#include <linux/bio.h> - -#include "attrib.h" -#include "aops.h" -#include "bitmap.h" -#include "debug.h" -#include "dir.h" -#include "lcnalloc.h" -#include "malloc.h" -#include "mft.h" -#include "ntfs.h" - -#define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE) - -/** - * map_mft_record_page - map the page in which a specific mft record resides - * @ni: ntfs inode whose mft record page to map - * - * This maps the page in which the mft record of the ntfs inode @ni is situated - * and returns a pointer to the mft record within the mapped page. - * - * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR() - * contains the negative error code returned. - */ -static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) -{ - loff_t i_size; - ntfs_volume *vol = ni->vol; - struct inode *mft_vi = vol->mft_ino; - struct page *page; - unsigned long index, end_index; - unsigned ofs; - - BUG_ON(ni->page); - /* - * The index into the page cache and the offset within the page cache - * page of the wanted mft record. FIXME: We need to check for - * overflowing the unsigned long, but I don't think we would ever get - * here if the volume was that big... - */ - index = (u64)ni->mft_no << vol->mft_record_size_bits >> - PAGE_SHIFT; - ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; - - i_size = i_size_read(mft_vi); - /* The maximum valid index into the page cache for $MFT's data. */ - end_index = i_size >> PAGE_SHIFT; - - /* If the wanted index is out of bounds the mft record doesn't exist. */ - if (unlikely(index >= end_index)) { - if (index > end_index || (i_size & ~PAGE_MASK) < ofs + - vol->mft_record_size) { - page = ERR_PTR(-ENOENT); - ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, " - "which is beyond the end of the mft. 
" - "This is probably a bug in the ntfs " - "driver.", ni->mft_no); - goto err_out; - } - } - /* Read, map, and pin the page. */ - page = ntfs_map_page(mft_vi->i_mapping, index); - if (!IS_ERR(page)) { - /* Catch multi sector transfer fixup errors. */ - if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + - ofs)))) { - ni->page = page; - ni->page_ofs = ofs; - return page_address(page) + ofs; - } - ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. " - "Run chkdsk.", ni->mft_no); - ntfs_unmap_page(page); - page = ERR_PTR(-EIO); - NVolSetErrors(vol); - } -err_out: - ni->page = NULL; - ni->page_ofs = 0; - return (void*)page; -} - -/** - * map_mft_record - map, pin and lock an mft record - * @ni: ntfs inode whose MFT record to map - * - * First, take the mrec_lock mutex. We might now be sleeping, while waiting - * for the mutex if it was already locked by someone else. - * - * The page of the record is mapped using map_mft_record_page() before being - * returned to the caller. - * - * This in turn uses ntfs_map_page() to get the page containing the wanted mft - * record (it in turn calls read_cache_page() which reads it in from disk if - * necessary, increments the use count on the page so that it cannot disappear - * under us and returns a reference to the page cache page). - * - * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it - * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed - * and the post-read mst fixups on each mft record in the page have been - * performed, the page gets PG_uptodate set and PG_locked cleared (this is done - * in our asynchronous I/O completion handler end_buffer_read_mft_async()). - * ntfs_map_page() waits for PG_locked to become clear and checks if - * PG_uptodate is set and returns an error code if not. This provides - * sufficient protection against races when reading/using the page. - * - * However there is the write mapping to think about. 
Doing the above described - * checking here will be fine, because when initiating the write we will set - * PG_locked and clear PG_uptodate making sure nobody is touching the page - * contents. Doing the locking this way means that the commit to disk code in - * the page cache code paths is automatically sufficiently locked with us as - * we will not touch a page that has been locked or is not uptodate. The only - * locking problem then is them locking the page while we are accessing it. - * - * So that code will end up having to own the mrec_lock of all mft - * records/inodes present in the page before I/O can proceed. In that case we - * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be - * accessing anything without owning the mrec_lock mutex. But we do need to - * use them because of the read_cache_page() invocation and the code becomes so - * much simpler this way that it is well worth it. - * - * The mft record is now ours and we return a pointer to it. You need to check - * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return - * the error code. - * - * NOTE: Caller is responsible for setting the mft record dirty before calling - * unmap_mft_record(). This is obviously only necessary if the caller really - * modified the mft record... - * Q: Do we want to recycle one of the VFS inode state bits instead? - * A: No, the inode ones mean we want to change the mft record, not we want to - * write it out. - */ -MFT_RECORD *map_mft_record(ntfs_inode *ni) -{ - MFT_RECORD *m; - - ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); - - /* Make sure the ntfs inode doesn't go away. */ - atomic_inc(&ni->count); - - /* Serialize access to this mft record. 
*/ - mutex_lock(&ni->mrec_lock); - - m = map_mft_record_page(ni); - if (!IS_ERR(m)) - return m; - - mutex_unlock(&ni->mrec_lock); - atomic_dec(&ni->count); - ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m)); - return m; -} - -/** - * unmap_mft_record_page - unmap the page in which a specific mft record resides - * @ni: ntfs inode whose mft record page to unmap - * - * This unmaps the page in which the mft record of the ntfs inode @ni is - * situated and returns. This is a NOOP if highmem is not configured. - * - * The unmap happens via ntfs_unmap_page() which in turn decrements the use - * count on the page thus releasing it from the pinned state. - * - * We do not actually unmap the page from memory of course, as that will be - * done by the page cache code itself when memory pressure increases or - * whatever. - */ -static inline void unmap_mft_record_page(ntfs_inode *ni) -{ - BUG_ON(!ni->page); - - // TODO: If dirty, blah... - ntfs_unmap_page(ni->page); - ni->page = NULL; - ni->page_ofs = 0; - return; -} - -/** - * unmap_mft_record - release a mapped mft record - * @ni: ntfs inode whose MFT record to unmap - * - * We release the page mapping and the mrec_lock mutex which unmaps the mft - * record and releases it for others to get hold of. We also release the ntfs - * inode by decrementing the ntfs inode reference count. - * - * NOTE: If caller has modified the mft record, it is imperative to set the mft - * record dirty BEFORE calling unmap_mft_record(). - */ -void unmap_mft_record(ntfs_inode *ni) -{ - struct page *page = ni->page; - - BUG_ON(!page); - - ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); - - unmap_mft_record_page(ni); - mutex_unlock(&ni->mrec_lock); - atomic_dec(&ni->count); - /* - * If pure ntfs_inode, i.e. 
no vfs inode attached, we leave it to - * ntfs_clear_extent_inode() in the extent inode case, and to the - * caller in the non-extent, yet pure ntfs inode case, to do the actual - * tear down of all structures and freeing of all allocated memory. - */ - return; -} - -/** - * map_extent_mft_record - load an extent inode and attach it to its base - * @base_ni: base ntfs inode - * @mref: mft reference of the extent inode to load - * @ntfs_ino: on successful return, pointer to the ntfs_inode structure - * - * Load the extent mft record @mref and attach it to its base inode @base_ni. - * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise - * PTR_ERR(result) gives the negative error code. - * - * On successful return, @ntfs_ino contains a pointer to the ntfs_inode - * structure of the mapped extent inode. - */ -MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, - ntfs_inode **ntfs_ino) -{ - MFT_RECORD *m; - ntfs_inode *ni = NULL; - ntfs_inode **extent_nis = NULL; - int i; - unsigned long mft_no = MREF(mref); - u16 seq_no = MSEQNO(mref); - bool destroy_ni = false; - - ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).", - mft_no, base_ni->mft_no); - /* Make sure the base ntfs inode doesn't go away. */ - atomic_inc(&base_ni->count); - /* - * Check if this extent inode has already been added to the base inode, - * in which case just return it. If not found, add it to the base - * inode before returning it. - */ - mutex_lock(&base_ni->extent_lock); - if (base_ni->nr_extents > 0) { - extent_nis = base_ni->ext.extent_ntfs_inos; - for (i = 0; i < base_ni->nr_extents; i++) { - if (mft_no != extent_nis[i]->mft_no) - continue; - ni = extent_nis[i]; - /* Make sure the ntfs inode doesn't go away. */ - atomic_inc(&ni->count); - break; - } - } - if (likely(ni != NULL)) { - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - /* We found the record; just have to map and return it. 
*/ - m = map_mft_record(ni); - /* map_mft_record() has incremented this on success. */ - atomic_dec(&ni->count); - if (!IS_ERR(m)) { - /* Verify the sequence number. */ - if (likely(le16_to_cpu(m->sequence_number) == seq_no)) { - ntfs_debug("Done 1."); - *ntfs_ino = ni; - return m; - } - unmap_mft_record(ni); - ntfs_error(base_ni->vol->sb, "Found stale extent mft " - "reference! Corrupt filesystem. " - "Run chkdsk."); - return ERR_PTR(-EIO); - } -map_err_out: - ntfs_error(base_ni->vol->sb, "Failed to map extent " - "mft record, error code %ld.", -PTR_ERR(m)); - return m; - } - /* Record wasn't there. Get a new ntfs inode and initialize it. */ - ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no); - if (unlikely(!ni)) { - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - return ERR_PTR(-ENOMEM); - } - ni->vol = base_ni->vol; - ni->seq_no = seq_no; - ni->nr_extents = -1; - ni->ext.base_ntfs_ino = base_ni; - /* Now map the record. */ - m = map_mft_record(ni); - if (IS_ERR(m)) { - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - ntfs_clear_extent_inode(ni); - goto map_err_out; - } - /* Verify the sequence number if it is present. */ - if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) { - ntfs_error(base_ni->vol->sb, "Found stale extent mft " - "reference! Corrupt filesystem. Run chkdsk."); - destroy_ni = true; - m = ERR_PTR(-EIO); - goto unm_err_out; - } - /* Attach extent inode to base inode, reallocating memory if needed. 
*/ - if (!(base_ni->nr_extents & 3)) { - ntfs_inode **tmp; - int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *); - - tmp = kmalloc(new_size, GFP_NOFS); - if (unlikely(!tmp)) { - ntfs_error(base_ni->vol->sb, "Failed to allocate " - "internal buffer."); - destroy_ni = true; - m = ERR_PTR(-ENOMEM); - goto unm_err_out; - } - if (base_ni->nr_extents) { - BUG_ON(!base_ni->ext.extent_ntfs_inos); - memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size - - 4 * sizeof(ntfs_inode *)); - kfree(base_ni->ext.extent_ntfs_inos); - } - base_ni->ext.extent_ntfs_inos = tmp; - } - base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni; - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - ntfs_debug("Done 2."); - *ntfs_ino = ni; - return m; -unm_err_out: - unmap_mft_record(ni); - mutex_unlock(&base_ni->extent_lock); - atomic_dec(&base_ni->count); - /* - * If the extent inode was not attached to the base inode we need to - * release it or we will leak memory. - */ - if (destroy_ni) - ntfs_clear_extent_inode(ni); - return m; -} - -#ifdef NTFS_RW - -/** - * __mark_mft_record_dirty - set the mft record and the page containing it dirty - * @ni: ntfs inode describing the mapped mft record - * - * Internal function. Users should call mark_mft_record_dirty() instead. - * - * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, - * as well as the page containing the mft record, dirty. Also, mark the base - * vfs inode dirty. This ensures that any changes to the mft record are - * written out to disk. - * - * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) - * on the base vfs inode, because even though file data may have been modified, - * it is dirty in the inode meta data rather than the data page cache of the - * inode, and thus there are no data pages that need writing out. Therefore, a - * full mark_inode_dirty() is overkill. 
A mark_inode_dirty_sync(), on the - * other hand, is not sufficient, because ->write_inode needs to be called even - * in case of fdatasync. This needs to happen or the file data would not - * necessarily hit the device synchronously, even though the vfs inode has the - * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just - * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, - * which is not what I_DIRTY_SYNC on its own would suggest. - */ -void __mark_mft_record_dirty(ntfs_inode *ni) -{ - ntfs_inode *base_ni; - - ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); - BUG_ON(NInoAttr(ni)); - mark_ntfs_record_dirty(ni->page, ni->page_ofs); - /* Determine the base vfs inode and mark it dirty, too. */ - mutex_lock(&ni->extent_lock); - if (likely(ni->nr_extents >= 0)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - mutex_unlock(&ni->extent_lock); - __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC); -} - -static const char *ntfs_please_email = "Please email " - "linux-ntfs-dev@lists.sourceforge.net and say that you saw " - "this message. Thank you."; - -/** - * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror - * @vol: ntfs volume on which the mft record to synchronize resides - * @mft_no: mft record number of mft record to synchronize - * @m: mapped, mst protected (extent) mft record to synchronize - * - * Write the mapped, mst protected (extent) mft record @m with mft record - * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol, - * bypassing the page cache and the $MFTMirr inode itself. - * - * This function is only for use at umount time when the mft mirror inode has - * already been disposed off. We BUG() if we are called while the mft mirror - * inode is still attached to the volume. - * - * On success return 0. On error return -errno. 
- * - * NOTE: This function is not implemented yet as I am not convinced it can - * actually be triggered considering the sequence of commits we do in super.c:: - * ntfs_put_super(). But just in case we provide this place holder as the - * alternative would be either to BUG() or to get a NULL pointer dereference - * and Oops. - */ -static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol, - const unsigned long mft_no, MFT_RECORD *m) -{ - BUG_ON(vol->mftmirr_ino); - ntfs_error(vol->sb, "Umount time mft mirror syncing is not " - "implemented yet. %s", ntfs_please_email); - return -EOPNOTSUPP; -} - -/** - * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror - * @vol: ntfs volume on which the mft record to synchronize resides - * @mft_no: mft record number of mft record to synchronize - * @m: mapped, mst protected (extent) mft record to synchronize - * @sync: if true, wait for i/o completion - * - * Write the mapped, mst protected (extent) mft record @m with mft record - * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol. - * - * On success return 0. On error return -errno and set the volume errors flag - * in the ntfs volume @vol. - * - * NOTE: We always perform synchronous i/o and ignore the @sync parameter. - * - * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just - * schedule i/o via ->writepage or do it via kntfsd or whatever. 
- */ -int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, - MFT_RECORD *m, int sync) -{ - struct page *page; - unsigned int blocksize = vol->sb->s_blocksize; - int max_bhs = vol->mft_record_size / blocksize; - struct buffer_head *bhs[MAX_BHS]; - struct buffer_head *bh, *head; - u8 *kmirr; - runlist_element *rl; - unsigned int block_start, block_end, m_start, m_end, page_ofs; - int i_bhs, nr_bhs, err = 0; - unsigned char blocksize_bits = vol->sb->s_blocksize_bits; - - ntfs_debug("Entering for inode 0x%lx.", mft_no); - BUG_ON(!max_bhs); - if (WARN_ON(max_bhs > MAX_BHS)) - return -EINVAL; - if (unlikely(!vol->mftmirr_ino)) { - /* This could happen during umount... */ - err = ntfs_sync_mft_mirror_umount(vol, mft_no, m); - if (likely(!err)) - return err; - goto err_out; - } - /* Get the page containing the mirror copy of the mft record @m. */ - page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >> - (PAGE_SHIFT - vol->mft_record_size_bits)); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to map mft mirror page."); - err = PTR_ERR(page); - goto err_out; - } - lock_page(page); - BUG_ON(!PageUptodate(page)); - ClearPageUptodate(page); - /* Offset of the mft mirror record inside the page. */ - page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; - /* The address in the page of the mirror copy of the mft record @m. */ - kmirr = page_address(page) + page_ofs; - /* Copy the mst protected mft record to the mirror. */ - memcpy(kmirr, m, vol->mft_record_size); - /* Create uptodate buffers if not present. 
*/ - if (unlikely(!page_has_buffers(page))) { - struct buffer_head *tail; - - bh = head = alloc_page_buffers(page, blocksize, true); - do { - set_buffer_uptodate(bh); - tail = bh; - bh = bh->b_this_page; - } while (bh); - tail->b_this_page = head; - attach_page_private(page, head); - } - bh = head = page_buffers(page); - BUG_ON(!bh); - rl = NULL; - nr_bhs = 0; - block_start = 0; - m_start = kmirr - (u8*)page_address(page); - m_end = m_start + vol->mft_record_size; - do { - block_end = block_start + blocksize; - /* If the buffer is outside the mft record, skip it. */ - if (block_end <= m_start) - continue; - if (unlikely(block_start >= m_end)) - break; - /* Need to map the buffer if it is not mapped already. */ - if (unlikely(!buffer_mapped(bh))) { - VCN vcn; - LCN lcn; - unsigned int vcn_ofs; - - bh->b_bdev = vol->sb->s_bdev; - /* Obtain the vcn and offset of the current block. */ - vcn = ((VCN)mft_no << vol->mft_record_size_bits) + - (block_start - m_start); - vcn_ofs = vcn & vol->cluster_size_mask; - vcn >>= vol->cluster_size_bits; - if (!rl) { - down_read(&NTFS_I(vol->mftmirr_ino)-> - runlist.lock); - rl = NTFS_I(vol->mftmirr_ino)->runlist.rl; - /* - * $MFTMirr always has the whole of its runlist - * in memory. - */ - BUG_ON(!rl); - } - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - /* For $MFTMirr, only lcn >= 0 is a successful remap. */ - if (likely(lcn >= 0)) { - /* Setup buffer head to correct block. 
*/ - bh->b_blocknr = ((lcn << - vol->cluster_size_bits) + - vcn_ofs) >> blocksize_bits; - set_buffer_mapped(bh); - } else { - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Cannot write mft mirror " - "record 0x%lx because its " - "location on disk could not " - "be determined (error code " - "%lli).", mft_no, - (long long)lcn); - err = -EIO; - } - } - BUG_ON(!buffer_uptodate(bh)); - BUG_ON(!nr_bhs && (m_start != block_start)); - BUG_ON(nr_bhs >= max_bhs); - bhs[nr_bhs++] = bh; - BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); - } while (block_start = block_end, (bh = bh->b_this_page) != head); - if (unlikely(rl)) - up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock); - if (likely(!err)) { - /* Lock buffers and start synchronous write i/o on them. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { - struct buffer_head *tbh = bhs[i_bhs]; - - if (!trylock_buffer(tbh)) - BUG(); - BUG_ON(!buffer_uptodate(tbh)); - clear_buffer_dirty(tbh); - get_bh(tbh); - tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, tbh); - } - /* Wait on i/o completion of buffers. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { - struct buffer_head *tbh = bhs[i_bhs]; - - wait_on_buffer(tbh); - if (unlikely(!buffer_uptodate(tbh))) { - err = -EIO; - /* - * Set the buffer uptodate so the page and - * buffer states do not become out of sync. - */ - set_buffer_uptodate(tbh); - } - } - } else /* if (unlikely(err)) */ { - /* Clean the buffers. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) - clear_buffer_dirty(bhs[i_bhs]); - } - /* Current state: all buffers are clean, unlocked, and uptodate. */ - /* Remove the mst protection fixups again. 
*/ - post_write_mst_fixup((NTFS_RECORD*)kmirr); - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - ntfs_unmap_page(page); - if (likely(!err)) { - ntfs_debug("Done."); - } else { - ntfs_error(vol->sb, "I/O error while writing mft mirror " - "record 0x%lx!", mft_no); -err_out: - ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error " - "code %i). Volume will be left marked dirty " - "on umount. Run ntfsfix on the partition " - "after umounting to correct this.", -err); - NVolSetErrors(vol); - } - return err; -} - -/** - * write_mft_record_nolock - write out a mapped (extent) mft record - * @ni: ntfs inode describing the mapped (extent) mft record - * @m: mapped (extent) mft record to write - * @sync: if true, wait for i/o completion - * - * Write the mapped (extent) mft record @m described by the (regular or extent) - * ntfs inode @ni to backing store. If the mft record @m has a counterpart in - * the mft mirror, that is also updated. - * - * We only write the mft record if the ntfs inode @ni is dirty and the first - * buffer belonging to its mft record is dirty, too. We ignore the dirty state - * of subsequent buffers because we could have raced with - * fs/ntfs/aops.c::mark_ntfs_record_dirty(). - * - * On success, clean the mft record and return 0. On error, leave the mft - * record dirty and return -errno. - * - * NOTE: We always perform synchronous i/o and ignore the @sync parameter. - * However, if the mft record has a counterpart in the mft mirror and @sync is - * true, we write the mft record, wait for i/o completion, and only then write - * the mft mirror copy. This ensures that if the system crashes either the mft - * or the mft mirror will contain a self-consistent mft record @m. If @sync is - * false on the other hand, we start i/o on both and then wait for completion - * on them. 
This provides a speedup but no longer guarantees that you will end - * up with a self-consistent mft record in the case of a crash but if you asked - * for asynchronous writing you probably do not care about that anyway. - * - * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just - * schedule i/o via ->writepage or do it via kntfsd or whatever. - */ -int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) -{ - ntfs_volume *vol = ni->vol; - struct page *page = ni->page; - unsigned int blocksize = vol->sb->s_blocksize; - unsigned char blocksize_bits = vol->sb->s_blocksize_bits; - int max_bhs = vol->mft_record_size / blocksize; - struct buffer_head *bhs[MAX_BHS]; - struct buffer_head *bh, *head; - runlist_element *rl; - unsigned int block_start, block_end, m_start, m_end; - int i_bhs, nr_bhs, err = 0; - - ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); - BUG_ON(NInoAttr(ni)); - BUG_ON(!max_bhs); - BUG_ON(!PageLocked(page)); - if (WARN_ON(max_bhs > MAX_BHS)) { - err = -EINVAL; - goto err_out; - } - /* - * If the ntfs_inode is clean no need to do anything. If it is dirty, - * mark it as clean now so that it can be redirtied later on if needed. - * There is no danger of races since the caller is holding the locks - * for the mft record @m and the page it is in. - */ - if (!NInoTestClearDirty(ni)) - goto done; - bh = head = page_buffers(page); - BUG_ON(!bh); - rl = NULL; - nr_bhs = 0; - block_start = 0; - m_start = ni->page_ofs; - m_end = m_start + vol->mft_record_size; - do { - block_end = block_start + blocksize; - /* If the buffer is outside the mft record, skip it. */ - if (block_end <= m_start) - continue; - if (unlikely(block_start >= m_end)) - break; - /* - * If this block is not the first one in the record, we ignore - * the buffer's dirty state because we could have raced with a - * parallel mark_ntfs_record_dirty(). - */ - if (block_start == m_start) { - /* This block is the first one in the record. 
*/ - if (!buffer_dirty(bh)) { - BUG_ON(nr_bhs); - /* Clean records are not written out. */ - break; - } - } - /* Need to map the buffer if it is not mapped already. */ - if (unlikely(!buffer_mapped(bh))) { - VCN vcn; - LCN lcn; - unsigned int vcn_ofs; - - bh->b_bdev = vol->sb->s_bdev; - /* Obtain the vcn and offset of the current block. */ - vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) + - (block_start - m_start); - vcn_ofs = vcn & vol->cluster_size_mask; - vcn >>= vol->cluster_size_bits; - if (!rl) { - down_read(&NTFS_I(vol->mft_ino)->runlist.lock); - rl = NTFS_I(vol->mft_ino)->runlist.rl; - BUG_ON(!rl); - } - /* Seek to element containing target vcn. */ - while (rl->length && rl[1].vcn <= vcn) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); - /* For $MFT, only lcn >= 0 is a successful remap. */ - if (likely(lcn >= 0)) { - /* Setup buffer head to correct block. */ - bh->b_blocknr = ((lcn << - vol->cluster_size_bits) + - vcn_ofs) >> blocksize_bits; - set_buffer_mapped(bh); - } else { - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Cannot write mft record " - "0x%lx because its location " - "on disk could not be " - "determined (error code %lli).", - ni->mft_no, (long long)lcn); - err = -EIO; - } - } - BUG_ON(!buffer_uptodate(bh)); - BUG_ON(!nr_bhs && (m_start != block_start)); - BUG_ON(nr_bhs >= max_bhs); - bhs[nr_bhs++] = bh; - BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); - } while (block_start = block_end, (bh = bh->b_this_page) != head); - if (unlikely(rl)) - up_read(&NTFS_I(vol->mft_ino)->runlist.lock); - if (!nr_bhs) - goto done; - if (unlikely(err)) - goto cleanup_out; - /* Apply the mst protection fixups. */ - err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size); - if (err) { - ntfs_error(vol->sb, "Failed to apply mst fixups!"); - goto cleanup_out; - } - flush_dcache_mft_record_page(ni); - /* Lock buffers and start synchronous write i/o on them. 
*/ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { - struct buffer_head *tbh = bhs[i_bhs]; - - if (!trylock_buffer(tbh)) - BUG(); - BUG_ON(!buffer_uptodate(tbh)); - clear_buffer_dirty(tbh); - get_bh(tbh); - tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, tbh); - } - /* Synchronize the mft mirror now if not @sync. */ - if (!sync && ni->mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); - /* Wait on i/o completion of buffers. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { - struct buffer_head *tbh = bhs[i_bhs]; - - wait_on_buffer(tbh); - if (unlikely(!buffer_uptodate(tbh))) { - err = -EIO; - /* - * Set the buffer uptodate so the page and buffer - * states do not become out of sync. - */ - if (PageUptodate(page)) - set_buffer_uptodate(tbh); - } - } - /* If @sync, now synchronize the mft mirror. */ - if (sync && ni->mft_no < vol->mftmirr_size) - ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); - /* Remove the mst protection fixups again. */ - post_write_mst_fixup((NTFS_RECORD*)m); - flush_dcache_mft_record_page(ni); - if (unlikely(err)) { - /* I/O error during writing. This is really bad! */ - ntfs_error(vol->sb, "I/O error while writing mft record " - "0x%lx! Marking base inode as bad. You " - "should unmount the volume and run chkdsk.", - ni->mft_no); - goto err_out; - } -done: - ntfs_debug("Done."); - return 0; -cleanup_out: - /* Clean the buffers. */ - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) - clear_buffer_dirty(bhs[i_bhs]); -err_out: - /* - * Current state: all buffers are clean, unlocked, and uptodate. - * The caller should mark the base inode as bad so that no more i/o - * happens. ->clear_inode() will still be invoked so all extent inodes - * and other allocated memory will be freed. - */ - if (err == -ENOMEM) { - ntfs_error(vol->sb, "Not enough memory to write mft record. 
" - "Redirtying so the write is retried later."); - mark_mft_record_dirty(ni); - err = 0; - } else - NVolSetErrors(vol); - return err; -} - -/** - * ntfs_may_write_mft_record - check if an mft record may be written out - * @vol: [IN] ntfs volume on which the mft record to check resides - * @mft_no: [IN] mft record number of the mft record to check - * @m: [IN] mapped mft record to check - * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned - * - * Check if the mapped (base or extent) mft record @m with mft record number - * @mft_no belonging to the ntfs volume @vol may be written out. If necessary - * and possible the ntfs inode of the mft record is locked and the base vfs - * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The - * caller is responsible for unlocking the ntfs inode and unpinning the base - * vfs inode. - * - * Return 'true' if the mft record may be written out and 'false' if not. - * - * The caller has locked the page and cleared the uptodate flag on it which - * means that we can safely write out any dirty mft records that do not have - * their inodes in icache as determined by ilookup5() as anyone - * opening/creating such an inode would block when attempting to map the mft - * record in read_cache_page() until we are finished with the write out. - * - * Here is a description of the tests we perform: - * - * If the inode is found in icache we know the mft record must be a base mft - * record. If it is dirty, we do not write it and return 'false' as the vfs - * inode write paths will result in the access times being updated which would - * cause the base mft record to be redirtied and written out again. (We know - * the access time update will modify the base mft record because Windows - * chkdsk complains if the standard information attribute is not in the base - * mft record.) 
- * - * If the inode is in icache and not dirty, we attempt to lock the mft record - * and if we find the lock was already taken, it is not safe to write the mft - * record and we return 'false'. - * - * If we manage to obtain the lock we have exclusive access to the mft record, - * which also allows us safe writeout of the mft record. We then set - * @locked_ni to the locked ntfs inode and return 'true'. - * - * Note we cannot just lock the mft record and sleep while waiting for the lock - * because this would deadlock due to lock reversal (normally the mft record is - * locked before the page is locked but we already have the page locked here - * when we try to lock the mft record). - * - * If the inode is not in icache we need to perform further checks. - * - * If the mft record is not a FILE record or it is a base mft record, we can - * safely write it and return 'true'. - * - * We now know the mft record is an extent mft record. We check if the inode - * corresponding to its base mft record is in icache and obtain a reference to - * it if it is. If it is not, we can safely write it and return 'true'. - * - * We now have the base inode for the extent mft record. We check if it has an - * ntfs inode for the extent mft record attached and if not it is safe to write - * the extent mft record and we return 'true'. - * - * The ntfs inode for the extent mft record is attached to the base inode so we - * attempt to lock the extent mft record and if we find the lock was already - * taken, it is not safe to write the extent mft record and we return 'false'. - * - * If we manage to obtain the lock we have exclusive access to the extent mft - * record, which also allows us safe writeout of the extent mft record. We - * set the ntfs inode of the extent mft record clean and then set @locked_ni to - * the now locked ntfs inode and return 'true'. 
- * - * Note, the reason for actually writing dirty mft records here and not just - * relying on the vfs inode dirty code paths is that we can have mft records - * modified without them ever having actual inodes in memory. Also we can have - * dirty mft records with clean ntfs inodes in memory. None of the described - * cases would result in the dirty mft records being written out if we only - * relied on the vfs inode dirty code paths. And these cases can really occur - * during allocation of new mft records and in particular when the - * initialized_size of the $MFT/$DATA attribute is extended and the new space - * is initialized using ntfs_mft_record_format(). The clean inode can then - * appear if the mft record is reused for a new inode before it got written - * out. - */ -bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, - const MFT_RECORD *m, ntfs_inode **locked_ni) -{ - struct super_block *sb = vol->sb; - struct inode *mft_vi = vol->mft_ino; - struct inode *vi; - ntfs_inode *ni, *eni, **extent_nis; - int i; - ntfs_attr na; - - ntfs_debug("Entering for inode 0x%lx.", mft_no); - /* - * Normally we do not return a locked inode so set @locked_ni to NULL. - */ - BUG_ON(!locked_ni); - *locked_ni = NULL; - /* - * Check if the inode corresponding to this mft record is in the VFS - * inode cache and obtain a reference to it if it is. - */ - ntfs_debug("Looking for inode 0x%lx in icache.", mft_no); - na.mft_no = mft_no; - na.name = NULL; - na.name_len = 0; - na.type = AT_UNUSED; - /* - * Optimize inode 0, i.e. $MFT itself, since we have it in memory and - * we get here for it rather often. - */ - if (!mft_no) { - /* Balance the below iput(). 
*/ - vi = igrab(mft_vi); - BUG_ON(vi != mft_vi); - } else { - /* - * Have to use ilookup5_nowait() since ilookup5() waits for the - * inode lock which causes ntfs to deadlock when a concurrent - * inode write via the inode dirty code paths and the page - * dirty code path of the inode dirty code path when writing - * $MFT occurs. - */ - vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na); - } - if (vi) { - ntfs_debug("Base inode 0x%lx is in icache.", mft_no); - /* The inode is in icache. */ - ni = NTFS_I(vi); - /* Take a reference to the ntfs inode. */ - atomic_inc(&ni->count); - /* If the inode is dirty, do not write this record. */ - if (NInoDirty(ni)) { - ntfs_debug("Inode 0x%lx is dirty, do not write it.", - mft_no); - atomic_dec(&ni->count); - iput(vi); - return false; - } - ntfs_debug("Inode 0x%lx is not dirty.", mft_no); - /* The inode is not dirty, try to take the mft record lock. */ - if (unlikely(!mutex_trylock(&ni->mrec_lock))) { - ntfs_debug("Mft record 0x%lx is already locked, do " - "not write it.", mft_no); - atomic_dec(&ni->count); - iput(vi); - return false; - } - ntfs_debug("Managed to lock mft record 0x%lx, write it.", - mft_no); - /* - * The write has to occur while we hold the mft record lock so - * return the locked ntfs inode. - */ - *locked_ni = ni; - return true; - } - ntfs_debug("Inode 0x%lx is not in icache.", mft_no); - /* The inode is not in icache. */ - /* Write the record if it is not a mft record (type "FILE"). */ - if (!ntfs_is_mft_record(m->magic)) { - ntfs_debug("Mft record 0x%lx is not a FILE record, write it.", - mft_no); - return true; - } - /* Write the mft record if it is a base inode. */ - if (!m->base_mft_record) { - ntfs_debug("Mft record 0x%lx is a base record, write it.", - mft_no); - return true; - } - /* - * This is an extent mft record. Check if the inode corresponding to - * its base mft record is in icache and obtain a reference to it if it - * is. 
- */ - na.mft_no = MREF_LE(m->base_mft_record); - ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " - "inode 0x%lx in icache.", mft_no, na.mft_no); - if (!na.mft_no) { - /* Balance the below iput(). */ - vi = igrab(mft_vi); - BUG_ON(vi != mft_vi); - } else - vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode, - &na); - if (!vi) { - /* - * The base inode is not in icache, write this extent mft - * record. - */ - ntfs_debug("Base inode 0x%lx is not in icache, write the " - "extent record.", na.mft_no); - return true; - } - ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no); - /* - * The base inode is in icache. Check if it has the extent inode - * corresponding to this extent mft record attached. - */ - ni = NTFS_I(vi); - mutex_lock(&ni->extent_lock); - if (ni->nr_extents <= 0) { - /* - * The base inode has no attached extent inodes, write this - * extent mft record. - */ - mutex_unlock(&ni->extent_lock); - iput(vi); - ntfs_debug("Base inode 0x%lx has no attached extent inodes, " - "write the extent record.", na.mft_no); - return true; - } - /* Iterate over the attached extent inodes. */ - extent_nis = ni->ext.extent_ntfs_inos; - for (eni = NULL, i = 0; i < ni->nr_extents; ++i) { - if (mft_no == extent_nis[i]->mft_no) { - /* - * Found the extent inode corresponding to this extent - * mft record. - */ - eni = extent_nis[i]; - break; - } - } - /* - * If the extent inode was not attached to the base inode, write this - * extent mft record. - */ - if (!eni) { - mutex_unlock(&ni->extent_lock); - iput(vi); - ntfs_debug("Extent inode 0x%lx is not attached to its base " - "inode 0x%lx, write the extent record.", - mft_no, na.mft_no); - return true; - } - ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.", - mft_no, na.mft_no); - /* Take a reference to the extent ntfs inode. */ - atomic_inc(&eni->count); - mutex_unlock(&ni->extent_lock); - /* - * Found the extent inode coresponding to this extent mft record. 
- * Try to take the mft record lock. - */ - if (unlikely(!mutex_trylock(&eni->mrec_lock))) { - atomic_dec(&eni->count); - iput(vi); - ntfs_debug("Extent mft record 0x%lx is already locked, do " - "not write it.", mft_no); - return false; - } - ntfs_debug("Managed to lock extent mft record 0x%lx, write it.", - mft_no); - if (NInoTestClearDirty(eni)) - ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.", - mft_no); - /* - * The write has to occur while we hold the mft record lock so return - * the locked extent ntfs inode. - */ - *locked_ni = eni; - return true; -} - -static const char *es = " Leaving inconsistent metadata. Unmount and run " - "chkdsk."; - -/** - * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name - * @vol: volume on which to search for a free mft record - * @base_ni: open base inode if allocating an extent mft record or NULL - * - * Search for a free mft record in the mft bitmap attribute on the ntfs volume - * @vol. - * - * If @base_ni is NULL start the search at the default allocator position. - * - * If @base_ni is not NULL start the search at the mft record after the base - * mft record @base_ni. - * - * Return the free mft record on success and -errno on error. An error code of - * -ENOSPC means that there are no free mft records in the currently - * initialized mft bitmap. - * - * Locking: Caller must hold vol->mftbmp_lock for writing. - */ -static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, - ntfs_inode *base_ni) -{ - s64 pass_end, ll, data_pos, pass_start, ofs, bit; - unsigned long flags; - struct address_space *mftbmp_mapping; - u8 *buf, *byte; - struct page *page; - unsigned int page_ofs, size; - u8 pass, b; - - ntfs_debug("Searching for free mft record in the currently " - "initialized mft bitmap."); - mftbmp_mapping = vol->mftbmp_ino->i_mapping; - /* - * Set the end of the pass making sure we do not overflow the mft - * bitmap. 
- */ - read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags); - pass_end = NTFS_I(vol->mft_ino)->allocated_size >> - vol->mft_record_size_bits; - read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags); - read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); - ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3; - read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); - if (pass_end > ll) - pass_end = ll; - pass = 1; - if (!base_ni) - data_pos = vol->mft_data_pos; - else - data_pos = base_ni->mft_no + 1; - if (data_pos < 24) - data_pos = 24; - if (data_pos >= pass_end) { - data_pos = 24; - pass = 2; - /* This happens on a freshly formatted volume. */ - if (data_pos >= pass_end) - return -ENOSPC; - } - pass_start = data_pos; - ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, " - "pass_end 0x%llx, data_pos 0x%llx.", pass, - (long long)pass_start, (long long)pass_end, - (long long)data_pos); - /* Loop until a free mft record is found. */ - for (; pass <= 2;) { - /* Cap size to pass_end. */ - ofs = data_pos >> 3; - page_ofs = ofs & ~PAGE_MASK; - size = PAGE_SIZE - page_ofs; - ll = ((pass_end + 7) >> 3) - ofs; - if (size > ll) - size = ll; - size <<= 3; - /* - * If we are still within the active pass, search the next page - * for a zero bit. 
- */ - if (size) { - page = ntfs_map_page(mftbmp_mapping, - ofs >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read mft " - "bitmap, aborting."); - return PTR_ERR(page); - } - buf = (u8*)page_address(page) + page_ofs; - bit = data_pos & 7; - data_pos &= ~7ull; - ntfs_debug("Before inner for loop: size 0x%x, " - "data_pos 0x%llx, bit 0x%llx", size, - (long long)data_pos, (long long)bit); - for (; bit < size && data_pos + bit < pass_end; - bit &= ~7ull, bit += 8) { - byte = buf + (bit >> 3); - if (*byte == 0xff) - continue; - b = ffz((unsigned long)*byte); - if (b < 8 && b >= (bit & 7)) { - ll = data_pos + (bit & ~7ull) + b; - if (unlikely(ll > (1ll << 32))) { - ntfs_unmap_page(page); - return -ENOSPC; - } - *byte |= 1 << b; - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); - ntfs_debug("Done. (Found and " - "allocated mft record " - "0x%llx.)", - (long long)ll); - return ll; - } - } - ntfs_debug("After inner for loop: size 0x%x, " - "data_pos 0x%llx, bit 0x%llx", size, - (long long)data_pos, (long long)bit); - data_pos += size; - ntfs_unmap_page(page); - /* - * If the end of the pass has not been reached yet, - * continue searching the mft bitmap for a zero bit. - */ - if (data_pos < pass_end) - continue; - } - /* Do the next pass. */ - if (++pass == 2) { - /* - * Starting the second pass, in which we scan the first - * part of the zone which we omitted earlier. - */ - pass_end = pass_start; - data_pos = pass_start = 24; - ntfs_debug("pass %i, pass_start 0x%llx, pass_end " - "0x%llx.", pass, (long long)pass_start, - (long long)pass_end); - if (data_pos >= pass_end) - break; - } - } - /* No free mft records in currently initialized mft bitmap. */ - ntfs_debug("Done. 
(No free mft records left in currently initialized " - "mft bitmap.)"); - return -ENOSPC; -} - -/** - * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster - * @vol: volume on which to extend the mft bitmap attribute - * - * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster. - * - * Note: Only changes allocated_size, i.e. does not touch initialized_size or - * data_size. - * - * Return 0 on success and -errno on error. - * - * Locking: - Caller must hold vol->mftbmp_lock for writing. - * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for - * writing and releases it before returning. - * - This function takes vol->lcnbmp_lock for writing and releases it - * before returning. - */ -static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) -{ - LCN lcn; - s64 ll; - unsigned long flags; - struct page *page; - ntfs_inode *mft_ni, *mftbmp_ni; - runlist_element *rl, *rl2 = NULL; - ntfs_attr_search_ctx *ctx = NULL; - MFT_RECORD *mrec; - ATTR_RECORD *a = NULL; - int ret, mp_size; - u32 old_alen = 0; - u8 *b, tb; - struct { - u8 added_cluster:1; - u8 added_run:1; - u8 mp_rebuilt:1; - } status = { 0, 0, 0 }; - - ntfs_debug("Extending mft bitmap allocation."); - mft_ni = NTFS_I(vol->mft_ino); - mftbmp_ni = NTFS_I(vol->mftbmp_ino); - /* - * Determine the last lcn of the mft bitmap. The allocated size of the - * mft bitmap cannot be zero so we are ok to do this. 
- */ - down_write(&mftbmp_ni->runlist.lock); - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - ll = mftbmp_ni->allocated_size; - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, - (ll - 1) >> vol->cluster_size_bits, NULL); - if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { - up_write(&mftbmp_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to determine last allocated " - "cluster of mft bitmap attribute."); - if (!IS_ERR(rl)) - ret = -EIO; - else - ret = PTR_ERR(rl); - return ret; - } - lcn = rl->lcn + rl->length; - ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.", - (long long)lcn); - /* - * Attempt to get the cluster following the last allocated cluster by - * hand as it may be in the MFT zone so the allocator would not give it - * to us. - */ - ll = lcn >> 3; - page = ntfs_map_page(vol->lcnbmp_ino->i_mapping, - ll >> PAGE_SHIFT); - if (IS_ERR(page)) { - up_write(&mftbmp_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to read from lcn bitmap."); - return PTR_ERR(page); - } - b = (u8*)page_address(page) + (ll & ~PAGE_MASK); - tb = 1 << (lcn & 7ull); - down_write(&vol->lcnbmp_lock); - if (*b != 0xff && !(*b & tb)) { - /* Next cluster is free, allocate it. */ - *b |= tb; - flush_dcache_page(page); - set_page_dirty(page); - up_write(&vol->lcnbmp_lock); - ntfs_unmap_page(page); - /* Update the mft bitmap runlist. */ - rl->length++; - rl[1].vcn++; - status.added_cluster = 1; - ntfs_debug("Appending one cluster to mft bitmap."); - } else { - up_write(&vol->lcnbmp_lock); - ntfs_unmap_page(page); - /* Allocate a cluster from the DATA_ZONE. 
*/ - rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE, - true); - if (IS_ERR(rl2)) { - up_write(&mftbmp_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to allocate a cluster for " - "the mft bitmap."); - return PTR_ERR(rl2); - } - rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2); - if (IS_ERR(rl)) { - up_write(&mftbmp_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to merge runlists for mft " - "bitmap."); - if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to deallocate " - "allocated cluster.%s", es); - NVolSetErrors(vol); - } - ntfs_free(rl2); - return PTR_ERR(rl); - } - mftbmp_ni->runlist.rl = rl; - status.added_run = 1; - ntfs_debug("Adding one run to mft bitmap."); - /* Find the last run in the new runlist. */ - for (; rl[1].length; rl++) - ; - } - /* - * Update the attribute record as well. Note: @rl is the last - * (non-terminator) runlist element of mft bitmap. - */ - mrec = map_mft_record(mft_ni); - if (IS_ERR(mrec)) { - ntfs_error(vol->sb, "Failed to map mft record."); - ret = PTR_ERR(mrec); - goto undo_alloc; - } - ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); - if (unlikely(!ctx)) { - ntfs_error(vol->sb, "Failed to get search context."); - ret = -ENOMEM; - goto undo_alloc; - } - ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, - mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, - 0, ctx); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "mft bitmap attribute."); - if (ret == -ENOENT) - ret = -EIO; - goto undo_alloc; - } - a = ctx->attr; - ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); - /* Search back for the previous last allocated cluster of mft bitmap. */ - for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) { - if (ll >= rl2->vcn) - break; - } - BUG_ON(ll < rl2->vcn); - BUG_ON(ll >= rl2->vcn + rl2->length); - /* Get the size for the new mapping pairs array for this extent. 
*/ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); - if (unlikely(mp_size <= 0)) { - ntfs_error(vol->sb, "Get size for mapping pairs failed for " - "mft bitmap attribute extent."); - ret = mp_size; - if (!ret) - ret = -EIO; - goto undo_alloc; - } - /* Expand the attribute record if necessary. */ - old_alen = le32_to_cpu(a->length); - ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); - if (unlikely(ret)) { - if (ret != -ENOSPC) { - ntfs_error(vol->sb, "Failed to resize attribute " - "record for mft bitmap attribute."); - goto undo_alloc; - } - // TODO: Deal with this by moving this extent to a new mft - // record or by starting a new extent in a new mft record or by - // moving other attributes out of this mft record. - // Note: It will need to be a special mft record and if none of - // those are available it gets rather complicated... - ntfs_error(vol->sb, "Not enough space in this mft record to " - "accommodate extended mft bitmap attribute " - "extent. Cannot handle this yet."); - ret = -EOPNOTSUPP; - goto undo_alloc; - } - status.mp_rebuilt = 1; - /* Generate the mapping pairs array directly into the attr record. */ - ret = ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, ll, -1, NULL); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to build mapping pairs array for " - "mft bitmap attribute."); - goto undo_alloc; - } - /* Update the highest_vcn. */ - a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); - /* - * We now have extended the mft bitmap allocated_size by one cluster. - * Reflect this in the ntfs_inode structure and the attribute record. - */ - if (a->data.non_resident.lowest_vcn) { - /* - * We are not in the first attribute extent, switch to it, but - * first ensure the changes will make it to disk later. 
- */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_reinit_search_ctx(ctx); - ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, - mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to find first attribute " - "extent of mft bitmap attribute."); - goto restore_undo_alloc; - } - a = ctx->attr; - } - write_lock_irqsave(&mftbmp_ni->size_lock, flags); - mftbmp_ni->allocated_size += vol->cluster_size; - a->data.non_resident.allocated_size = - cpu_to_sle64(mftbmp_ni->allocated_size); - write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - /* Ensure the changes make it to disk. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - up_write(&mftbmp_ni->runlist.lock); - ntfs_debug("Done."); - return 0; -restore_undo_alloc: - ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, - mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, - 0, ctx)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "mft bitmap attribute.%s", es); - write_lock_irqsave(&mftbmp_ni->size_lock, flags); - mftbmp_ni->allocated_size += vol->cluster_size; - write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - up_write(&mftbmp_ni->runlist.lock); - /* - * The only thing that is now wrong is ->allocated_size of the - * base attribute extent which chkdsk should be able to fix. - */ - NVolSetErrors(vol); - return ret; - } - a = ctx->attr; - a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2); -undo_alloc: - if (status.added_cluster) { - /* Truncate the last run in the runlist by one cluster. */ - rl->length--; - rl[1].vcn--; - } else if (status.added_run) { - lcn = rl->lcn; - /* Remove the last run from the runlist. 
*/ - rl->lcn = rl[1].lcn; - rl->length = 0; - } - /* Deallocate the cluster. */ - down_write(&vol->lcnbmp_lock); - if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { - ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es); - NVolSetErrors(vol); - } - up_write(&vol->lcnbmp_lock); - if (status.mp_rebuilt) { - if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - old_alen - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - rl2, ll, -1, NULL)) { - ntfs_error(vol->sb, "Failed to restore mapping pairs " - "array.%s", es); - NVolSetErrors(vol); - } - if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { - ntfs_error(vol->sb, "Failed to restore attribute " - "record.%s", es); - NVolSetErrors(vol); - } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (!IS_ERR(mrec)) - unmap_mft_record(mft_ni); - up_write(&mftbmp_ni->runlist.lock); - return ret; -} - -/** - * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data - * @vol: volume on which to extend the mft bitmap attribute - * - * Extend the initialized portion of the mft bitmap attribute on the ntfs - * volume @vol by 8 bytes. - * - * Note: Only changes initialized_size and data_size, i.e. requires that - * allocated_size is big enough to fit the new initialized_size. - * - * Return 0 on success and -error on error. - * - * Locking: Caller must hold vol->mftbmp_lock for writing. 
- */ -static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol) -{ - s64 old_data_size, old_initialized_size; - unsigned long flags; - struct inode *mftbmp_vi; - ntfs_inode *mft_ni, *mftbmp_ni; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *mrec; - ATTR_RECORD *a; - int ret; - - ntfs_debug("Extending mft bitmap initiailized (and data) size."); - mft_ni = NTFS_I(vol->mft_ino); - mftbmp_vi = vol->mftbmp_ino; - mftbmp_ni = NTFS_I(mftbmp_vi); - /* Get the attribute record. */ - mrec = map_mft_record(mft_ni); - if (IS_ERR(mrec)) { - ntfs_error(vol->sb, "Failed to map mft record."); - return PTR_ERR(mrec); - } - ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); - if (unlikely(!ctx)) { - ntfs_error(vol->sb, "Failed to get search context."); - ret = -ENOMEM; - goto unm_err_out; - } - ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, - mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to find first attribute extent of " - "mft bitmap attribute."); - if (ret == -ENOENT) - ret = -EIO; - goto put_err_out; - } - a = ctx->attr; - write_lock_irqsave(&mftbmp_ni->size_lock, flags); - old_data_size = i_size_read(mftbmp_vi); - old_initialized_size = mftbmp_ni->initialized_size; - /* - * We can simply update the initialized_size before filling the space - * with zeroes because the caller is holding the mft bitmap lock for - * writing which ensures that no one else is trying to access the data. - */ - mftbmp_ni->initialized_size += 8; - a->data.non_resident.initialized_size = - cpu_to_sle64(mftbmp_ni->initialized_size); - if (mftbmp_ni->initialized_size > old_data_size) { - i_size_write(mftbmp_vi, mftbmp_ni->initialized_size); - a->data.non_resident.data_size = - cpu_to_sle64(mftbmp_ni->initialized_size); - } - write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - /* Ensure the changes make it to disk. 
*/ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - /* Initialize the mft bitmap attribute value with zeroes. */ - ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0); - if (likely(!ret)) { - ntfs_debug("Done. (Wrote eight initialized bytes to mft " - "bitmap."); - return 0; - } - ntfs_error(vol->sb, "Failed to write to mft bitmap."); - /* Try to recover from the error. */ - mrec = map_mft_record(mft_ni); - if (IS_ERR(mrec)) { - ntfs_error(vol->sb, "Failed to map mft record.%s", es); - NVolSetErrors(vol); - return ret; - } - ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); - if (unlikely(!ctx)) { - ntfs_error(vol->sb, "Failed to get search context.%s", es); - NVolSetErrors(vol); - goto unm_err_out; - } - if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, - mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) { - ntfs_error(vol->sb, "Failed to find first attribute extent of " - "mft bitmap attribute.%s", es); - NVolSetErrors(vol); -put_err_out: - ntfs_attr_put_search_ctx(ctx); -unm_err_out: - unmap_mft_record(mft_ni); - goto err_out; - } - a = ctx->attr; - write_lock_irqsave(&mftbmp_ni->size_lock, flags); - mftbmp_ni->initialized_size = old_initialized_size; - a->data.non_resident.initialized_size = - cpu_to_sle64(old_initialized_size); - if (i_size_read(mftbmp_vi) != old_data_size) { - i_size_write(mftbmp_vi, old_data_size); - a->data.non_resident.data_size = cpu_to_sle64(old_data_size); - } - write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); -#ifdef DEBUG - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, " - "data_size 0x%llx, initialized_size 0x%llx.", - (long long)mftbmp_ni->allocated_size, - (long long)i_size_read(mftbmp_vi), - (long 
long)mftbmp_ni->initialized_size); - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); -#endif /* DEBUG */ -err_out: - return ret; -} - -/** - * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute - * @vol: volume on which to extend the mft data attribute - * - * Extend the mft data attribute on the ntfs volume @vol by 16 mft records - * worth of clusters or if not enough space for this by one mft record worth - * of clusters. - * - * Note: Only changes allocated_size, i.e. does not touch initialized_size or - * data_size. - * - * Return 0 on success and -errno on error. - * - * Locking: - Caller must hold vol->mftbmp_lock for writing. - * - This function takes NTFS_I(vol->mft_ino)->runlist.lock for - * writing and releases it before returning. - * - This function calls functions which take vol->lcnbmp_lock for - * writing and release it before returning. - */ -static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) -{ - LCN lcn; - VCN old_last_vcn; - s64 min_nr, nr, ll; - unsigned long flags; - ntfs_inode *mft_ni; - runlist_element *rl, *rl2; - ntfs_attr_search_ctx *ctx = NULL; - MFT_RECORD *mrec; - ATTR_RECORD *a = NULL; - int ret, mp_size; - u32 old_alen = 0; - bool mp_rebuilt = false; - - ntfs_debug("Extending mft data allocation."); - mft_ni = NTFS_I(vol->mft_ino); - /* - * Determine the preferred allocation location, i.e. the last lcn of - * the mft data attribute. The allocated size of the mft data - * attribute cannot be zero so we are ok to do this. 
- */ - down_write(&mft_ni->runlist.lock); - read_lock_irqsave(&mft_ni->size_lock, flags); - ll = mft_ni->allocated_size; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - rl = ntfs_attr_find_vcn_nolock(mft_ni, - (ll - 1) >> vol->cluster_size_bits, NULL); - if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { - up_write(&mft_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to determine last allocated " - "cluster of mft data attribute."); - if (!IS_ERR(rl)) - ret = -EIO; - else - ret = PTR_ERR(rl); - return ret; - } - lcn = rl->lcn + rl->length; - ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn); - /* Minimum allocation is one mft record worth of clusters. */ - min_nr = vol->mft_record_size >> vol->cluster_size_bits; - if (!min_nr) - min_nr = 1; - /* Want to allocate 16 mft records worth of clusters. */ - nr = vol->mft_record_size << 4 >> vol->cluster_size_bits; - if (!nr) - nr = min_nr; - /* Ensure we do not go above 2^32-1 mft records. */ - read_lock_irqsave(&mft_ni->size_lock, flags); - ll = mft_ni->allocated_size; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - if (unlikely((ll + (nr << vol->cluster_size_bits)) >> - vol->mft_record_size_bits >= (1ll << 32))) { - nr = min_nr; - if (unlikely((ll + (nr << vol->cluster_size_bits)) >> - vol->mft_record_size_bits >= (1ll << 32))) { - ntfs_warning(vol->sb, "Cannot allocate mft record " - "because the maximum number of inodes " - "(2^32) has already been reached."); - up_write(&mft_ni->runlist.lock); - return -ENOSPC; - } - } - ntfs_debug("Trying mft data allocation with %s cluster count %lli.", - nr > min_nr ? 
"default" : "minimal", (long long)nr); - old_last_vcn = rl[1].vcn; - do { - rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, - true); - if (!IS_ERR(rl2)) - break; - if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { - ntfs_error(vol->sb, "Failed to allocate the minimal " - "number of clusters (%lli) for the " - "mft data attribute.", (long long)nr); - up_write(&mft_ni->runlist.lock); - return PTR_ERR(rl2); - } - /* - * There is not enough space to do the allocation, but there - * might be enough space to do a minimal allocation so try that - * before failing. - */ - nr = min_nr; - ntfs_debug("Retrying mft data allocation with minimal cluster " - "count %lli.", (long long)nr); - } while (1); - rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2); - if (IS_ERR(rl)) { - up_write(&mft_ni->runlist.lock); - ntfs_error(vol->sb, "Failed to merge runlists for mft data " - "attribute."); - if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to deallocate clusters " - "from the mft data attribute.%s", es); - NVolSetErrors(vol); - } - ntfs_free(rl2); - return PTR_ERR(rl); - } - mft_ni->runlist.rl = rl; - ntfs_debug("Allocated %lli clusters.", (long long)nr); - /* Find the last run in the new runlist. */ - for (; rl[1].length; rl++) - ; - /* Update the attribute record as well. 
*/ - mrec = map_mft_record(mft_ni); - if (IS_ERR(mrec)) { - ntfs_error(vol->sb, "Failed to map mft record."); - ret = PTR_ERR(mrec); - goto undo_alloc; - } - ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); - if (unlikely(!ctx)) { - ntfs_error(vol->sb, "Failed to get search context."); - ret = -ENOMEM; - goto undo_alloc; - } - ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, - CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "mft data attribute."); - if (ret == -ENOENT) - ret = -EIO; - goto undo_alloc; - } - a = ctx->attr; - ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); - /* Search back for the previous last allocated cluster of mft bitmap. */ - for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) { - if (ll >= rl2->vcn) - break; - } - BUG_ON(ll < rl2->vcn); - BUG_ON(ll >= rl2->vcn + rl2->length); - /* Get the size for the new mapping pairs array for this extent. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); - if (unlikely(mp_size <= 0)) { - ntfs_error(vol->sb, "Get size for mapping pairs failed for " - "mft data attribute extent."); - ret = mp_size; - if (!ret) - ret = -EIO; - goto undo_alloc; - } - /* Expand the attribute record if necessary. */ - old_alen = le32_to_cpu(a->length); - ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); - if (unlikely(ret)) { - if (ret != -ENOSPC) { - ntfs_error(vol->sb, "Failed to resize attribute " - "record for mft data attribute."); - goto undo_alloc; - } - // TODO: Deal with this by moving this extent to a new mft - // record or by starting a new extent in a new mft record or by - // moving other attributes out of this mft record. - // Note: Use the special reserved mft records and ensure that - // this extent is not required to find the mft record in - // question. 
If no free special records left we would need to - // move an existing record away, insert ours in its place, and - // then place the moved record into the newly allocated space - // and we would then need to update all references to this mft - // record appropriately. This is rather complicated... - ntfs_error(vol->sb, "Not enough space in this mft record to " - "accommodate extended mft data attribute " - "extent. Cannot handle this yet."); - ret = -EOPNOTSUPP; - goto undo_alloc; - } - mp_rebuilt = true; - /* Generate the mapping pairs array directly into the attr record. */ - ret = ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, ll, -1, NULL); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to build mapping pairs array of " - "mft data attribute."); - goto undo_alloc; - } - /* Update the highest_vcn. */ - a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); - /* - * We now have extended the mft data allocated_size by nr clusters. - * Reflect this in the ntfs_inode structure and the attribute record. - * @rl is the last (non-terminator) runlist element of mft data - * attribute. - */ - if (a->data.non_resident.lowest_vcn) { - /* - * We are not in the first attribute extent, switch to it, but - * first ensure the changes will make it to disk later. 
- */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_reinit_search_ctx(ctx); - ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, - mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, - ctx); - if (unlikely(ret)) { - ntfs_error(vol->sb, "Failed to find first attribute " - "extent of mft data attribute."); - goto restore_undo_alloc; - } - a = ctx->attr; - } - write_lock_irqsave(&mft_ni->size_lock, flags); - mft_ni->allocated_size += nr << vol->cluster_size_bits; - a->data.non_resident.allocated_size = - cpu_to_sle64(mft_ni->allocated_size); - write_unlock_irqrestore(&mft_ni->size_lock, flags); - /* Ensure the changes make it to disk. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - up_write(&mft_ni->runlist.lock); - ntfs_debug("Done."); - return 0; -restore_undo_alloc: - ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, - CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "mft data attribute.%s", es); - write_lock_irqsave(&mft_ni->size_lock, flags); - mft_ni->allocated_size += nr << vol->cluster_size_bits; - write_unlock_irqrestore(&mft_ni->size_lock, flags); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - up_write(&mft_ni->runlist.lock); - /* - * The only thing that is now wrong is ->allocated_size of the - * base attribute extent which chkdsk should be able to fix. 
- */ - NVolSetErrors(vol); - return ret; - } - ctx->attr->data.non_resident.highest_vcn = - cpu_to_sle64(old_last_vcn - 1); -undo_alloc: - if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) { - ntfs_error(vol->sb, "Failed to free clusters from mft data " - "attribute.%s", es); - NVolSetErrors(vol); - } - - if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { - ntfs_error(vol->sb, "Failed to truncate mft data attribute " - "runlist.%s", es); - NVolSetErrors(vol); - } - if (ctx) { - a = ctx->attr; - if (mp_rebuilt && !IS_ERR(ctx->mrec)) { - if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - old_alen - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - rl2, ll, -1, NULL)) { - ntfs_error(vol->sb, "Failed to restore mapping pairs " - "array.%s", es); - NVolSetErrors(vol); - } - if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { - ntfs_error(vol->sb, "Failed to restore attribute " - "record.%s", es); - NVolSetErrors(vol); - } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } else if (IS_ERR(ctx->mrec)) { - ntfs_error(vol->sb, "Failed to restore attribute search " - "context.%s", es); - NVolSetErrors(vol); - } - ntfs_attr_put_search_ctx(ctx); - } - if (!IS_ERR(mrec)) - unmap_mft_record(mft_ni); - up_write(&mft_ni->runlist.lock); - return ret; -} - -/** - * ntfs_mft_record_layout - layout an mft record into a memory buffer - * @vol: volume to which the mft record will belong - * @mft_no: mft reference specifying the mft record number - * @m: destination buffer of size >= @vol->mft_record_size bytes - * - * Layout an empty, unused mft record with the mft record number @mft_no into - * the buffer @m. The volume @vol is needed because the mft record structure - * was modified in NTFS 3.1 so we need to know which volume version this mft - * record will be used on. - * - * Return 0 on success and -errno on error. 
- */ -static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no, - MFT_RECORD *m) -{ - ATTR_RECORD *a; - - ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); - if (mft_no >= (1ll << 32)) { - ntfs_error(vol->sb, "Mft record number 0x%llx exceeds " - "maximum of 2^32.", (long long)mft_no); - return -ERANGE; - } - /* Start by clearing the whole mft record to gives us a clean slate. */ - memset(m, 0, vol->mft_record_size); - /* Aligned to 2-byte boundary. */ - if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver)) - m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1); - else { - m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1); - /* - * Set the NTFS 3.1+ specific fields while we know that the - * volume version is 3.1+. - */ - m->reserved = 0; - m->mft_record_number = cpu_to_le32((u32)mft_no); - } - m->magic = magic_FILE; - if (vol->mft_record_size >= NTFS_BLOCK_SIZE) - m->usa_count = cpu_to_le16(vol->mft_record_size / - NTFS_BLOCK_SIZE + 1); - else { - m->usa_count = cpu_to_le16(1); - ntfs_warning(vol->sb, "Sector size is bigger than mft record " - "size. Setting usa_count to 1. If chkdsk " - "reports this as corruption, please email " - "linux-ntfs-dev@lists.sourceforge.net stating " - "that you saw this message and that the " - "modified filesystem created was corrupt. " - "Thank you."); - } - /* Set the update sequence number to 1. */ - *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1); - m->lsn = 0; - m->sequence_number = cpu_to_le16(1); - m->link_count = 0; - /* - * Place the attributes straight after the update sequence array, - * aligned to 8-byte boundary. - */ - m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) + - (le16_to_cpu(m->usa_count) << 1) + 7) & ~7); - m->flags = 0; - /* - * Using attrs_offset plus eight bytes (for the termination attribute). - * attrs_offset is already aligned to 8-byte boundary, so no need to - * align again. 
- */ - m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8); - m->bytes_allocated = cpu_to_le32(vol->mft_record_size); - m->base_mft_record = 0; - m->next_attr_instance = 0; - /* Add the termination attribute. */ - a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset)); - a->type = AT_END; - a->length = 0; - ntfs_debug("Done."); - return 0; -} - -/** - * ntfs_mft_record_format - format an mft record on an ntfs volume - * @vol: volume on which to format the mft record - * @mft_no: mft record number to format - * - * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused - * mft record into the appropriate place of the mft data attribute. This is - * used when extending the mft data attribute. - * - * Return 0 on success and -errno on error. - */ -static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no) -{ - loff_t i_size; - struct inode *mft_vi = vol->mft_ino; - struct page *page; - MFT_RECORD *m; - pgoff_t index, end_index; - unsigned int ofs; - int err; - - ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); - /* - * The index into the page cache and the offset within the page cache - * page of the wanted mft record. - */ - index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT; - ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; - /* The maximum valid index into the page cache for $MFT's data. */ - i_size = i_size_read(mft_vi); - end_index = i_size >> PAGE_SHIFT; - if (unlikely(index >= end_index)) { - if (unlikely(index > end_index || ofs + vol->mft_record_size >= - (i_size & ~PAGE_MASK))) { - ntfs_error(vol->sb, "Tried to format non-existing mft " - "record 0x%llx.", (long long)mft_no); - return -ENOENT; - } - } - /* Read, map, and pin the page containing the mft record. 
*/ - page = ntfs_map_page(mft_vi->i_mapping, index); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to map page containing mft record " - "to format 0x%llx.", (long long)mft_no); - return PTR_ERR(page); - } - lock_page(page); - BUG_ON(!PageUptodate(page)); - ClearPageUptodate(page); - m = (MFT_RECORD*)((u8*)page_address(page) + ofs); - err = ntfs_mft_record_layout(vol, mft_no, m); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.", - (long long)mft_no); - SetPageUptodate(page); - unlock_page(page); - ntfs_unmap_page(page); - return err; - } - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - /* - * Make sure the mft record is written out to disk. We could use - * ilookup5() to check if an inode is in icache and so on but this is - * unnecessary as ntfs_writepage() will write the dirty record anyway. - */ - mark_ntfs_record_dirty(page, ofs); - ntfs_unmap_page(page); - ntfs_debug("Done."); - return 0; -} - -/** - * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume - * @vol: [IN] volume on which to allocate the mft record - * @mode: [IN] mode if want a file or directory, i.e. base inode or 0 - * @base_ni: [IN] open base inode if allocating an extent mft record or NULL - * @mrec: [OUT] on successful return this is the mapped mft record - * - * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol. - * - * If @base_ni is NULL make the mft record a base mft record, i.e. a file or - * direvctory inode, and allocate it at the default allocator position. In - * this case @mode is the file mode as given to us by the caller. We in - * particular use @mode to distinguish whether a file or a directory is being - * created (S_IFDIR(mode) and S_IFREG(mode), respectively). - * - * If @base_ni is not NULL make the allocated mft record an extent record, - * allocate it starting at the mft record after the base mft record and attach - * the allocated and opened ntfs inode to the base inode @base_ni. 
In this - * case @mode must be 0 as it is meaningless for extent inodes. - * - * You need to check the return value with IS_ERR(). If false, the function - * was successful and the return value is the now opened ntfs inode of the - * allocated mft record. *@mrec is then set to the allocated, mapped, pinned, - * and locked mft record. If IS_ERR() is true, the function failed and the - * error code is obtained from PTR_ERR(return value). *@mrec is undefined in - * this case. - * - * Allocation strategy: - * - * To find a free mft record, we scan the mft bitmap for a zero bit. To - * optimize this we start scanning at the place specified by @base_ni or if - * @base_ni is NULL we start where we last stopped and we perform wrap around - * when we reach the end. Note, we do not try to allocate mft records below - * number 24 because numbers 0 to 15 are the defined system files anyway and 16 - * to 24 are special in that they are used for storing extension mft records - * for the $DATA attribute of $MFT. This is required to avoid the possibility - * of creating a runlist with a circular dependency which once written to disk - * can never be read in again. Windows will only use records 16 to 24 for - * normal files if the volume is completely out of space. We never use them - * which means that when the volume is really out of space we cannot create any - * more files while Windows can still create up to 8 small files. We can start - * doing this at some later time, it does not matter much for now. - * - * When scanning the mft bitmap, we only search up to the last allocated mft - * record. If there are no free records left in the range 24 to number of - * allocated mft records, then we extend the $MFT/$DATA attribute in order to - * create free mft records. We extend the allocated size of $MFT/$DATA by 16 - * records at a time or one cluster, if cluster size is above 16kiB. 
If there - * is not sufficient space to do this, we try to extend by a single mft record - * or one cluster, if cluster size is above the mft record size. - * - * No matter how many mft records we allocate, we initialize only the first - * allocated mft record, incrementing mft data size and initialized size - * accordingly, open an ntfs_inode for it and return it to the caller, unless - * there are less than 24 mft records, in which case we allocate and initialize - * mft records until we reach record 24 which we consider as the first free mft - * record for use by normal files. - * - * If during any stage we overflow the initialized data in the mft bitmap, we - * extend the initialized size (and data size) by 8 bytes, allocating another - * cluster if required. The bitmap data size has to be at least equal to the - * number of mft records in the mft, but it can be bigger, in which case the - * superflous bits are padded with zeroes. - * - * Thus, when we return successfully (IS_ERR() is false), we will have: - * - initialized / extended the mft bitmap if necessary, - * - initialized / extended the mft data if necessary, - * - set the bit corresponding to the mft record being allocated in the - * mft bitmap, - * - opened an ntfs_inode for the allocated mft record, and we will have - * - returned the ntfs_inode as well as the allocated mapped, pinned, and - * locked mft record. - * - * On error, the volume will be left in a consistent state and no record will - * be allocated. If rolling back a partial operation fails, we may leave some - * inconsistent metadata in which case we set NVolErrors() so the volume is - * left dirty when unmounted. 
- * - * Note, this function cannot make use of most of the normal functions, like - * for example for attribute resizing, etc, because when the run list overflows - * the base mft record and an attribute list is used, it is very important that - * the extension mft records used to store the $DATA attribute of $MFT can be - * reached without having to read the information contained inside them, as - * this would make it impossible to find them in the first place after the - * volume is unmounted. $MFT/$BITMAP probably does not need to follow this - * rule because the bitmap is not essential for finding the mft records, but on - * the other hand, handling the bitmap in this special way would make life - * easier because otherwise there might be circular invocations of functions - * when reading the bitmap. - */ -ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, - ntfs_inode *base_ni, MFT_RECORD **mrec) -{ - s64 ll, bit, old_data_initialized, old_data_size; - unsigned long flags; - struct inode *vi; - struct page *page; - ntfs_inode *mft_ni, *mftbmp_ni, *ni; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - pgoff_t index; - unsigned int ofs; - int err; - le16 seq_no, usn; - bool record_formatted = false; - - if (base_ni) { - ntfs_debug("Entering (allocating an extent mft record for " - "base mft record 0x%llx).", - (long long)base_ni->mft_no); - /* @mode and @base_ni are mutually exclusive. */ - BUG_ON(mode); - } else - ntfs_debug("Entering (allocating a base mft record)."); - if (mode) { - /* @mode and @base_ni are mutually exclusive. */ - BUG_ON(base_ni); - /* We only support creation of normal files and directories. 
*/ - if (!S_ISREG(mode) && !S_ISDIR(mode)) - return ERR_PTR(-EOPNOTSUPP); - } - BUG_ON(!mrec); - mft_ni = NTFS_I(vol->mft_ino); - mftbmp_ni = NTFS_I(vol->mftbmp_ino); - down_write(&vol->mftbmp_lock); - bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni); - if (bit >= 0) { - ntfs_debug("Found and allocated free record (#1), bit 0x%llx.", - (long long)bit); - goto have_alloc_rec; - } - if (bit != -ENOSPC) { - up_write(&vol->mftbmp_lock); - return ERR_PTR(bit); - } - /* - * No free mft records left. If the mft bitmap already covers more - * than the currently used mft records, the next records are all free, - * so we can simply allocate the first unused mft record. - * Note: We also have to make sure that the mft bitmap at least covers - * the first 24 mft records as they are special and whilst they may not - * be in use, we do not allocate from them. - */ - read_lock_irqsave(&mft_ni->size_lock, flags); - ll = mft_ni->initialized_size >> vol->mft_record_size_bits; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - old_data_initialized = mftbmp_ni->initialized_size; - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - if (old_data_initialized << 3 > ll && old_data_initialized > 3) { - bit = ll; - if (bit < 24) - bit = 24; - if (unlikely(bit >= (1ll << 32))) - goto max_err_out; - ntfs_debug("Found free record (#2), bit 0x%llx.", - (long long)bit); - goto found_free_rec; - } - /* - * The mft bitmap needs to be expanded until it covers the first unused - * mft record that we can allocate. - * Note: The smallest mft record we allocate is mft record 24. 
- */ - bit = old_data_initialized << 3; - if (unlikely(bit >= (1ll << 32))) - goto max_err_out; - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - old_data_size = mftbmp_ni->allocated_size; - ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, " - "data_size 0x%llx, initialized_size 0x%llx.", - (long long)old_data_size, - (long long)i_size_read(vol->mftbmp_ino), - (long long)old_data_initialized); - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); - if (old_data_initialized + 8 > old_data_size) { - /* Need to extend bitmap by one more cluster. */ - ntfs_debug("mftbmp: initialized_size + 8 > allocated_size."); - err = ntfs_mft_bitmap_extend_allocation_nolock(vol); - if (unlikely(err)) { - up_write(&vol->mftbmp_lock); - goto err_out; - } -#ifdef DEBUG - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - ntfs_debug("Status of mftbmp after allocation extension: " - "allocated_size 0x%llx, data_size 0x%llx, " - "initialized_size 0x%llx.", - (long long)mftbmp_ni->allocated_size, - (long long)i_size_read(vol->mftbmp_ino), - (long long)mftbmp_ni->initialized_size); - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); -#endif /* DEBUG */ - } - /* - * We now have sufficient allocated space, extend the initialized_size - * as well as the data_size if necessary and fill the new space with - * zeroes. 
- */ - err = ntfs_mft_bitmap_extend_initialized_nolock(vol); - if (unlikely(err)) { - up_write(&vol->mftbmp_lock); - goto err_out; - } -#ifdef DEBUG - read_lock_irqsave(&mftbmp_ni->size_lock, flags); - ntfs_debug("Status of mftbmp after initialized extension: " - "allocated_size 0x%llx, data_size 0x%llx, " - "initialized_size 0x%llx.", - (long long)mftbmp_ni->allocated_size, - (long long)i_size_read(vol->mftbmp_ino), - (long long)mftbmp_ni->initialized_size); - read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); -#endif /* DEBUG */ - ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit); -found_free_rec: - /* @bit is the found free mft record, allocate it in the mft bitmap. */ - ntfs_debug("At found_free_rec."); - err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap."); - up_write(&vol->mftbmp_lock); - goto err_out; - } - ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit); -have_alloc_rec: - /* - * The mft bitmap is now uptodate. Deal with mft data attribute now. - * Note, we keep hold of the mft bitmap lock for writing until all - * modifications to the mft data attribute are complete, too, as they - * will impact decisions for mft bitmap and mft record allocation done - * by a parallel allocation and if the lock is not maintained a - * parallel allocation could allocate the same mft record as this one. - */ - ll = (bit + 1) << vol->mft_record_size_bits; - read_lock_irqsave(&mft_ni->size_lock, flags); - old_data_initialized = mft_ni->initialized_size; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - if (ll <= old_data_initialized) { - ntfs_debug("Allocated mft record already initialized."); - goto mft_rec_already_initialized; - } - ntfs_debug("Initializing allocated mft record."); - /* - * The mft record is outside the initialized data. Extend the mft data - * attribute until it covers the allocated record. 
The loop is only - * actually traversed more than once when a freshly formatted volume is - * first written to so it optimizes away nicely in the common case. - */ - read_lock_irqsave(&mft_ni->size_lock, flags); - ntfs_debug("Status of mft data before extension: " - "allocated_size 0x%llx, data_size 0x%llx, " - "initialized_size 0x%llx.", - (long long)mft_ni->allocated_size, - (long long)i_size_read(vol->mft_ino), - (long long)mft_ni->initialized_size); - while (ll > mft_ni->allocated_size) { - read_unlock_irqrestore(&mft_ni->size_lock, flags); - err = ntfs_mft_data_extend_allocation_nolock(vol); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to extend mft data " - "allocation."); - goto undo_mftbmp_alloc_nolock; - } - read_lock_irqsave(&mft_ni->size_lock, flags); - ntfs_debug("Status of mft data after allocation extension: " - "allocated_size 0x%llx, data_size 0x%llx, " - "initialized_size 0x%llx.", - (long long)mft_ni->allocated_size, - (long long)i_size_read(vol->mft_ino), - (long long)mft_ni->initialized_size); - } - read_unlock_irqrestore(&mft_ni->size_lock, flags); - /* - * Extend mft data initialized size (and data size of course) to reach - * the allocated mft record, formatting the mft records allong the way. - * Note: We only modify the ntfs_inode structure as that is all that is - * needed by ntfs_mft_record_format(). We will update the attribute - * record itself in one fell swoop later on. 
- */ - write_lock_irqsave(&mft_ni->size_lock, flags); - old_data_initialized = mft_ni->initialized_size; - old_data_size = vol->mft_ino->i_size; - while (ll > mft_ni->initialized_size) { - s64 new_initialized_size, mft_no; - - new_initialized_size = mft_ni->initialized_size + - vol->mft_record_size; - mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits; - if (new_initialized_size > i_size_read(vol->mft_ino)) - i_size_write(vol->mft_ino, new_initialized_size); - write_unlock_irqrestore(&mft_ni->size_lock, flags); - ntfs_debug("Initializing mft record 0x%llx.", - (long long)mft_no); - err = ntfs_mft_record_format(vol, mft_no); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to format mft record."); - goto undo_data_init; - } - write_lock_irqsave(&mft_ni->size_lock, flags); - mft_ni->initialized_size = new_initialized_size; - } - write_unlock_irqrestore(&mft_ni->size_lock, flags); - record_formatted = true; - /* Update the mft data attribute record to reflect the new sizes. */ - m = map_mft_record(mft_ni); - if (IS_ERR(m)) { - ntfs_error(vol->sb, "Failed to map mft record."); - err = PTR_ERR(m); - goto undo_data_init; - } - ctx = ntfs_attr_get_search_ctx(mft_ni, m); - if (unlikely(!ctx)) { - ntfs_error(vol->sb, "Failed to get search context."); - err = -ENOMEM; - unmap_mft_record(mft_ni); - goto undo_data_init; - } - err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to find first attribute extent of " - "mft data attribute."); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - goto undo_data_init; - } - a = ctx->attr; - read_lock_irqsave(&mft_ni->size_lock, flags); - a->data.non_resident.initialized_size = - cpu_to_sle64(mft_ni->initialized_size); - a->data.non_resident.data_size = - cpu_to_sle64(i_size_read(vol->mft_ino)); - read_unlock_irqrestore(&mft_ni->size_lock, flags); - /* Ensure the changes make it to disk. 
*/ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(mft_ni); - read_lock_irqsave(&mft_ni->size_lock, flags); - ntfs_debug("Status of mft data after mft record initialization: " - "allocated_size 0x%llx, data_size 0x%llx, " - "initialized_size 0x%llx.", - (long long)mft_ni->allocated_size, - (long long)i_size_read(vol->mft_ino), - (long long)mft_ni->initialized_size); - BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size); - BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino)); - read_unlock_irqrestore(&mft_ni->size_lock, flags); -mft_rec_already_initialized: - /* - * We can finally drop the mft bitmap lock as the mft data attribute - * has been fully updated. The only disparity left is that the - * allocated mft record still needs to be marked as in use to match the - * set bit in the mft bitmap but this is actually not a problem since - * this mft record is not referenced from anywhere yet and the fact - * that it is allocated in the mft bitmap means that no-one will try to - * allocate it either. - */ - up_write(&vol->mftbmp_lock); - /* - * We now have allocated and initialized the mft record. Calculate the - * index of and the offset within the page cache page the record is in. - */ - index = bit << vol->mft_record_size_bits >> PAGE_SHIFT; - ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK; - /* Read, map, and pin the page containing the mft record. */ - page = ntfs_map_page(vol->mft_ino->i_mapping, index); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to map page containing allocated " - "mft record 0x%llx.", (long long)bit); - err = PTR_ERR(page); - goto undo_mftbmp_alloc; - } - lock_page(page); - BUG_ON(!PageUptodate(page)); - ClearPageUptodate(page); - m = (MFT_RECORD*)((u8*)page_address(page) + ofs); - /* If we just formatted the mft record no need to do it again. 
*/ - if (!record_formatted) { - /* Sanity check that the mft record is really not in use. */ - if (ntfs_is_file_record(m->magic) && - (m->flags & MFT_RECORD_IN_USE)) { - ntfs_error(vol->sb, "Mft record 0x%llx was marked " - "free in mft bitmap but is marked " - "used itself. Corrupt filesystem. " - "Unmount and run chkdsk.", - (long long)bit); - err = -EIO; - SetPageUptodate(page); - unlock_page(page); - ntfs_unmap_page(page); - NVolSetErrors(vol); - goto undo_mftbmp_alloc; - } - /* - * We need to (re-)format the mft record, preserving the - * sequence number if it is not zero as well as the update - * sequence number if it is not zero or -1 (0xffff). This - * means we do not need to care whether or not something went - * wrong with the previous mft record. - */ - seq_no = m->sequence_number; - usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)); - err = ntfs_mft_record_layout(vol, bit, m); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to layout allocated mft " - "record 0x%llx.", (long long)bit); - SetPageUptodate(page); - unlock_page(page); - ntfs_unmap_page(page); - goto undo_mftbmp_alloc; - } - if (seq_no) - m->sequence_number = seq_no; - if (usn && le16_to_cpu(usn) != 0xffff) - *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn; - } - /* Set the mft record itself in use. */ - m->flags |= MFT_RECORD_IN_USE; - if (S_ISDIR(mode)) - m->flags |= MFT_RECORD_IS_DIRECTORY; - flush_dcache_page(page); - SetPageUptodate(page); - if (base_ni) { - MFT_RECORD *m_tmp; - - /* - * Setup the base mft record in the extent mft record. This - * completes initialization of the allocated extent mft record - * and we can simply use it with map_extent_mft_record(). - */ - m->base_mft_record = MK_LE_MREF(base_ni->mft_no, - base_ni->seq_no); - /* - * Allocate an extent inode structure for the new mft record, - * attach it to the base inode @base_ni and map, pin, and lock - * its, i.e. the allocated, mft record. 
- */ - m_tmp = map_extent_mft_record(base_ni, bit, &ni); - if (IS_ERR(m_tmp)) { - ntfs_error(vol->sb, "Failed to map allocated extent " - "mft record 0x%llx.", (long long)bit); - err = PTR_ERR(m_tmp); - /* Set the mft record itself not in use. */ - m->flags &= cpu_to_le16( - ~le16_to_cpu(MFT_RECORD_IN_USE)); - flush_dcache_page(page); - /* Make sure the mft record is written out to disk. */ - mark_ntfs_record_dirty(page, ofs); - unlock_page(page); - ntfs_unmap_page(page); - goto undo_mftbmp_alloc; - } - BUG_ON(m != m_tmp); - /* - * Make sure the allocated mft record is written out to disk. - * No need to set the inode dirty because the caller is going - * to do that anyway after finishing with the new extent mft - * record (e.g. at a minimum a new attribute will be added to - * the mft record. - */ - mark_ntfs_record_dirty(page, ofs); - unlock_page(page); - /* - * Need to unmap the page since map_extent_mft_record() mapped - * it as well so we have it mapped twice at the moment. - */ - ntfs_unmap_page(page); - } else { - /* - * Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink - * is set to 1 but the mft record->link_count is 0. The caller - * needs to bear this in mind. - */ - vi = new_inode(vol->sb); - if (unlikely(!vi)) { - err = -ENOMEM; - /* Set the mft record itself not in use. */ - m->flags &= cpu_to_le16( - ~le16_to_cpu(MFT_RECORD_IN_USE)); - flush_dcache_page(page); - /* Make sure the mft record is written out to disk. */ - mark_ntfs_record_dirty(page, ofs); - unlock_page(page); - ntfs_unmap_page(page); - goto undo_mftbmp_alloc; - } - vi->i_ino = bit; - - /* The owner and group come from the ntfs volume. */ - vi->i_uid = vol->uid; - vi->i_gid = vol->gid; - - /* Initialize the ntfs specific part of @vi. */ - ntfs_init_big_inode(vi); - ni = NTFS_I(vi); - /* - * Set the appropriate mode, attribute type, and name. For - * directories, also setup the index values to the defaults. 
- */ - if (S_ISDIR(mode)) { - vi->i_mode = S_IFDIR | S_IRWXUGO; - vi->i_mode &= ~vol->dmask; - - NInoSetMstProtected(ni); - ni->type = AT_INDEX_ALLOCATION; - ni->name = I30; - ni->name_len = 4; - - ni->itype.index.block_size = 4096; - ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1; - ni->itype.index.collation_rule = COLLATION_FILE_NAME; - if (vol->cluster_size <= ni->itype.index.block_size) { - ni->itype.index.vcn_size = vol->cluster_size; - ni->itype.index.vcn_size_bits = - vol->cluster_size_bits; - } else { - ni->itype.index.vcn_size = vol->sector_size; - ni->itype.index.vcn_size_bits = - vol->sector_size_bits; - } - } else { - vi->i_mode = S_IFREG | S_IRWXUGO; - vi->i_mode &= ~vol->fmask; - - ni->type = AT_DATA; - ni->name = NULL; - ni->name_len = 0; - } - if (IS_RDONLY(vi)) - vi->i_mode &= ~S_IWUGO; - - /* Set the inode times to the current time. */ - simple_inode_init_ts(vi); - /* - * Set the file size to 0, the ntfs inode sizes are set to 0 by - * the call to ntfs_init_big_inode() below. - */ - vi->i_size = 0; - vi->i_blocks = 0; - - /* Set the sequence number. */ - vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); - /* - * Manually map, pin, and lock the mft record as we already - * have its page mapped and it is very easy to do. - */ - atomic_inc(&ni->count); - mutex_lock(&ni->mrec_lock); - ni->page = page; - ni->page_ofs = ofs; - /* - * Make sure the allocated mft record is written out to disk. - * NOTE: We do not set the ntfs inode dirty because this would - * fail in ntfs_write_inode() because the inode does not have a - * standard information attribute yet. Also, there is no need - * to set the inode dirty because the caller is going to do - * that anyway after finishing with the new mft record (e.g. at - * a minimum some new attributes will be added to the mft - * record. - */ - mark_ntfs_record_dirty(page, ofs); - unlock_page(page); - - /* Add the inode to the inode hash for the superblock. 
*/ - insert_inode_hash(vi); - - /* Update the default mft allocation position. */ - vol->mft_data_pos = bit + 1; - } - /* - * Return the opened, allocated inode of the allocated mft record as - * well as the mapped, pinned, and locked mft record. - */ - ntfs_debug("Returning opened, allocated %sinode 0x%llx.", - base_ni ? "extent " : "", (long long)bit); - *mrec = m; - return ni; -undo_data_init: - write_lock_irqsave(&mft_ni->size_lock, flags); - mft_ni->initialized_size = old_data_initialized; - i_size_write(vol->mft_ino, old_data_size); - write_unlock_irqrestore(&mft_ni->size_lock, flags); - goto undo_mftbmp_alloc_nolock; -undo_mftbmp_alloc: - down_write(&vol->mftbmp_lock); -undo_mftbmp_alloc_nolock: - if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) { - ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); - NVolSetErrors(vol); - } - up_write(&vol->mftbmp_lock); -err_out: - return ERR_PTR(err); -max_err_out: - ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum " - "number of inodes (2^32) has already been reached."); - up_write(&vol->mftbmp_lock); - return ERR_PTR(-ENOSPC); -} - -/** - * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume - * @ni: ntfs inode of the mapped extent mft record to free - * @m: mapped extent mft record of the ntfs inode @ni - * - * Free the mapped extent mft record @m of the extent ntfs inode @ni. - * - * Note that this function unmaps the mft record and closes and destroys @ni - * internally and hence you cannot use either @ni nor @m any more after this - * function returns success. - * - * On success return 0 and on error return -errno. @ni and @m are still valid - * in this case and have not been freed. - * - * For some errors an error message is displayed and the success code 0 is - * returned and the volume is then left dirty on umount. 
This makes sense in - * case we could not rollback the changes that were already done since the - * caller no longer wants to reference this mft record so it does not matter to - * the caller if something is wrong with it as long as it is properly detached - * from the base inode. - */ -int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m) -{ - unsigned long mft_no = ni->mft_no; - ntfs_volume *vol = ni->vol; - ntfs_inode *base_ni; - ntfs_inode **extent_nis; - int i, err; - le16 old_seq_no; - u16 seq_no; - - BUG_ON(NInoAttr(ni)); - BUG_ON(ni->nr_extents != -1); - - mutex_lock(&ni->extent_lock); - base_ni = ni->ext.base_ntfs_ino; - mutex_unlock(&ni->extent_lock); - - BUG_ON(base_ni->nr_extents <= 0); - - ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n", - mft_no, base_ni->mft_no); - - mutex_lock(&base_ni->extent_lock); - - /* Make sure we are holding the only reference to the extent inode. */ - if (atomic_read(&ni->count) > 2) { - ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, " - "not freeing.", base_ni->mft_no); - mutex_unlock(&base_ni->extent_lock); - return -EBUSY; - } - - /* Dissociate the ntfs inode from the base inode. */ - extent_nis = base_ni->ext.extent_ntfs_inos; - err = -ENOENT; - for (i = 0; i < base_ni->nr_extents; i++) { - if (ni != extent_nis[i]) - continue; - extent_nis += i; - base_ni->nr_extents--; - memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) * - sizeof(ntfs_inode*)); - err = 0; - break; - } - - mutex_unlock(&base_ni->extent_lock); - - if (unlikely(err)) { - ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to " - "its base inode 0x%lx.", mft_no, - base_ni->mft_no); - BUG(); - } - - /* - * The extent inode is no longer attached to the base inode so no one - * can get a reference to it any more. - */ - - /* Mark the mft record as not in use. */ - m->flags &= ~MFT_RECORD_IN_USE; - - /* Increment the sequence number, skipping zero, if it is not zero. 
*/ - old_seq_no = m->sequence_number; - seq_no = le16_to_cpu(old_seq_no); - if (seq_no == 0xffff) - seq_no = 1; - else if (seq_no) - seq_no++; - m->sequence_number = cpu_to_le16(seq_no); - - /* - * Set the ntfs inode dirty and write it out. We do not need to worry - * about the base inode here since whatever caused the extent mft - * record to be freed is guaranteed to do it already. - */ - NInoSetDirty(ni); - err = write_mft_record(ni, m, 0); - if (unlikely(err)) { - ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not " - "freeing.", mft_no); - goto rollback; - } -rollback_error: - /* Unmap and throw away the now freed extent inode. */ - unmap_extent_mft_record(ni); - ntfs_clear_extent_inode(ni); - - /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */ - down_write(&vol->mftbmp_lock); - err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no); - up_write(&vol->mftbmp_lock); - if (unlikely(err)) { - /* - * The extent inode is gone but we failed to deallocate it in - * the mft bitmap. Just emit a warning and leave the volume - * dirty on umount. - */ - ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); - NVolSetErrors(vol); - } - return 0; -rollback: - /* Rollback what we did... 
*/ - mutex_lock(&base_ni->extent_lock); - extent_nis = base_ni->ext.extent_ntfs_inos; - if (!(base_ni->nr_extents & 3)) { - int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*); - - extent_nis = kmalloc(new_size, GFP_NOFS); - if (unlikely(!extent_nis)) { - ntfs_error(vol->sb, "Failed to allocate internal " - "buffer during rollback.%s", es); - mutex_unlock(&base_ni->extent_lock); - NVolSetErrors(vol); - goto rollback_error; - } - if (base_ni->nr_extents) { - BUG_ON(!base_ni->ext.extent_ntfs_inos); - memcpy(extent_nis, base_ni->ext.extent_ntfs_inos, - new_size - 4 * sizeof(ntfs_inode*)); - kfree(base_ni->ext.extent_ntfs_inos); - } - base_ni->ext.extent_ntfs_inos = extent_nis; - } - m->flags |= MFT_RECORD_IN_USE; - m->sequence_number = old_seq_no; - extent_nis[base_ni->nr_extents++] = ni; - mutex_unlock(&base_ni->extent_lock); - mark_mft_record_dirty(ni); - return err; -} -#endif /* NTFS_RW */ diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h deleted file mode 100644 index 49c001af16ed..000000000000 --- a/fs/ntfs/mft.h +++ /dev/null @@ -1,110 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * mft.h - Defines for mft record handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_MFT_H -#define _LINUX_NTFS_MFT_H - -#include <linux/fs.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> - -#include "inode.h" - -extern MFT_RECORD *map_mft_record(ntfs_inode *ni); -extern void unmap_mft_record(ntfs_inode *ni); - -extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, - ntfs_inode **ntfs_ino); - -static inline void unmap_extent_mft_record(ntfs_inode *ni) -{ - unmap_mft_record(ni); - return; -} - -#ifdef NTFS_RW - -/** - * flush_dcache_mft_record_page - flush_dcache_page() for mft records - * @ni: ntfs inode structure of mft record - * - * Call flush_dcache_page() for the page in which an mft record resides. 
- * - * This must be called every time an mft record is modified, just after the - * modification. - */ -static inline void flush_dcache_mft_record_page(ntfs_inode *ni) -{ - flush_dcache_page(ni->page); -} - -extern void __mark_mft_record_dirty(ntfs_inode *ni); - -/** - * mark_mft_record_dirty - set the mft record and the page containing it dirty - * @ni: ntfs inode describing the mapped mft record - * - * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, - * as well as the page containing the mft record, dirty. Also, mark the base - * vfs inode dirty. This ensures that any changes to the mft record are - * written out to disk. - * - * NOTE: Do not do anything if the mft record is already marked dirty. - */ -static inline void mark_mft_record_dirty(ntfs_inode *ni) -{ - if (!NInoTestSetDirty(ni)) - __mark_mft_record_dirty(ni); -} - -extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, - MFT_RECORD *m, int sync); - -extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync); - -/** - * write_mft_record - write out a mapped (extent) mft record - * @ni: ntfs inode describing the mapped (extent) mft record - * @m: mapped (extent) mft record to write - * @sync: if true, wait for i/o completion - * - * This is just a wrapper for write_mft_record_nolock() (see mft.c), which - * locks the page for the duration of the write. This ensures that there are - * no race conditions between writing the mft record via the dirty inode code - * paths and via the page cache write back code paths or between writing - * neighbouring mft records residing in the same page. - * - * Locking the page also serializes us against ->read_folio() if the page is not - * uptodate. - * - * On success, clean the mft record and return 0. On error, leave the mft - * record dirty and return -errno. 
- */ -static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync) -{ - struct page *page = ni->page; - int err; - - BUG_ON(!page); - lock_page(page); - err = write_mft_record_nolock(ni, m, sync); - unlock_page(page); - return err; -} - -extern bool ntfs_may_write_mft_record(ntfs_volume *vol, - const unsigned long mft_no, const MFT_RECORD *m, - ntfs_inode **locked_ni); - -extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, - ntfs_inode *base_ni, MFT_RECORD **mrec); -extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_MFT_H */ diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c deleted file mode 100644 index 16b3c884abfc..000000000000 --- a/fs/ntfs/mst.c +++ /dev/null @@ -1,189 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * mst.c - NTFS multi sector transfer protection handling code. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2001-2004 Anton Altaparmakov - */ - -#include "ntfs.h" - -/** - * post_read_mst_fixup - deprotect multi sector transfer protected data - * @b: pointer to the data to deprotect - * @size: size in bytes of @b - * - * Perform the necessary post read multi sector transfer fixup and detect the - * presence of incomplete multi sector transfers. - In that case, overwrite the - * magic of the ntfs record header being processed with "BAAD" (in memory only!) - * and abort processing. - * - * Return 0 on success and -EINVAL on error ("BAAD" magic will be present). - * - * NOTE: We consider the absence / invalidity of an update sequence array to - * mean that the structure is not protected at all and hence doesn't need to - * be fixed up. Thus, we return success and not failure in this case. This is - * in contrast to pre_write_mst_fixup(), see below. - */ -int post_read_mst_fixup(NTFS_RECORD *b, const u32 size) -{ - u16 usa_ofs, usa_count, usn; - u16 *usa_pos, *data_pos; - - /* Setup the variables. 
*/ - usa_ofs = le16_to_cpu(b->usa_ofs); - /* Decrement usa_count to get number of fixups. */ - usa_count = le16_to_cpu(b->usa_count) - 1; - /* Size and alignment checks. */ - if ( size & (NTFS_BLOCK_SIZE - 1) || - usa_ofs & 1 || - usa_ofs + (usa_count * 2) > size || - (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) - return 0; - /* Position of usn in update sequence array. */ - usa_pos = (u16*)b + usa_ofs/sizeof(u16); - /* - * The update sequence number which has to be equal to each of the - * u16 values before they are fixed up. Note no need to care for - * endianness since we are comparing and moving data for on disk - * structures which means the data is consistent. - If it is - * consistenty the wrong endianness it doesn't make any difference. - */ - usn = *usa_pos; - /* - * Position in protected data of first u16 that needs fixing up. - */ - data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; - /* - * Check for incomplete multi sector transfer(s). - */ - while (usa_count--) { - if (*data_pos != usn) { - /* - * Incomplete multi sector transfer detected! )-: - * Set the magic to "BAAD" and return failure. - * Note that magic_BAAD is already converted to le32. - */ - b->magic = magic_BAAD; - return -EINVAL; - } - data_pos += NTFS_BLOCK_SIZE/sizeof(u16); - } - /* Re-setup the variables. */ - usa_count = le16_to_cpu(b->usa_count) - 1; - data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; - /* Fixup all sectors. */ - while (usa_count--) { - /* - * Increment position in usa and restore original data from - * the usa into the data buffer. - */ - *data_pos = *(++usa_pos); - /* Increment position in data as well. */ - data_pos += NTFS_BLOCK_SIZE/sizeof(u16); - } - return 0; -} - -/** - * pre_write_mst_fixup - apply multi sector transfer protection - * @b: pointer to the data to protect - * @size: size in bytes of @b - * - * Perform the necessary pre write multi sector transfer fixup on the data - * pointer to by @b of @size. 
- * - * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed - * (assumed not needed). This is in contrast to post_read_mst_fixup() above. - * - * NOTE: We consider the absence / invalidity of an update sequence array to - * mean that the structure is not subject to protection and hence doesn't need - * to be fixed up. This means that you have to create a valid update sequence - * array header in the ntfs record before calling this function, otherwise it - * will fail (the header needs to contain the position of the update sequence - * array together with the number of elements in the array). You also need to - * initialise the update sequence number before calling this function - * otherwise a random word will be used (whatever was in the record at that - * position at that time). - */ -int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size) -{ - le16 *usa_pos, *data_pos; - u16 usa_ofs, usa_count, usn; - le16 le_usn; - - /* Sanity check + only fixup if it makes sense. */ - if (!b || ntfs_is_baad_record(b->magic) || - ntfs_is_hole_record(b->magic)) - return -EINVAL; - /* Setup the variables. */ - usa_ofs = le16_to_cpu(b->usa_ofs); - /* Decrement usa_count to get number of fixups. */ - usa_count = le16_to_cpu(b->usa_count) - 1; - /* Size and alignment checks. */ - if ( size & (NTFS_BLOCK_SIZE - 1) || - usa_ofs & 1 || - usa_ofs + (usa_count * 2) > size || - (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) - return -EINVAL; - /* Position of usn in update sequence array. */ - usa_pos = (le16*)((u8*)b + usa_ofs); - /* - * Cyclically increment the update sequence number - * (skipping 0 and -1, i.e. 0xffff). - */ - usn = le16_to_cpup(usa_pos) + 1; - if (usn == 0xffff || !usn) - usn = 1; - le_usn = cpu_to_le16(usn); - *usa_pos = le_usn; - /* Position in data of first u16 that needs fixing up. */ - data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; - /* Fixup all sectors. 
*/ - while (usa_count--) { - /* - * Increment the position in the usa and save the - * original data from the data buffer into the usa. - */ - *(++usa_pos) = *data_pos; - /* Apply fixup to data. */ - *data_pos = le_usn; - /* Increment position in data as well. */ - data_pos += NTFS_BLOCK_SIZE/sizeof(le16); - } - return 0; -} - -/** - * post_write_mst_fixup - fast deprotect multi sector transfer protected data - * @b: pointer to the data to deprotect - * - * Perform the necessary post write multi sector transfer fixup, not checking - * for any errors, because we assume we have just used pre_write_mst_fixup(), - * thus the data will be fine or we would never have gotten here. - */ -void post_write_mst_fixup(NTFS_RECORD *b) -{ - le16 *usa_pos, *data_pos; - - u16 usa_ofs = le16_to_cpu(b->usa_ofs); - u16 usa_count = le16_to_cpu(b->usa_count) - 1; - - /* Position of usn in update sequence array. */ - usa_pos = (le16*)b + usa_ofs/sizeof(le16); - - /* Position in protected data of first u16 that needs fixing up. */ - data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; - - /* Fixup all sectors. */ - while (usa_count--) { - /* - * Increment position in usa and restore original data from - * the usa into the data buffer. - */ - *data_pos = *(++usa_pos); - - /* Increment position in data as well. */ - data_pos += NTFS_BLOCK_SIZE/sizeof(le16); - } -} diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c deleted file mode 100644 index d7498ddc4a72..000000000000 --- a/fs/ntfs/namei.c +++ /dev/null @@ -1,392 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS - * project. 
- * - * Copyright (c) 2001-2006 Anton Altaparmakov - */ - -#include <linux/dcache.h> -#include <linux/exportfs.h> -#include <linux/security.h> -#include <linux/slab.h> - -#include "attrib.h" -#include "debug.h" -#include "dir.h" -#include "mft.h" -#include "ntfs.h" - -/** - * ntfs_lookup - find the inode represented by a dentry in a directory inode - * @dir_ino: directory inode in which to look for the inode - * @dent: dentry representing the inode to look for - * @flags: lookup flags - * - * In short, ntfs_lookup() looks for the inode represented by the dentry @dent - * in the directory inode @dir_ino and if found attaches the inode to the - * dentry @dent. - * - * In more detail, the dentry @dent specifies which inode to look for by - * supplying the name of the inode in @dent->d_name.name. ntfs_lookup() - * converts the name to Unicode and walks the contents of the directory inode - * @dir_ino looking for the converted Unicode name. If the name is found in the - * directory, the corresponding inode is loaded by calling ntfs_iget() on its - * inode number and the inode is associated with the dentry @dent via a call to - * d_splice_alias(). - * - * If the name is not found in the directory, a NULL inode is inserted into the - * dentry @dent via a call to d_add(). The dentry is then termed a negative - * dentry. - * - * Only if an actual error occurs, do we return an error via ERR_PTR(). - * - * In order to handle the case insensitivity issues of NTFS with regards to the - * dcache and the dcache requiring only one dentry per directory, we deal with - * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining - * a case sensitive dcache. 
This means that we get the full benefit of dcache - * speed when the file/directory is looked up with the same case as returned by - * ->ntfs_readdir() but that a lookup for any other case (or for the short file - * name) will not find anything in dcache and will enter ->ntfs_lookup() - * instead, where we search the directory for a fully matching file name - * (including case) and if that is not found, we search for a file name that - * matches with different case and if that has non-POSIX semantics we return - * that. We actually do only one search (case sensitive) and keep tabs on - * whether we have found a case insensitive match in the process. - * - * To simplify matters for us, we do not treat the short vs long filenames as - * two hard links but instead if the lookup matches a short filename, we - * return the dentry for the corresponding long filename instead. - * - * There are three cases we need to distinguish here: - * - * 1) @dent perfectly matches (i.e. including case) a directory entry with a - * file name in the WIN32 or POSIX namespaces. In this case - * ntfs_lookup_inode_by_name() will return with name set to NULL and we - * just d_splice_alias() @dent. - * 2) @dent matches (not including case) a directory entry with a file name in - * the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return - * with name set to point to a kmalloc()ed ntfs_name structure containing - * the properly cased little endian Unicode name. We convert the name to the - * current NLS code page, search if a dentry with this name already exists - * and if so return that instead of @dent. At this point things are - * complicated by the possibility of 'disconnected' dentries due to NFS - * which we deal with appropriately (see the code comments). The VFS will - * then destroy the old @dent and use the one we returned. If a dentry is - * not found, we allocate a new one, d_splice_alias() it, and return it as - * above. 
- * 3) @dent matches either perfectly or not (i.e. we don't care about case) a - * directory entry with a file name in the DOS namespace. In this case - * ntfs_lookup_inode_by_name() will return with name set to point to a - * kmalloc()ed ntfs_name structure containing the mft reference (cpu endian) - * of the inode. We use the mft reference to read the inode and to find the - * file name in the WIN32 namespace corresponding to the matched short file - * name. We then convert the name to the current NLS code page, and proceed - * searching for a dentry with this name, etc, as in case 2), above. - * - * Locking: Caller must hold i_mutex on the directory. - */ -static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, - unsigned int flags) -{ - ntfs_volume *vol = NTFS_SB(dir_ino->i_sb); - struct inode *dent_inode; - ntfschar *uname; - ntfs_name *name = NULL; - MFT_REF mref; - unsigned long dent_ino; - int uname_len; - - ntfs_debug("Looking up %pd in directory inode 0x%lx.", - dent, dir_ino->i_ino); - /* Convert the name of the dentry to Unicode. */ - uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len, - &uname); - if (uname_len < 0) { - if (uname_len != -ENAMETOOLONG) - ntfs_error(vol->sb, "Failed to convert name to " - "Unicode."); - return ERR_PTR(uname_len); - } - mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len, - &name); - kmem_cache_free(ntfs_name_cache, uname); - if (!IS_ERR_MREF(mref)) { - dent_ino = MREF(mref); - ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino); - dent_inode = ntfs_iget(vol->sb, dent_ino); - if (!IS_ERR(dent_inode)) { - /* Consistency check. */ - if (is_bad_inode(dent_inode) || MSEQNO(mref) == - NTFS_I(dent_inode)->seq_no || - dent_ino == FILE_MFT) { - /* Perfect WIN32/POSIX match. -- Case 1. */ - if (!name) { - ntfs_debug("Done. (Case 1.)"); - return d_splice_alias(dent_inode, dent); - } - /* - * We are too indented. 
Handle imperfect - * matches and short file names further below. - */ - goto handle_name; - } - ntfs_error(vol->sb, "Found stale reference to inode " - "0x%lx (reference sequence number = " - "0x%x, inode sequence number = 0x%x), " - "returning -EIO. Run chkdsk.", - dent_ino, MSEQNO(mref), - NTFS_I(dent_inode)->seq_no); - iput(dent_inode); - dent_inode = ERR_PTR(-EIO); - } else - ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with " - "error code %li.", dent_ino, - PTR_ERR(dent_inode)); - kfree(name); - /* Return the error code. */ - return ERR_CAST(dent_inode); - } - /* It is guaranteed that @name is no longer allocated at this point. */ - if (MREF_ERR(mref) == -ENOENT) { - ntfs_debug("Entry was not found, adding negative dentry."); - /* The dcache will handle negative entries. */ - d_add(dent, NULL); - ntfs_debug("Done."); - return NULL; - } - ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error " - "code %i.", -MREF_ERR(mref)); - return ERR_PTR(MREF_ERR(mref)); - // TODO: Consider moving this lot to a separate function! (AIA) -handle_name: - { - MFT_RECORD *m; - ntfs_attr_search_ctx *ctx; - ntfs_inode *ni = NTFS_I(dent_inode); - int err; - struct qstr nls_name; - - nls_name.name = NULL; - if (name->type != FILE_NAME_DOS) { /* Case 2. */ - ntfs_debug("Case 2."); - nls_name.len = (unsigned)ntfs_ucstonls(vol, - (ntfschar*)&name->name, name->len, - (unsigned char**)&nls_name.name, 0); - kfree(name); - } else /* if (name->type == FILE_NAME_DOS) */ { /* Case 3. */ - FILE_NAME_ATTR *fn; - - ntfs_debug("Case 3."); - kfree(name); - - /* Find the WIN32 name corresponding to the matched DOS name. 
*/ - ni = NTFS_I(dent_inode); - m = map_mft_record(ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - do { - ATTR_RECORD *a; - u32 val_len; - - err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, - NULL, 0, ctx); - if (unlikely(err)) { - ntfs_error(vol->sb, "Inode corrupt: No WIN32 " - "namespace counterpart to DOS " - "file name. Run chkdsk."); - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - /* Consistency checks. */ - a = ctx->attr; - if (a->non_resident || a->flags) - goto eio_err_out; - val_len = le32_to_cpu(a->data.resident.value_length); - if (le16_to_cpu(a->data.resident.value_offset) + - val_len > le32_to_cpu(a->length)) - goto eio_err_out; - fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu( - ctx->attr->data.resident.value_offset)); - if ((u32)(fn->file_name_length * sizeof(ntfschar) + - sizeof(FILE_NAME_ATTR)) > val_len) - goto eio_err_out; - } while (fn->file_name_type != FILE_NAME_WIN32); - - /* Convert the found WIN32 name to current NLS code page. */ - nls_name.len = (unsigned)ntfs_ucstonls(vol, - (ntfschar*)&fn->file_name, fn->file_name_length, - (unsigned char**)&nls_name.name, 0); - - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - } - m = NULL; - ctx = NULL; - - /* Check if a conversion error occurred. */ - if ((signed)nls_name.len < 0) { - err = (signed)nls_name.len; - goto err_out; - } - nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len); - - dent = d_add_ci(dent, dent_inode, &nls_name); - kfree(nls_name.name); - return dent; - -eio_err_out: - ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); - err = -EIO; -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(ni); - iput(dent_inode); - ntfs_error(vol->sb, "Failed, returning error code %i.", err); - return ERR_PTR(err); - } -} - -/* - * Inode operations for directories. 
- */ -const struct inode_operations ntfs_dir_inode_ops = { - .lookup = ntfs_lookup, /* VFS: Lookup directory. */ -}; - -/** - * ntfs_get_parent - find the dentry of the parent of a given directory dentry - * @child_dent: dentry of the directory whose parent directory to find - * - * Find the dentry for the parent directory of the directory specified by the - * dentry @child_dent. This function is called from - * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the - * default ->decode_fh() which is export_decode_fh() in the same file. - * - * The code is based on the ext3 ->get_parent() implementation found in - * fs/ext3/namei.c::ext3_get_parent(). - * - * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down. - * - * Return the dentry of the parent directory on success or the error code on - * error (IS_ERR() is true). - */ -static struct dentry *ntfs_get_parent(struct dentry *child_dent) -{ - struct inode *vi = d_inode(child_dent); - ntfs_inode *ni = NTFS_I(vi); - MFT_RECORD *mrec; - ntfs_attr_search_ctx *ctx; - ATTR_RECORD *attr; - FILE_NAME_ATTR *fn; - unsigned long parent_ino; - int err; - - ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - /* Get the mft record of the inode belonging to the child dentry. */ - mrec = map_mft_record(ni); - if (IS_ERR(mrec)) - return ERR_CAST(mrec); - /* Find the first file name attribute in the mft record. */ - ctx = ntfs_attr_get_search_ctx(ni, mrec); - if (unlikely(!ctx)) { - unmap_mft_record(ni); - return ERR_PTR(-ENOMEM); - } -try_next: - err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - if (err == -ENOENT) - ntfs_error(vi->i_sb, "Inode 0x%lx does not have a " - "file name attribute. 
Run chkdsk.", - vi->i_ino); - return ERR_PTR(err); - } - attr = ctx->attr; - if (unlikely(attr->non_resident)) - goto try_next; - fn = (FILE_NAME_ATTR *)((u8 *)attr + - le16_to_cpu(attr->data.resident.value_offset)); - if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) > - (u8*)attr + le32_to_cpu(attr->length))) - goto try_next; - /* Get the inode number of the parent directory. */ - parent_ino = MREF_LE(fn->parent_directory); - /* Release the search context and the mft record of the child. */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); - - return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino)); -} - -static struct inode *ntfs_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) -{ - struct inode *inode; - - inode = ntfs_iget(sb, ino); - if (!IS_ERR(inode)) { - if (is_bad_inode(inode) || inode->i_generation != generation) { - iput(inode); - inode = ERR_PTR(-ESTALE); - } - } - - return inode; -} - -static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - ntfs_nfs_get_inode); -} - -static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - ntfs_nfs_get_inode); -} - -/* - * Export operations allowing NFS exporting of mounted NTFS partitions. - * - * We use the default ->encode_fh() for now. Note that they - * use 32 bits to store the inode number which is an unsigned long so on 64-bit - * architectures is usually 64 bits so it would all fail horribly on huge - * volumes. I guess we need to define our own encode and decode fh functions - * that store 64-bit inode numbers at some point but for now we will ignore the - * problem... - * - * We also use the default ->get_name() helper (used by ->decode_fh() via - * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs - * independent. 
- * - * The default ->get_parent() just returns -EACCES so we have to provide our - * own and the default ->get_dentry() is incompatible with NTFS due to not - * allowing the inode number 0 which is used in NTFS for the system file $MFT - * and due to using iget() whereas NTFS needs ntfs_iget(). - */ -const struct export_operations ntfs_export_ops = { - .encode_fh = generic_encode_ino32_fh, - .get_parent = ntfs_get_parent, /* Find the parent of a given - directory. */ - .fh_to_dentry = ntfs_fh_to_dentry, - .fh_to_parent = ntfs_fh_to_parent, -}; diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h deleted file mode 100644 index e81376ea9152..000000000000 --- a/fs/ntfs/ntfs.h +++ /dev/null @@ -1,150 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ntfs.h - Defines for NTFS Linux kernel driver. - * - * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. - * Copyright (C) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_H -#define _LINUX_NTFS_H - -#include <linux/stddef.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/compiler.h> -#include <linux/fs.h> -#include <linux/nls.h> -#include <linux/smp.h> -#include <linux/pagemap.h> - -#include "types.h" -#include "volume.h" -#include "layout.h" - -typedef enum { - NTFS_BLOCK_SIZE = 512, - NTFS_BLOCK_SIZE_BITS = 9, - NTFS_SB_MAGIC = 0x5346544e, /* 'NTFS' */ - NTFS_MAX_NAME_LEN = 255, - NTFS_MAX_ATTR_NAME_LEN = 255, - NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */ - NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE, -} NTFS_CONSTANTS; - -/* Global variables. */ - -/* Slab caches (from super.c). */ -extern struct kmem_cache *ntfs_name_cache; -extern struct kmem_cache *ntfs_inode_cache; -extern struct kmem_cache *ntfs_big_inode_cache; -extern struct kmem_cache *ntfs_attr_ctx_cache; -extern struct kmem_cache *ntfs_index_ctx_cache; - -/* The various operations structs defined throughout the driver files. 
*/ -extern const struct address_space_operations ntfs_normal_aops; -extern const struct address_space_operations ntfs_compressed_aops; -extern const struct address_space_operations ntfs_mst_aops; - -extern const struct file_operations ntfs_file_ops; -extern const struct inode_operations ntfs_file_inode_ops; - -extern const struct file_operations ntfs_dir_ops; -extern const struct inode_operations ntfs_dir_inode_ops; - -extern const struct file_operations ntfs_empty_file_ops; -extern const struct inode_operations ntfs_empty_inode_ops; - -extern const struct export_operations ntfs_export_ops; - -/** - * NTFS_SB - return the ntfs volume given a vfs super block - * @sb: VFS super block - * - * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb. - */ -static inline ntfs_volume *NTFS_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} - -/* Declarations of functions and global variables. */ - -/* From fs/ntfs/compress.c */ -extern int ntfs_read_compressed_block(struct page *page); -extern int allocate_compression_buffers(void); -extern void free_compression_buffers(void); - -/* From fs/ntfs/super.c */ -#define default_upcase_len 0x10000 -extern struct mutex ntfs_lock; - -typedef struct { - int val; - char *str; -} option_t; -extern const option_t on_errors_arr[]; - -/* From fs/ntfs/mst.c */ -extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size); -extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size); -extern void post_write_mst_fixup(NTFS_RECORD *b); - -/* From fs/ntfs/unistr.c */ -extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, - const ntfschar *s2, size_t s2_len, - const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_size); -extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, - const ntfschar *name2, const u32 name2_len, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len); -extern int ntfs_ucsncmp(const ntfschar *s1, const 
ntfschar *s2, size_t n); -extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, - const ntfschar *upcase, const u32 upcase_size); -extern void ntfs_upcase_name(ntfschar *name, u32 name_len, - const ntfschar *upcase, const u32 upcase_len); -extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, - const ntfschar *upcase, const u32 upcase_len); -extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, - FILE_NAME_ATTR *file_name_attr2, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len); -extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, - const int ins_len, ntfschar **outs); -extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, - const int ins_len, unsigned char **outs, int outs_len); - -/* From fs/ntfs/upcase.c */ -extern ntfschar *generate_default_upcase(void); - -static inline int ntfs_ffs(int x) -{ - int r = 1; - - if (!x) - return 0; - if (!(x & 0xffff)) { - x >>= 16; - r += 16; - } - if (!(x & 0xff)) { - x >>= 8; - r += 8; - } - if (!(x & 0xf)) { - x >>= 4; - r += 4; - } - if (!(x & 3)) { - x >>= 2; - r += 2; - } - if (!(x & 1)) { - x >>= 1; - r += 1; - } - return r; -} - -#endif /* _LINUX_NTFS_H */ diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c deleted file mode 100644 index 9160480222fd..000000000000 --- a/fs/ntfs/quota.c +++ /dev/null @@ -1,103 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * quota.c - NTFS kernel quota ($Quota) handling. Part of the Linux-NTFS - * project. - * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include "index.h" -#include "quota.h" -#include "debug.h" -#include "ntfs.h" - -/** - * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume - * @vol: ntfs volume on which to mark the quotas out of date - * - * Mark the quotas out of date on the ntfs volume @vol and return 'true' on - * success and 'false' on error. 
- */ -bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) -{ - ntfs_index_context *ictx; - QUOTA_CONTROL_ENTRY *qce; - const le32 qid = QUOTA_DEFAULTS_ID; - int err; - - ntfs_debug("Entering."); - if (NVolQuotaOutOfDate(vol)) - goto done; - if (!vol->quota_ino || !vol->quota_q_ino) { - ntfs_error(vol->sb, "Quota inodes are not open."); - return false; - } - inode_lock(vol->quota_q_ino); - ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); - if (!ictx) { - ntfs_error(vol->sb, "Failed to get index context."); - goto err_out; - } - err = ntfs_index_lookup(&qid, sizeof(qid), ictx); - if (err) { - if (err == -ENOENT) - ntfs_error(vol->sb, "Quota defaults entry is not " - "present."); - else - ntfs_error(vol->sb, "Lookup of quota defaults entry " - "failed."); - goto err_out; - } - if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) { - ntfs_error(vol->sb, "Quota defaults entry size is invalid. " - "Run chkdsk."); - goto err_out; - } - qce = (QUOTA_CONTROL_ENTRY*)ictx->data; - if (le32_to_cpu(qce->version) != QUOTA_VERSION) { - ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not " - "supported.", le32_to_cpu(qce->version)); - goto err_out; - } - ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags)); - /* If quotas are already marked out of date, no need to do anything. */ - if (qce->flags & QUOTA_FLAG_OUT_OF_DATE) - goto set_done; - /* - * If quota tracking is neither requested, nor enabled and there are no - * pending deletes, no need to mark the quotas out of date. - */ - if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED | - QUOTA_FLAG_TRACKING_REQUESTED | - QUOTA_FLAG_PENDING_DELETES))) - goto set_done; - /* - * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date. - * This is verified on WinXP to be sufficient to cause windows to - * rescan the volume on boot and update all quota entries. - */ - qce->flags |= QUOTA_FLAG_OUT_OF_DATE; - /* Ensure the modified flags are written to disk. 
*/ - ntfs_index_entry_flush_dcache_page(ictx); - ntfs_index_entry_mark_dirty(ictx); -set_done: - ntfs_index_ctx_put(ictx); - inode_unlock(vol->quota_q_ino); - /* - * We set the flag so we do not try to mark the quotas out of date - * again on remount. - */ - NVolSetQuotaOutOfDate(vol); -done: - ntfs_debug("Done."); - return true; -err_out: - if (ictx) - ntfs_index_ctx_put(ictx); - inode_unlock(vol->quota_q_ino); - return false; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h deleted file mode 100644 index fe3132a3d6d2..000000000000 --- a/fs/ntfs/quota.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * quota.h - Defines for NTFS kernel quota ($Quota) handling. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_QUOTA_H -#define _LINUX_NTFS_QUOTA_H - -#ifdef NTFS_RW - -#include "types.h" -#include "volume.h" - -extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_QUOTA_H */ diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c deleted file mode 100644 index 0d448e9881f7..000000000000 --- a/fs/ntfs/runlist.c +++ /dev/null @@ -1,1893 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2007 Anton Altaparmakov - * Copyright (c) 2002-2005 Richard Russon - */ - -#include "debug.h" -#include "dir.h" -#include "endian.h" -#include "malloc.h" -#include "ntfs.h" - -/** - * ntfs_rl_mm - runlist memmove - * - * It is up to the caller to serialize access to the runlist @base. 
- */ -static inline void ntfs_rl_mm(runlist_element *base, int dst, int src, - int size) -{ - if (likely((dst != src) && (size > 0))) - memmove(base + dst, base + src, size * sizeof(*base)); -} - -/** - * ntfs_rl_mc - runlist memory copy - * - * It is up to the caller to serialize access to the runlists @dstbase and - * @srcbase. - */ -static inline void ntfs_rl_mc(runlist_element *dstbase, int dst, - runlist_element *srcbase, int src, int size) -{ - if (likely(size > 0)) - memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase)); -} - -/** - * ntfs_rl_realloc - Reallocate memory for runlists - * @rl: original runlist - * @old_size: number of runlist elements in the original runlist @rl - * @new_size: number of runlist elements we need space for - * - * As the runlists grow, more memory will be required. To prevent the - * kernel having to allocate and reallocate large numbers of small bits of - * memory, this function returns an entire page of memory. - * - * It is up to the caller to serialize access to the runlist @rl. - * - * N.B. If the new allocation doesn't require a different number of pages in - * memory, the function will return the original pointer. - * - * On success, return a pointer to the newly allocated, or recycled, memory. - * On error, return -errno. The following error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. 
- */ -static inline runlist_element *ntfs_rl_realloc(runlist_element *rl, - int old_size, int new_size) -{ - runlist_element *new_rl; - - old_size = PAGE_ALIGN(old_size * sizeof(*rl)); - new_size = PAGE_ALIGN(new_size * sizeof(*rl)); - if (old_size == new_size) - return rl; - - new_rl = ntfs_malloc_nofs(new_size); - if (unlikely(!new_rl)) - return ERR_PTR(-ENOMEM); - - if (likely(rl != NULL)) { - if (unlikely(old_size > new_size)) - old_size = new_size; - memcpy(new_rl, rl, old_size); - ntfs_free(rl); - } - return new_rl; -} - -/** - * ntfs_rl_realloc_nofail - Reallocate memory for runlists - * @rl: original runlist - * @old_size: number of runlist elements in the original runlist @rl - * @new_size: number of runlist elements we need space for - * - * As the runlists grow, more memory will be required. To prevent the - * kernel having to allocate and reallocate large numbers of small bits of - * memory, this function returns an entire page of memory. - * - * This function guarantees that the allocation will succeed. It will sleep - * for as long as it takes to complete the allocation. - * - * It is up to the caller to serialize access to the runlist @rl. - * - * N.B. If the new allocation doesn't require a different number of pages in - * memory, the function will return the original pointer. - * - * On success, return a pointer to the newly allocated, or recycled, memory. - * On error, return -errno. The following error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. 
- */ -static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl, - int old_size, int new_size) -{ - runlist_element *new_rl; - - old_size = PAGE_ALIGN(old_size * sizeof(*rl)); - new_size = PAGE_ALIGN(new_size * sizeof(*rl)); - if (old_size == new_size) - return rl; - - new_rl = ntfs_malloc_nofs_nofail(new_size); - BUG_ON(!new_rl); - - if (likely(rl != NULL)) { - if (unlikely(old_size > new_size)) - old_size = new_size; - memcpy(new_rl, rl, old_size); - ntfs_free(rl); - } - return new_rl; -} - -/** - * ntfs_are_rl_mergeable - test if two runlists can be joined together - * @dst: original runlist - * @src: new runlist to test for mergeability with @dst - * - * Test if two runlists can be joined together. For this, their VCNs and LCNs - * must be adjacent. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - * - * Return: true Success, the runlists can be merged. - * false Failure, the runlists cannot be merged. - */ -static inline bool ntfs_are_rl_mergeable(runlist_element *dst, - runlist_element *src) -{ - BUG_ON(!dst); - BUG_ON(!src); - - /* We can merge unmapped regions even if they are misaligned. */ - if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED)) - return true; - /* If the runs are misaligned, we cannot merge them. */ - if ((dst->vcn + dst->length) != src->vcn) - return false; - /* If both runs are non-sparse and contiguous, we can merge them. */ - if ((dst->lcn >= 0) && (src->lcn >= 0) && - ((dst->lcn + dst->length) == src->lcn)) - return true; - /* If we are merging two holes, we can merge them. */ - if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE)) - return true; - /* Cannot merge. */ - return false; -} - -/** - * __ntfs_rl_merge - merge two runlists without testing if they can be merged - * @dst: original, destination runlist - * @src: new runlist to merge with @dst - * - * Merge the two runlists, writing into the destination runlist @dst. 
The - * caller must make sure the runlists can be merged or this will corrupt the - * destination runlist. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - */ -static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src) -{ - dst->length += src->length; -} - -/** - * ntfs_rl_append - append a runlist after a given element - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: runlist to be inserted into @dst - * @ssize: number of elements in @src (excluding end marker) - * @loc: append the new runlist @src after this element in @dst - * - * Append the runlist @src after element @loc in @dst. Merge the right end of - * the new runlist, if necessary. Adjust the size of the hole before the - * appended runlist. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - * - * On success, return a pointer to the new, combined, runlist. Note, both - * runlists @dst and @src are deallocated before returning so you cannot use - * the pointers for anything any more. (Strictly speaking the returned runlist - * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. - */ -static inline runlist_element *ntfs_rl_append(runlist_element *dst, - int dsize, runlist_element *src, int ssize, int loc) -{ - bool right = false; /* Right end of @src needs merging. */ - int marker; /* End of the inserted runs. */ - - BUG_ON(!dst); - BUG_ON(!src); - - /* First, check if the right hand end needs merging. */ - if ((loc + 1) < dsize) - right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); - - /* Space required: @dst size + @src size, less one if we merged. 
*/ - dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right); - if (IS_ERR(dst)) - return dst; - /* - * We are guaranteed to succeed from here so can start modifying the - * original runlists. - */ - - /* First, merge the right hand end, if necessary. */ - if (right) - __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); - - /* First run after the @src runs that have been inserted. */ - marker = loc + ssize + 1; - - /* Move the tail of @dst out of the way, then copy in @src. */ - ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right)); - ntfs_rl_mc(dst, loc + 1, src, 0, ssize); - - /* Adjust the size of the preceding hole. */ - dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; - - /* We may have changed the length of the file, so fix the end marker */ - if (dst[marker].lcn == LCN_ENOENT) - dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; - - return dst; -} - -/** - * ntfs_rl_insert - insert a runlist into another - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: new runlist to be inserted - * @ssize: number of elements in @src (excluding end marker) - * @loc: insert the new runlist @src before this element in @dst - * - * Insert the runlist @src before element @loc in the runlist @dst. Merge the - * left end of the new runlist, if necessary. Adjust the size of the hole - * after the inserted runlist. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - * - * On success, return a pointer to the new, combined, runlist. Note, both - * runlists @dst and @src are deallocated before returning so you cannot use - * the pointers for anything any more. (Strictly speaking the returned runlist - * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. 
- * -EINVAL - Invalid parameters were passed in. - */ -static inline runlist_element *ntfs_rl_insert(runlist_element *dst, - int dsize, runlist_element *src, int ssize, int loc) -{ - bool left = false; /* Left end of @src needs merging. */ - bool disc = false; /* Discontinuity between @dst and @src. */ - int marker; /* End of the inserted runs. */ - - BUG_ON(!dst); - BUG_ON(!src); - - /* - * disc => Discontinuity between the end of @dst and the start of @src. - * This means we might need to insert a "not mapped" run. - */ - if (loc == 0) - disc = (src[0].vcn > 0); - else { - s64 merged_length; - - left = ntfs_are_rl_mergeable(dst + loc - 1, src); - - merged_length = dst[loc - 1].length; - if (left) - merged_length += src->length; - - disc = (src[0].vcn > dst[loc - 1].vcn + merged_length); - } - /* - * Space required: @dst size + @src size, less one if we merged, plus - * one if there was a discontinuity. - */ - dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc); - if (IS_ERR(dst)) - return dst; - /* - * We are guaranteed to succeed from here so can start modifying the - * original runlist. - */ - if (left) - __ntfs_rl_merge(dst + loc - 1, src); - /* - * First run after the @src runs that have been inserted. - * Nominally, @marker equals @loc + @ssize, i.e. location + number of - * runs in @src. However, if @left, then the first run in @src has - * been merged with one in @dst. And if @disc, then @dst and @src do - * not meet and we need an extra run to fill the gap. - */ - marker = loc + ssize - left + disc; - - /* Move the tail of @dst out of the way, then copy in @src. */ - ntfs_rl_mm(dst, marker, loc, dsize - loc); - ntfs_rl_mc(dst, loc + disc, src, left, ssize - left); - - /* Adjust the VCN of the first run after the insertion... */ - dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; - /* ... and the length. 
*/ - if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED) - dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn; - - /* Writing beyond the end of the file and there is a discontinuity. */ - if (disc) { - if (loc > 0) { - dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length; - dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; - } else { - dst[loc].vcn = 0; - dst[loc].length = dst[loc + 1].vcn; - } - dst[loc].lcn = LCN_RL_NOT_MAPPED; - } - return dst; -} - -/** - * ntfs_rl_replace - overwrite a runlist element with another runlist - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: new runlist to be inserted - * @ssize: number of elements in @src (excluding end marker) - * @loc: index in runlist @dst to overwrite with @src - * - * Replace the runlist element @dst at @loc with @src. Merge the left and - * right ends of the inserted runlist, if necessary. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - * - * On success, return a pointer to the new, combined, runlist. Note, both - * runlists @dst and @src are deallocated before returning so you cannot use - * the pointers for anything any more. (Strictly speaking the returned runlist - * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. - */ -static inline runlist_element *ntfs_rl_replace(runlist_element *dst, - int dsize, runlist_element *src, int ssize, int loc) -{ - signed delta; - bool left = false; /* Left end of @src needs merging. */ - bool right = false; /* Right end of @src needs merging. */ - int tail; /* Start of tail of @dst. */ - int marker; /* End of the inserted runs. */ - - BUG_ON(!dst); - BUG_ON(!src); - - /* First, see if the left and right ends need merging. 
*/ - if ((loc + 1) < dsize) - right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); - if (loc > 0) - left = ntfs_are_rl_mergeable(dst + loc - 1, src); - /* - * Allocate some space. We will need less if the left, right, or both - * ends get merged. The -1 accounts for the run being replaced. - */ - delta = ssize - 1 - left - right; - if (delta > 0) { - dst = ntfs_rl_realloc(dst, dsize, dsize + delta); - if (IS_ERR(dst)) - return dst; - } - /* - * We are guaranteed to succeed from here so can start modifying the - * original runlists. - */ - - /* First, merge the left and right ends, if necessary. */ - if (right) - __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); - if (left) - __ntfs_rl_merge(dst + loc - 1, src); - /* - * Offset of the tail of @dst. This needs to be moved out of the way - * to make space for the runs to be copied from @src, i.e. the first - * run of the tail of @dst. - * Nominally, @tail equals @loc + 1, i.e. location, skipping the - * replaced run. However, if @right, then one of @dst's runs is - * already merged into @src. - */ - tail = loc + right + 1; - /* - * First run after the @src runs that have been inserted, i.e. where - * the tail of @dst needs to be moved to. - * Nominally, @marker equals @loc + @ssize, i.e. location + number of - * runs in @src. However, if @left, then the first run in @src has - * been merged with one in @dst. - */ - marker = loc + ssize - left; - - /* Move the tail of @dst out of the way, then copy in @src. */ - ntfs_rl_mm(dst, marker, tail, dsize - tail); - ntfs_rl_mc(dst, loc, src, left, ssize - left); - - /* We may have changed the length of the file, so fix the end marker. 
*/ - if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT) - dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; - return dst; -} - -/** - * ntfs_rl_split - insert a runlist into the centre of a hole - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: new runlist to be inserted - * @ssize: number of elements in @src (excluding end marker) - * @loc: index in runlist @dst at which to split and insert @src - * - * Split the runlist @dst at @loc into two and insert @new in between the two - * fragments. No merging of runlists is necessary. Adjust the size of the - * holes either side. - * - * It is up to the caller to serialize access to the runlists @dst and @src. - * - * On success, return a pointer to the new, combined, runlist. Note, both - * runlists @dst and @src are deallocated before returning so you cannot use - * the pointers for anything any more. (Strictly speaking the returned runlist - * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. - */ -static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize, - runlist_element *src, int ssize, int loc) -{ - BUG_ON(!dst); - BUG_ON(!src); - - /* Space required: @dst size + @src size + one new hole. */ - dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1); - if (IS_ERR(dst)) - return dst; - /* - * We are guaranteed to succeed from here so can start modifying the - * original runlists. - */ - - /* Move the tail of @dst out of the way, then copy in @src. */ - ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc); - ntfs_rl_mc(dst, loc + 1, src, 0, ssize); - - /* Adjust the size of the holes either size of @src. 
*/ - dst[loc].length = dst[loc+1].vcn - dst[loc].vcn; - dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length; - dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn; - - return dst; -} - -/** - * ntfs_runlists_merge - merge two runlists into one - * @drl: original runlist to be worked on - * @srl: new runlist to be merged into @drl - * - * First we sanity check the two runlists @srl and @drl to make sure that they - * are sensible and can be merged. The runlist @srl must be either after the - * runlist @drl or completely within a hole (or unmapped region) in @drl. - * - * It is up to the caller to serialize access to the runlists @drl and @srl. - * - * Merging of runlists is necessary in two cases: - * 1. When attribute lists are used and a further extent is being mapped. - * 2. When new clusters are allocated to fill a hole or extend a file. - * - * There are four possible ways @srl can be merged. It can: - * - be inserted at the beginning of a hole, - * - split the hole in two and be inserted between the two fragments, - * - be appended at the end of a hole, or it can - * - replace the whole hole. - * It can also be appended to the end of the runlist, which is just a variant - * of the insert case. - * - * On success, return a pointer to the new, combined, runlist. Note, both - * runlists @drl and @srl are deallocated before returning so you cannot use - * the pointers for anything any more. (Strictly speaking the returned runlist - * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. - * -ERANGE - The runlists overlap and cannot be merged. - */ -runlist_element *ntfs_runlists_merge(runlist_element *drl, - runlist_element *srl) -{ - int di, si; /* Current index into @[ds]rl. 
*/ - int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */ - int dins; /* Index into @drl at which to insert @srl. */ - int dend, send; /* Last index into @[ds]rl. */ - int dfinal, sfinal; /* The last index into @[ds]rl with - lcn >= LCN_HOLE. */ - int marker = 0; - VCN marker_vcn = 0; - -#ifdef DEBUG - ntfs_debug("dst:"); - ntfs_debug_dump_runlist(drl); - ntfs_debug("src:"); - ntfs_debug_dump_runlist(srl); -#endif - - /* Check for silly calling... */ - if (unlikely(!srl)) - return drl; - if (IS_ERR(srl) || IS_ERR(drl)) - return ERR_PTR(-EINVAL); - - /* Check for the case where the first mapping is being done now. */ - if (unlikely(!drl)) { - drl = srl; - /* Complete the source runlist if necessary. */ - if (unlikely(drl[0].vcn)) { - /* Scan to the end of the source runlist. */ - for (dend = 0; likely(drl[dend].length); dend++) - ; - dend++; - drl = ntfs_rl_realloc(drl, dend, dend + 1); - if (IS_ERR(drl)) - return drl; - /* Insert start element at the front of the runlist. */ - ntfs_rl_mm(drl, 1, 0, dend); - drl[0].vcn = 0; - drl[0].lcn = LCN_RL_NOT_MAPPED; - drl[0].length = drl[1].vcn; - } - goto finished; - } - - si = di = 0; - - /* Skip any unmapped start element(s) in the source runlist. */ - while (srl[si].length && srl[si].lcn < LCN_HOLE) - si++; - - /* Can't have an entirely unmapped source runlist. */ - BUG_ON(!srl[si].length); - - /* Record the starting points. */ - sstart = si; - - /* - * Skip forward in @drl until we reach the position where @srl needs to - * be inserted. If we reach the end of @drl, @srl just needs to be - * appended to @drl. - */ - for (; drl[di].length; di++) { - if (drl[di].vcn + drl[di].length > srl[sstart].vcn) - break; - } - dins = di; - - /* Sanity check for illegal overlaps. */ - if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) && - (srl[si].lcn >= 0)) { - ntfs_error(NULL, "Run lists overlap. Cannot merge!"); - return ERR_PTR(-ERANGE); - } - - /* Scan to the end of both runlists in order to know their sizes. 
*/ - for (send = si; srl[send].length; send++) - ; - for (dend = di; drl[dend].length; dend++) - ; - - if (srl[send].lcn == LCN_ENOENT) - marker_vcn = srl[marker = send].vcn; - - /* Scan to the last element with lcn >= LCN_HOLE. */ - for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--) - ; - for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--) - ; - - { - bool start; - bool finish; - int ds = dend + 1; /* Number of elements in drl & srl */ - int ss = sfinal - sstart + 1; - - start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */ - (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */ - finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */ - ((drl[dins].vcn + drl[dins].length) <= /* End of hole */ - (srl[send - 1].vcn + srl[send - 1].length))); - - /* Or we will lose an end marker. */ - if (finish && !drl[dins].length) - ss++; - if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn)) - finish = false; -#if 0 - ntfs_debug("dfinal = %i, dend = %i", dfinal, dend); - ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send); - ntfs_debug("start = %i, finish = %i", start, finish); - ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins); -#endif - if (start) { - if (finish) - drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins); - else - drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins); - } else { - if (finish) - drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins); - else - drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins); - } - if (IS_ERR(drl)) { - ntfs_error(NULL, "Merge failed."); - return drl; - } - ntfs_free(srl); - if (marker) { - ntfs_debug("Triggering marker code."); - for (ds = dend; drl[ds].length; ds++) - ; - /* We only need to care if @srl ended after @drl. 
*/ - if (drl[ds].vcn <= marker_vcn) { - int slots = 0; - - if (drl[ds].vcn == marker_vcn) { - ntfs_debug("Old marker = 0x%llx, replacing " - "with LCN_ENOENT.", - (unsigned long long) - drl[ds].lcn); - drl[ds].lcn = LCN_ENOENT; - goto finished; - } - /* - * We need to create an unmapped runlist element in - * @drl or extend an existing one before adding the - * ENOENT terminator. - */ - if (drl[ds].lcn == LCN_ENOENT) { - ds--; - slots = 1; - } - if (drl[ds].lcn != LCN_RL_NOT_MAPPED) { - /* Add an unmapped runlist element. */ - if (!slots) { - drl = ntfs_rl_realloc_nofail(drl, ds, - ds + 2); - slots = 2; - } - ds++; - /* Need to set vcn if it isn't set already. */ - if (slots != 1) - drl[ds].vcn = drl[ds - 1].vcn + - drl[ds - 1].length; - drl[ds].lcn = LCN_RL_NOT_MAPPED; - /* We now used up a slot. */ - slots--; - } - drl[ds].length = marker_vcn - drl[ds].vcn; - /* Finally add the ENOENT terminator. */ - ds++; - if (!slots) - drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1); - drl[ds].vcn = marker_vcn; - drl[ds].lcn = LCN_ENOENT; - drl[ds].length = (s64)0; - } - } - } - -finished: - /* The merge was completed successfully. */ - ntfs_debug("Merged runlist:"); - ntfs_debug_dump_runlist(drl); - return drl; -} - -/** - * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist - * @vol: ntfs volume on which the attribute resides - * @attr: attribute record whose mapping pairs array to decompress - * @old_rl: optional runlist in which to insert @attr's runlist - * - * It is up to the caller to serialize access to the runlist @old_rl. - * - * Decompress the attribute @attr's mapping pairs array into a runlist. On - * success, return the decompressed runlist. - * - * If @old_rl is not NULL, decompressed runlist is inserted into the - * appropriate place in @old_rl and the resultant, combined runlist is - * returned. The original @old_rl is deallocated. - * - * On error, return -errno. @old_rl is left unmodified in that case. 
- * - * The following error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EIO - Corrupt runlist. - * -EINVAL - Invalid parameters were passed in. - * -ERANGE - The two runlists overlap. - * - * FIXME: For now we take the conceptionally simplest approach of creating the - * new runlist disregarding the already existing one and then splicing the - * two into one, if that is possible (we check for overlap and discard the new - * runlist if overlap present before returning ERR_PTR(-ERANGE)). - */ -runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, - const ATTR_RECORD *attr, runlist_element *old_rl) -{ - VCN vcn; /* Current vcn. */ - LCN lcn; /* Current lcn. */ - s64 deltaxcn; /* Change in [vl]cn. */ - runlist_element *rl; /* The output runlist. */ - u8 *buf; /* Current position in mapping pairs array. */ - u8 *attr_end; /* End of attribute. */ - int rlsize; /* Size of runlist buffer. */ - u16 rlpos; /* Current runlist position in units of - runlist_elements. */ - u8 b; /* Current byte offset in buf. */ - -#ifdef DEBUG - /* Make sure attr exists and is non-resident. */ - if (!attr || !attr->non_resident || sle64_to_cpu( - attr->data.non_resident.lowest_vcn) < (VCN)0) { - ntfs_error(vol->sb, "Invalid arguments."); - return ERR_PTR(-EINVAL); - } -#endif - /* Start at vcn = lowest_vcn and lcn 0. */ - vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn); - lcn = 0; - /* Get start of the mapping pairs array. */ - buf = (u8*)attr + le16_to_cpu( - attr->data.non_resident.mapping_pairs_offset); - attr_end = (u8*)attr + le32_to_cpu(attr->length); - if (unlikely(buf < (u8*)attr || buf > attr_end)) { - ntfs_error(vol->sb, "Corrupt attribute."); - return ERR_PTR(-EIO); - } - /* If the mapping pairs array is valid but empty, nothing to do. */ - if (!vcn && !*buf) - return old_rl; - /* Current position in runlist array. */ - rlpos = 0; - /* Allocate first page and set current runlist size to one page. 
*/ - rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE); - if (unlikely(!rl)) - return ERR_PTR(-ENOMEM); - /* Insert unmapped starting element if necessary. */ - if (vcn) { - rl->vcn = 0; - rl->lcn = LCN_RL_NOT_MAPPED; - rl->length = vcn; - rlpos++; - } - while (buf < attr_end && *buf) { - /* - * Allocate more memory if needed, including space for the - * not-mapped and terminator elements. ntfs_malloc_nofs() - * operates on whole pages only. - */ - if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) { - runlist_element *rl2; - - rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); - if (unlikely(!rl2)) { - ntfs_free(rl); - return ERR_PTR(-ENOMEM); - } - memcpy(rl2, rl, rlsize); - ntfs_free(rl); - rl = rl2; - rlsize += PAGE_SIZE; - } - /* Enter the current vcn into the current runlist element. */ - rl[rlpos].vcn = vcn; - /* - * Get the change in vcn, i.e. the run length in clusters. - * Doing it this way ensures that we signextend negative values. - * A negative run length doesn't make any sense, but hey, I - * didn't make up the NTFS specs and Windows NT4 treats the run - * length as a signed value so that's how it is... - */ - b = *buf & 0xf; - if (b) { - if (unlikely(buf + b > attr_end)) - goto io_error; - for (deltaxcn = (s8)buf[b--]; b; b--) - deltaxcn = (deltaxcn << 8) + buf[b]; - } else { /* The length entry is compulsory. */ - ntfs_error(vol->sb, "Missing length entry in mapping " - "pairs array."); - deltaxcn = (s64)-1; - } - /* - * Assume a negative length to indicate data corruption and - * hence clean-up and return NULL. - */ - if (unlikely(deltaxcn < 0)) { - ntfs_error(vol->sb, "Invalid length in mapping pairs " - "array."); - goto err_out; - } - /* - * Enter the current run length into the current runlist - * element. - */ - rl[rlpos].length = deltaxcn; - /* Increment the current vcn by the current run length. 
*/ - vcn += deltaxcn; - /* - * There might be no lcn change at all, as is the case for - * sparse clusters on NTFS 3.0+, in which case we set the lcn - * to LCN_HOLE. - */ - if (!(*buf & 0xf0)) - rl[rlpos].lcn = LCN_HOLE; - else { - /* Get the lcn change which really can be negative. */ - u8 b2 = *buf & 0xf; - b = b2 + ((*buf >> 4) & 0xf); - if (buf + b > attr_end) - goto io_error; - for (deltaxcn = (s8)buf[b--]; b > b2; b--) - deltaxcn = (deltaxcn << 8) + buf[b]; - /* Change the current lcn to its new value. */ - lcn += deltaxcn; -#ifdef DEBUG - /* - * On NTFS 1.2-, apparently can have lcn == -1 to - * indicate a hole. But we haven't verified ourselves - * whether it is really the lcn or the deltaxcn that is - * -1. So if either is found give us a message so we - * can investigate it further! - */ - if (vol->major_ver < 3) { - if (unlikely(deltaxcn == (LCN)-1)) - ntfs_error(vol->sb, "lcn delta == -1"); - if (unlikely(lcn == (LCN)-1)) - ntfs_error(vol->sb, "lcn == -1"); - } -#endif - /* Check lcn is not below -1. */ - if (unlikely(lcn < (LCN)-1)) { - ntfs_error(vol->sb, "Invalid LCN < -1 in " - "mapping pairs array."); - goto err_out; - } - /* Enter the current lcn into the runlist element. */ - rl[rlpos].lcn = lcn; - } - /* Get to the next runlist element. */ - rlpos++; - /* Increment the buffer position to the next mapping pair. */ - buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1; - } - if (unlikely(buf >= attr_end)) - goto io_error; - /* - * If there is a highest_vcn specified, it must be equal to the final - * vcn in the runlist - 1, or something has gone badly wrong. - */ - deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn); - if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) { -mpa_err: - ntfs_error(vol->sb, "Corrupt mapping pairs array in " - "non-resident attribute."); - goto err_out; - } - /* Setup not mapped runlist element if this is the base extent. 
*/ - if (!attr->data.non_resident.lowest_vcn) { - VCN max_cluster; - - max_cluster = ((sle64_to_cpu( - attr->data.non_resident.allocated_size) + - vol->cluster_size - 1) >> - vol->cluster_size_bits) - 1; - /* - * A highest_vcn of zero means this is a single extent - * attribute so simply terminate the runlist with LCN_ENOENT). - */ - if (deltaxcn) { - /* - * If there is a difference between the highest_vcn and - * the highest cluster, the runlist is either corrupt - * or, more likely, there are more extents following - * this one. - */ - if (deltaxcn < max_cluster) { - ntfs_debug("More extents to follow; deltaxcn " - "= 0x%llx, max_cluster = " - "0x%llx", - (unsigned long long)deltaxcn, - (unsigned long long) - max_cluster); - rl[rlpos].vcn = vcn; - vcn += rl[rlpos].length = max_cluster - - deltaxcn; - rl[rlpos].lcn = LCN_RL_NOT_MAPPED; - rlpos++; - } else if (unlikely(deltaxcn > max_cluster)) { - ntfs_error(vol->sb, "Corrupt attribute. " - "deltaxcn = 0x%llx, " - "max_cluster = 0x%llx", - (unsigned long long)deltaxcn, - (unsigned long long) - max_cluster); - goto mpa_err; - } - } - rl[rlpos].lcn = LCN_ENOENT; - } else /* Not the base extent. There may be more extents to follow. */ - rl[rlpos].lcn = LCN_RL_NOT_MAPPED; - - /* Setup terminating runlist element. */ - rl[rlpos].vcn = vcn; - rl[rlpos].length = (s64)0; - /* If no existing runlist was specified, we are done. */ - if (!old_rl) { - ntfs_debug("Mapping pairs array successfully decompressed:"); - ntfs_debug_dump_runlist(rl); - return rl; - } - /* Now combine the new and old runlists checking for overlaps. 
*/ - old_rl = ntfs_runlists_merge(old_rl, rl); - if (!IS_ERR(old_rl)) - return old_rl; - ntfs_free(rl); - ntfs_error(vol->sb, "Failed to merge runlists."); - return old_rl; -io_error: - ntfs_error(vol->sb, "Corrupt attribute."); -err_out: - ntfs_free(rl); - return ERR_PTR(-EIO); -} - -/** - * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist - * @rl: runlist to use for conversion - * @vcn: vcn to convert - * - * Convert the virtual cluster number @vcn of an attribute into a logical - * cluster number (lcn) of a device using the runlist @rl to map vcns to their - * corresponding lcns. - * - * It is up to the caller to serialize access to the runlist @rl. - * - * Since lcns must be >= 0, we use negative return codes with special meaning: - * - * Return code Meaning / Description - * ================================================== - * LCN_HOLE Hole / not allocated on disk. - * LCN_RL_NOT_MAPPED This is part of the runlist which has not been - * inserted into the runlist yet. - * LCN_ENOENT There is no such vcn in the attribute. - * - * Locking: - The caller must have locked the runlist (for reading or writing). - * - This function does not touch the lock, nor does it modify the - * runlist. - */ -LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) -{ - int i; - - BUG_ON(vcn < 0); - /* - * If rl is NULL, assume that we have found an unmapped runlist. The - * caller can then attempt to map it and fail appropriately if - * necessary. - */ - if (unlikely(!rl)) - return LCN_RL_NOT_MAPPED; - - /* Catch out of lower bounds vcn. */ - if (unlikely(vcn < rl[0].vcn)) - return LCN_ENOENT; - - for (i = 0; likely(rl[i].length); i++) { - if (unlikely(vcn < rl[i+1].vcn)) { - if (likely(rl[i].lcn >= (LCN)0)) - return rl[i].lcn + (vcn - rl[i].vcn); - return rl[i].lcn; - } - } - /* - * The terminator element is setup to the correct value, i.e. one of - * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT. 
- */ - if (likely(rl[i].lcn < (LCN)0)) - return rl[i].lcn; - /* Just in case... We could replace this with BUG() some day. */ - return LCN_ENOENT; -} - -#ifdef NTFS_RW - -/** - * ntfs_rl_find_vcn_nolock - find a vcn in a runlist - * @rl: runlist to search - * @vcn: vcn to find - * - * Find the virtual cluster number @vcn in the runlist @rl and return the - * address of the runlist element containing the @vcn on success. - * - * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of - * the runlist. - * - * Locking: The runlist must be locked on entry. - */ -runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn) -{ - BUG_ON(vcn < 0); - if (unlikely(!rl || vcn < rl[0].vcn)) - return NULL; - while (likely(rl->length)) { - if (unlikely(vcn < rl[1].vcn)) { - if (likely(rl->lcn >= LCN_HOLE)) - return rl; - return NULL; - } - rl++; - } - if (likely(rl->lcn == LCN_ENOENT)) - return rl; - return NULL; -} - -/** - * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number - * @n: number for which to get the number of bytes for - * - * Return the number of bytes required to store @n unambiguously as - * a signed number. - * - * This is used in the context of the mapping pairs array to determine how - * many bytes will be needed in the array to store a given logical cluster - * number (lcn) or a specific run length. - * - * Return the number of bytes written. This function cannot fail. - */ -static inline int ntfs_get_nr_significant_bytes(const s64 n) -{ - s64 l = n; - int i; - s8 j; - - i = 0; - do { - l >>= 8; - i++; - } while (l != 0 && l != -1); - j = (n >> 8 * (i - 1)) & 0xff; - /* If the sign bit is wrong, we need an extra byte. 
*/ - if ((n < 0 && j >= 0) || (n > 0 && j < 0)) - i++; - return i; -} - -/** - * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array - * @vol: ntfs volume (needed for the ntfs version) - * @rl: locked runlist to determine the size of the mapping pairs of - * @first_vcn: first vcn which to include in the mapping pairs array - * @last_vcn: last vcn which to include in the mapping pairs array - * - * Walk the locked runlist @rl and calculate the size in bytes of the mapping - * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and - * finishing with vcn @last_vcn. - * - * A @last_vcn of -1 means end of runlist and in that case the size of the - * mapping pairs array corresponding to the runlist starting at vcn @first_vcn - * and finishing at the end of the runlist is determined. - * - * This for example allows us to allocate a buffer of the right size when - * building the mapping pairs array. - * - * If @rl is NULL, just return 1 (for the single terminator byte). - * - * Return the calculated size in bytes on success. On error, return -errno. - * The following error codes are defined: - * -EINVAL - Run list contains unmapped elements. Make sure to only pass - * fully mapped runlists to this function. - * -EIO - The runlist is corrupt. - * - * Locking: @rl must be locked on entry (either for reading or writing), it - * remains locked throughout, and is left locked upon return. - */ -int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, - const runlist_element *rl, const VCN first_vcn, - const VCN last_vcn) -{ - LCN prev_lcn; - int rls; - bool the_end = false; - - BUG_ON(first_vcn < 0); - BUG_ON(last_vcn < -1); - BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); - if (!rl) { - BUG_ON(first_vcn); - BUG_ON(last_vcn > 0); - return 1; - } - /* Skip to runlist element containing @first_vcn. 
*/ - while (rl->length && first_vcn >= rl[1].vcn) - rl++; - if (unlikely((!rl->length && first_vcn > rl->vcn) || - first_vcn < rl->vcn)) - return -EINVAL; - prev_lcn = 0; - /* Always need the termining zero byte. */ - rls = 1; - /* Do the first partial run if present. */ - if (first_vcn > rl->vcn) { - s64 delta, length = rl->length; - - /* We know rl->length != 0 already. */ - if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) - goto err_out; - /* - * If @stop_vcn is given and finishes inside this run, cap the - * run length. - */ - if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { - s64 s1 = last_vcn + 1; - if (unlikely(rl[1].vcn > s1)) - length = s1 - rl->vcn; - the_end = true; - } - delta = first_vcn - rl->vcn; - /* Header byte + length. */ - rls += 1 + ntfs_get_nr_significant_bytes(length - delta); - /* - * If the logical cluster number (lcn) denotes a hole and we - * are on NTFS 3.0+, we don't store it at all, i.e. we need - * zero space. On earlier NTFS versions we just store the lcn. - * Note: this assumes that on NTFS 1.2-, holes are stored with - * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). - */ - if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { - prev_lcn = rl->lcn; - if (likely(rl->lcn >= 0)) - prev_lcn += delta; - /* Change in lcn. */ - rls += ntfs_get_nr_significant_bytes(prev_lcn); - } - /* Go to next runlist element. */ - rl++; - } - /* Do the full runs. */ - for (; rl->length && !the_end; rl++) { - s64 length = rl->length; - - if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) - goto err_out; - /* - * If @stop_vcn is given and finishes inside this run, cap the - * run length. - */ - if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { - s64 s1 = last_vcn + 1; - if (unlikely(rl[1].vcn > s1)) - length = s1 - rl->vcn; - the_end = true; - } - /* Header byte + length. 
*/ - rls += 1 + ntfs_get_nr_significant_bytes(length); - /* - * If the logical cluster number (lcn) denotes a hole and we - * are on NTFS 3.0+, we don't store it at all, i.e. we need - * zero space. On earlier NTFS versions we just store the lcn. - * Note: this assumes that on NTFS 1.2-, holes are stored with - * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). - */ - if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { - /* Change in lcn. */ - rls += ntfs_get_nr_significant_bytes(rl->lcn - - prev_lcn); - prev_lcn = rl->lcn; - } - } - return rls; -err_out: - if (rl->lcn == LCN_RL_NOT_MAPPED) - rls = -EINVAL; - else - rls = -EIO; - return rls; -} - -/** - * ntfs_write_significant_bytes - write the significant bytes of a number - * @dst: destination buffer to write to - * @dst_max: pointer to last byte of destination buffer for bounds checking - * @n: number whose significant bytes to write - * - * Store in @dst, the minimum bytes of the number @n which are required to - * identify @n unambiguously as a signed number, taking care not to exceed - * @dest_max, the maximum position within @dst to which we are allowed to - * write. - * - * This is used when building the mapping pairs array of a runlist to compress - * a given logical cluster number (lcn) or a specific run length to the minimum - * size possible. - * - * Return the number of bytes written on success. On error, i.e. the - * destination buffer @dst is too small, return -ENOSPC. - */ -static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max, - const s64 n) -{ - s64 l = n; - int i; - s8 j; - - i = 0; - do { - if (unlikely(dst > dst_max)) - goto err_out; - *dst++ = l & 0xffll; - l >>= 8; - i++; - } while (l != 0 && l != -1); - j = (n >> 8 * (i - 1)) & 0xff; - /* If the sign bit is wrong, we need an extra byte. 
*/ - if (n < 0 && j >= 0) { - if (unlikely(dst > dst_max)) - goto err_out; - i++; - *dst = (s8)-1; - } else if (n > 0 && j < 0) { - if (unlikely(dst > dst_max)) - goto err_out; - i++; - *dst = (s8)0; - } - return i; -err_out: - return -ENOSPC; -} - -/** - * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist - * @vol: ntfs volume (needed for the ntfs version) - * @dst: destination buffer to which to write the mapping pairs array - * @dst_len: size of destination buffer @dst in bytes - * @rl: locked runlist for which to build the mapping pairs array - * @first_vcn: first vcn which to include in the mapping pairs array - * @last_vcn: last vcn which to include in the mapping pairs array - * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC - * - * Create the mapping pairs array from the locked runlist @rl, starting at vcn - * @first_vcn and finishing with vcn @last_vcn and save the array in @dst. - * @dst_len is the size of @dst in bytes and it should be at least equal to the - * value obtained by calling ntfs_get_size_for_mapping_pairs(). - * - * A @last_vcn of -1 means end of runlist and in that case the mapping pairs - * array corresponding to the runlist starting at vcn @first_vcn and finishing - * at the end of the runlist is created. - * - * If @rl is NULL, just write a single terminator byte to @dst. - * - * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to - * the first vcn outside the destination buffer. Note that on error, @dst has - * been filled with all the mapping pairs that will fit, thus it can be treated - * as partial success, in that a new attribute extent needs to be created or - * the next extent has to be used and the mapping pairs build has to be - * continued with @first_vcn set to *@stop_vcn. - * - * Return 0 on success and -errno on error. The following error codes are - * defined: - * -EINVAL - Run list contains unmapped elements. 
Make sure to only pass - * fully mapped runlists to this function. - * -EIO - The runlist is corrupt. - * -ENOSPC - The destination buffer is too small. - * - * Locking: @rl must be locked on entry (either for reading or writing), it - * remains locked throughout, and is left locked upon return. - */ -int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, - const int dst_len, const runlist_element *rl, - const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn) -{ - LCN prev_lcn; - s8 *dst_max, *dst_next; - int err = -ENOSPC; - bool the_end = false; - s8 len_len, lcn_len; - - BUG_ON(first_vcn < 0); - BUG_ON(last_vcn < -1); - BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); - BUG_ON(dst_len < 1); - if (!rl) { - BUG_ON(first_vcn); - BUG_ON(last_vcn > 0); - if (stop_vcn) - *stop_vcn = 0; - /* Terminator byte. */ - *dst = 0; - return 0; - } - /* Skip to runlist element containing @first_vcn. */ - while (rl->length && first_vcn >= rl[1].vcn) - rl++; - if (unlikely((!rl->length && first_vcn > rl->vcn) || - first_vcn < rl->vcn)) - return -EINVAL; - /* - * @dst_max is used for bounds checking in - * ntfs_write_significant_bytes(). - */ - dst_max = dst + dst_len - 1; - prev_lcn = 0; - /* Do the first partial run if present. */ - if (first_vcn > rl->vcn) { - s64 delta, length = rl->length; - - /* We know rl->length != 0 already. */ - if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) - goto err_out; - /* - * If @stop_vcn is given and finishes inside this run, cap the - * run length. - */ - if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { - s64 s1 = last_vcn + 1; - if (unlikely(rl[1].vcn > s1)) - length = s1 - rl->vcn; - the_end = true; - } - delta = first_vcn - rl->vcn; - /* Write length. */ - len_len = ntfs_write_significant_bytes(dst + 1, dst_max, - length - delta); - if (unlikely(len_len < 0)) - goto size_err; - /* - * If the logical cluster number (lcn) denotes a hole and we - * are on NTFS 3.0+, we don't store it at all, i.e. we need - * zero space. 
On earlier NTFS versions we just write the lcn - * change. FIXME: Do we need to write the lcn change or just - * the lcn in that case? Not sure as I have never seen this - * case on NT4. - We assume that we just need to write the lcn - * change until someone tells us otherwise... (AIA) - */ - if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { - prev_lcn = rl->lcn; - if (likely(rl->lcn >= 0)) - prev_lcn += delta; - /* Write change in lcn. */ - lcn_len = ntfs_write_significant_bytes(dst + 1 + - len_len, dst_max, prev_lcn); - if (unlikely(lcn_len < 0)) - goto size_err; - } else - lcn_len = 0; - dst_next = dst + len_len + lcn_len + 1; - if (unlikely(dst_next > dst_max)) - goto size_err; - /* Update header byte. */ - *dst = lcn_len << 4 | len_len; - /* Position at next mapping pairs array element. */ - dst = dst_next; - /* Go to next runlist element. */ - rl++; - } - /* Do the full runs. */ - for (; rl->length && !the_end; rl++) { - s64 length = rl->length; - - if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) - goto err_out; - /* - * If @stop_vcn is given and finishes inside this run, cap the - * run length. - */ - if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { - s64 s1 = last_vcn + 1; - if (unlikely(rl[1].vcn > s1)) - length = s1 - rl->vcn; - the_end = true; - } - /* Write length. */ - len_len = ntfs_write_significant_bytes(dst + 1, dst_max, - length); - if (unlikely(len_len < 0)) - goto size_err; - /* - * If the logical cluster number (lcn) denotes a hole and we - * are on NTFS 3.0+, we don't store it at all, i.e. we need - * zero space. On earlier NTFS versions we just write the lcn - * change. FIXME: Do we need to write the lcn change or just - * the lcn in that case? Not sure as I have never seen this - * case on NT4. - We assume that we just need to write the lcn - * change until someone tells us otherwise... (AIA) - */ - if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { - /* Write change in lcn. 
*/ - lcn_len = ntfs_write_significant_bytes(dst + 1 + - len_len, dst_max, rl->lcn - prev_lcn); - if (unlikely(lcn_len < 0)) - goto size_err; - prev_lcn = rl->lcn; - } else - lcn_len = 0; - dst_next = dst + len_len + lcn_len + 1; - if (unlikely(dst_next > dst_max)) - goto size_err; - /* Update header byte. */ - *dst = lcn_len << 4 | len_len; - /* Position at next mapping pairs array element. */ - dst = dst_next; - } - /* Success. */ - err = 0; -size_err: - /* Set stop vcn. */ - if (stop_vcn) - *stop_vcn = rl->vcn; - /* Add terminator byte. */ - *dst = 0; - return err; -err_out: - if (rl->lcn == LCN_RL_NOT_MAPPED) - err = -EINVAL; - else - err = -EIO; - return err; -} - -/** - * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn - * @vol: ntfs volume (needed for error output) - * @runlist: runlist to truncate - * @new_length: the new length of the runlist in VCNs - * - * Truncate the runlist described by @runlist as well as the memory buffer - * holding the runlist elements to a length of @new_length VCNs. - * - * If @new_length lies within the runlist, the runlist elements with VCNs of - * @new_length and above are discarded. As a special case if @new_length is - * zero, the runlist is discarded and set to NULL. - * - * If @new_length lies beyond the runlist, a sparse runlist element is added to - * the end of the runlist @runlist or if the last runlist element is a sparse - * one already, this is extended. - * - * Note, no checking is done for unmapped runlist elements. It is assumed that - * the caller has mapped any elements that need to be mapped already. - * - * Return 0 on success and -errno on error. - * - * Locking: The caller must hold @runlist->lock for writing. 
- */ -int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, - const s64 new_length) -{ - runlist_element *rl; - int old_size; - - ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length); - BUG_ON(!runlist); - BUG_ON(new_length < 0); - rl = runlist->rl; - if (!new_length) { - ntfs_debug("Freeing runlist."); - runlist->rl = NULL; - if (rl) - ntfs_free(rl); - return 0; - } - if (unlikely(!rl)) { - /* - * Create a runlist consisting of a sparse runlist element of - * length @new_length followed by a terminator runlist element. - */ - rl = ntfs_malloc_nofs(PAGE_SIZE); - if (unlikely(!rl)) { - ntfs_error(vol->sb, "Not enough memory to allocate " - "runlist element buffer."); - return -ENOMEM; - } - runlist->rl = rl; - rl[1].length = rl->vcn = 0; - rl->lcn = LCN_HOLE; - rl[1].vcn = rl->length = new_length; - rl[1].lcn = LCN_ENOENT; - return 0; - } - BUG_ON(new_length < rl->vcn); - /* Find @new_length in the runlist. */ - while (likely(rl->length && new_length >= rl[1].vcn)) - rl++; - /* - * If not at the end of the runlist we need to shrink it. - * If at the end of the runlist we need to expand it. - */ - if (rl->length) { - runlist_element *trl; - bool is_end; - - ntfs_debug("Shrinking runlist."); - /* Determine the runlist size. */ - trl = rl + 1; - while (likely(trl->length)) - trl++; - old_size = trl - runlist->rl + 1; - /* Truncate the run. */ - rl->length = new_length - rl->vcn; - /* - * If a run was partially truncated, make the following runlist - * element a terminator. - */ - is_end = false; - if (rl->length) { - rl++; - if (!rl->length) - is_end = true; - rl->vcn = new_length; - rl->length = 0; - } - rl->lcn = LCN_ENOENT; - /* Reallocate memory if necessary. */ - if (!is_end) { - int new_size = rl - runlist->rl + 1; - rl = ntfs_rl_realloc(runlist->rl, old_size, new_size); - if (IS_ERR(rl)) - ntfs_warning(vol->sb, "Failed to shrink " - "runlist buffer. 
This just " - "wastes a bit of memory " - "temporarily so we ignore it " - "and return success."); - else - runlist->rl = rl; - } - } else if (likely(/* !rl->length && */ new_length > rl->vcn)) { - ntfs_debug("Expanding runlist."); - /* - * If there is a previous runlist element and it is a sparse - * one, extend it. Otherwise need to add a new, sparse runlist - * element. - */ - if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE)) - (rl - 1)->length = new_length - (rl - 1)->vcn; - else { - /* Determine the runlist size. */ - old_size = rl - runlist->rl + 1; - /* Reallocate memory if necessary. */ - rl = ntfs_rl_realloc(runlist->rl, old_size, - old_size + 1); - if (IS_ERR(rl)) { - ntfs_error(vol->sb, "Failed to expand runlist " - "buffer, aborting."); - return PTR_ERR(rl); - } - runlist->rl = rl; - /* - * Set @rl to the same runlist element in the new - * runlist as before in the old runlist. - */ - rl += old_size - 1; - /* Add a new, sparse runlist element. */ - rl->lcn = LCN_HOLE; - rl->length = new_length - rl->vcn; - /* Add a new terminator runlist element. */ - rl++; - rl->length = 0; - } - rl->vcn = new_length; - rl->lcn = LCN_ENOENT; - } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ { - /* Runlist already has same size as requested. */ - rl->lcn = LCN_ENOENT; - } - ntfs_debug("Done."); - return 0; -} - -/** - * ntfs_rl_punch_nolock - punch a hole into a runlist - * @vol: ntfs volume (needed for error output) - * @runlist: runlist to punch a hole into - * @start: starting VCN of the hole to be created - * @length: size of the hole to be created in units of clusters - * - * Punch a hole into the runlist @runlist starting at VCN @start and of size - * @length clusters. - * - * Return 0 on success and -errno on error, in which case @runlist has not been - * modified. - * - * If @start and/or @start + @length are outside the runlist return error code - * -ENOENT. 
- * - * If the runlist contains unmapped or error elements between @start and @start - * + @length return error code -EINVAL. - * - * Locking: The caller must hold @runlist->lock for writing. - */ -int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, - const VCN start, const s64 length) -{ - const VCN end = start + length; - s64 delta; - runlist_element *rl, *rl_end, *rl_real_end, *trl; - int old_size; - bool lcn_fixup = false; - - ntfs_debug("Entering for start 0x%llx, length 0x%llx.", - (long long)start, (long long)length); - BUG_ON(!runlist); - BUG_ON(start < 0); - BUG_ON(length < 0); - BUG_ON(end < 0); - rl = runlist->rl; - if (unlikely(!rl)) { - if (likely(!start && !length)) - return 0; - return -ENOENT; - } - /* Find @start in the runlist. */ - while (likely(rl->length && start >= rl[1].vcn)) - rl++; - rl_end = rl; - /* Find @end in the runlist. */ - while (likely(rl_end->length && end >= rl_end[1].vcn)) { - /* Verify there are no unmapped or error elements. */ - if (unlikely(rl_end->lcn < LCN_HOLE)) - return -EINVAL; - rl_end++; - } - /* Check the last element. */ - if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE)) - return -EINVAL; - /* This covers @start being out of bounds, too. */ - if (!rl_end->length && end > rl_end->vcn) - return -ENOENT; - if (!length) - return 0; - if (!rl->length) - return -ENOENT; - rl_real_end = rl_end; - /* Determine the runlist size. */ - while (likely(rl_real_end->length)) - rl_real_end++; - old_size = rl_real_end - runlist->rl + 1; - /* If @start is in a hole simply extend the hole. */ - if (rl->lcn == LCN_HOLE) { - /* - * If both @start and @end are in the same sparse run, we are - * done. - */ - if (end <= rl[1].vcn) { - ntfs_debug("Done (requested hole is already sparse)."); - return 0; - } -extend_hole: - /* Extend the hole. */ - rl->length = end - rl->vcn; - /* If @end is in a hole, merge it with the current one. 
*/ - if (rl_end->lcn == LCN_HOLE) { - rl_end++; - rl->length = rl_end->vcn - rl->vcn; - } - /* We have done the hole. Now deal with the remaining tail. */ - rl++; - /* Cut out all runlist elements up to @end. */ - if (rl < rl_end) - memmove(rl, rl_end, (rl_real_end - rl_end + 1) * - sizeof(*rl)); - /* Adjust the beginning of the tail if necessary. */ - if (end > rl->vcn) { - delta = end - rl->vcn; - rl->vcn = end; - rl->length -= delta; - /* Only adjust the lcn if it is real. */ - if (rl->lcn >= 0) - rl->lcn += delta; - } -shrink_allocation: - /* Reallocate memory if the allocation changed. */ - if (rl < rl_end) { - rl = ntfs_rl_realloc(runlist->rl, old_size, - old_size - (rl_end - rl)); - if (IS_ERR(rl)) - ntfs_warning(vol->sb, "Failed to shrink " - "runlist buffer. This just " - "wastes a bit of memory " - "temporarily so we ignore it " - "and return success."); - else - runlist->rl = rl; - } - ntfs_debug("Done (extend hole)."); - return 0; - } - /* - * If @start is at the beginning of a run things are easier as there is - * no need to split the first run. - */ - if (start == rl->vcn) { - /* - * @start is at the beginning of a run. - * - * If the previous run is sparse, extend its hole. - * - * If @end is not in the same run, switch the run to be sparse - * and extend the newly created hole. - * - * Thus both of these cases reduce the problem to the above - * case of "@start is in a hole". - */ - if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) { - rl--; - goto extend_hole; - } - if (end >= rl[1].vcn) { - rl->lcn = LCN_HOLE; - goto extend_hole; - } - /* - * The final case is when @end is in the same run as @start. - * For this need to split the run into two. One run for the - * sparse region between the beginning of the old run, i.e. - * @start, and @end and one for the remaining non-sparse - * region, i.e. between @end and the end of the old run. 
- */ - trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); - if (IS_ERR(trl)) - goto enomem_out; - old_size++; - if (runlist->rl != trl) { - rl = trl + (rl - runlist->rl); - rl_end = trl + (rl_end - runlist->rl); - rl_real_end = trl + (rl_real_end - runlist->rl); - runlist->rl = trl; - } -split_end: - /* Shift all the runs up by one. */ - memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl)); - /* Finally, setup the two split runs. */ - rl->lcn = LCN_HOLE; - rl->length = length; - rl++; - rl->vcn += length; - /* Only adjust the lcn if it is real. */ - if (rl->lcn >= 0 || lcn_fixup) - rl->lcn += length; - rl->length -= length; - ntfs_debug("Done (split one)."); - return 0; - } - /* - * @start is neither in a hole nor at the beginning of a run. - * - * If @end is in a hole, things are easier as simply truncating the run - * @start is in to end at @start - 1, deleting all runs after that up - * to @end, and finally extending the beginning of the run @end is in - * to be @start is all that is needed. - */ - if (rl_end->lcn == LCN_HOLE) { - /* Truncate the run containing @start. */ - rl->length = start - rl->vcn; - rl++; - /* Cut out all runlist elements up to @end. */ - if (rl < rl_end) - memmove(rl, rl_end, (rl_real_end - rl_end + 1) * - sizeof(*rl)); - /* Extend the beginning of the run @end is in to be @start. */ - rl->vcn = start; - rl->length = rl[1].vcn - start; - goto shrink_allocation; - } - /* - * If @end is not in a hole there are still two cases to distinguish. - * Either @end is or is not in the same run as @start. - * - * The second case is easier as it can be reduced to an already solved - * problem by truncating the run @start is in to end at @start - 1. - * Then, if @end is in the next run need to split the run into a sparse - * run followed by a non-sparse run (already covered above) and if @end - * is not in the next run switching it to be sparse, again reduces the - * problem to the already covered case of "@start is in a hole". 
- */ - if (end >= rl[1].vcn) { - /* - * If @end is not in the next run, reduce the problem to the - * case of "@start is in a hole". - */ - if (rl[1].length && end >= rl[2].vcn) { - /* Truncate the run containing @start. */ - rl->length = start - rl->vcn; - rl++; - rl->vcn = start; - rl->lcn = LCN_HOLE; - goto extend_hole; - } - trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); - if (IS_ERR(trl)) - goto enomem_out; - old_size++; - if (runlist->rl != trl) { - rl = trl + (rl - runlist->rl); - rl_end = trl + (rl_end - runlist->rl); - rl_real_end = trl + (rl_real_end - runlist->rl); - runlist->rl = trl; - } - /* Truncate the run containing @start. */ - rl->length = start - rl->vcn; - rl++; - /* - * @end is in the next run, reduce the problem to the case - * where "@start is at the beginning of a run and @end is in - * the same run as @start". - */ - delta = rl->vcn - start; - rl->vcn = start; - if (rl->lcn >= 0) { - rl->lcn -= delta; - /* Need this in case the lcn just became negative. */ - lcn_fixup = true; - } - rl->length += delta; - goto split_end; - } - /* - * The first case from above, i.e. @end is in the same run as @start. - * We need to split the run into three. One run for the non-sparse - * region between the beginning of the old run and @start, one for the - * sparse region between @start and @end, and one for the remaining - * non-sparse region, i.e. between @end and the end of the old run. - */ - trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2); - if (IS_ERR(trl)) - goto enomem_out; - old_size += 2; - if (runlist->rl != trl) { - rl = trl + (rl - runlist->rl); - rl_end = trl + (rl_end - runlist->rl); - rl_real_end = trl + (rl_real_end - runlist->rl); - runlist->rl = trl; - } - /* Shift all the runs up by two. */ - memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl)); - /* Finally, setup the three split runs. 
*/ - rl->length = start - rl->vcn; - rl++; - rl->vcn = start; - rl->lcn = LCN_HOLE; - rl->length = length; - rl++; - delta = end - rl->vcn; - rl->vcn = end; - rl->lcn += delta; - rl->length -= delta; - ntfs_debug("Done (split both)."); - return 0; -enomem_out: - ntfs_error(vol->sb, "Not enough memory to extend runlist buffer."); - return -ENOMEM; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h deleted file mode 100644 index 38de0a375f59..000000000000 --- a/fs/ntfs/runlist.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * runlist.h - Defines for runlist handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_RUNLIST_H -#define _LINUX_NTFS_RUNLIST_H - -#include "types.h" -#include "layout.h" -#include "volume.h" - -/** - * runlist_element - in memory vcn to lcn mapping array element - * @vcn: starting vcn of the current array element - * @lcn: starting lcn of the current array element - * @length: length in clusters of the current array element - * - * The last vcn (in fact the last vcn + 1) is reached when length == 0. - * - * When lcn == -1 this means that the count vcns starting at vcn are not - * physically allocated (i.e. this is a hole / data is sparse). - */ -typedef struct { /* In memory vcn to lcn mapping structure element. */ - VCN vcn; /* vcn = Starting virtual cluster number. */ - LCN lcn; /* lcn = Starting logical cluster number. */ - s64 length; /* Run length in clusters. 
*/ -} runlist_element; - -/** - * runlist - in memory vcn to lcn mapping array including a read/write lock - * @rl: pointer to an array of runlist elements - * @lock: read/write spinlock for serializing access to @rl - * - */ -typedef struct { - runlist_element *rl; - struct rw_semaphore lock; -} runlist; - -static inline void ntfs_init_runlist(runlist *rl) -{ - rl->rl = NULL; - init_rwsem(&rl->lock); -} - -typedef enum { - LCN_HOLE = -1, /* Keep this as highest value or die! */ - LCN_RL_NOT_MAPPED = -2, - LCN_ENOENT = -3, - LCN_ENOMEM = -4, - LCN_EIO = -5, -} LCN_SPECIAL_VALUES; - -extern runlist_element *ntfs_runlists_merge(runlist_element *drl, - runlist_element *srl); - -extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, - const ATTR_RECORD *attr, runlist_element *old_rl); - -extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn); - -#ifdef NTFS_RW - -extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, - const VCN vcn); - -extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, - const runlist_element *rl, const VCN first_vcn, - const VCN last_vcn); - -extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, - const int dst_len, const runlist_element *rl, - const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn); - -extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol, - runlist *const runlist, const s64 new_length); - -int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, - const VCN start, const s64 length); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_RUNLIST_H */ diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c deleted file mode 100644 index 56a7d5bd33e4..000000000000 --- a/fs/ntfs/super.c +++ /dev/null @@ -1,3202 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. 
- * Copyright (c) 2001,2002 Richard Russon - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/stddef.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/spinlock.h> -#include <linux/blkdev.h> /* For bdev_logical_block_size(). */ -#include <linux/backing-dev.h> -#include <linux/buffer_head.h> -#include <linux/vfs.h> -#include <linux/moduleparam.h> -#include <linux/bitmap.h> - -#include "sysctl.h" -#include "logfile.h" -#include "quota.h" -#include "usnjrnl.h" -#include "dir.h" -#include "debug.h" -#include "index.h" -#include "inode.h" -#include "aops.h" -#include "layout.h" -#include "malloc.h" -#include "ntfs.h" - -/* Number of mounted filesystems which have compression enabled. */ -static unsigned long ntfs_nr_compression_users; - -/* A global default upcase table and a corresponding reference count. */ -static ntfschar *default_upcase; -static unsigned long ntfs_nr_upcase_users; - -/* Error constants/strings used in inode.c::ntfs_show_options(). */ -typedef enum { - /* One of these must be present, default is ON_ERRORS_CONTINUE. */ - ON_ERRORS_PANIC = 0x01, - ON_ERRORS_REMOUNT_RO = 0x02, - ON_ERRORS_CONTINUE = 0x04, - /* Optional, can be combined with any of the above. */ - ON_ERRORS_RECOVER = 0x10, -} ON_ERRORS_ACTIONS; - -const option_t on_errors_arr[] = { - { ON_ERRORS_PANIC, "panic" }, - { ON_ERRORS_REMOUNT_RO, "remount-ro", }, - { ON_ERRORS_CONTINUE, "continue", }, - { ON_ERRORS_RECOVER, "recover" }, - { 0, NULL } -}; - -/** - * simple_getbool - convert input string to a boolean value - * @s: input string to convert - * @setval: where to store the output boolean value - * - * Copied from old ntfs driver (which copied from vfat driver). - * - * "1", "yes", "true", or an empty string are converted to %true. - * "0", "no", and "false" are converted to %false. - * - * Return: %1 if the string is converted or was empty and *setval contains it; - * %0 if the string was not valid. 
- */ -static int simple_getbool(char *s, bool *setval) -{ - if (s) { - if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true")) - *setval = true; - else if (!strcmp(s, "0") || !strcmp(s, "no") || - !strcmp(s, "false")) - *setval = false; - else - return 0; - } else - *setval = true; - return 1; -} - -/** - * parse_options - parse the (re)mount options - * @vol: ntfs volume - * @opt: string containing the (re)mount options - * - * Parse the recognized options in @opt for the ntfs volume described by @vol. - */ -static bool parse_options(ntfs_volume *vol, char *opt) -{ - char *p, *v, *ov; - static char *utf8 = "utf8"; - int errors = 0, sloppy = 0; - kuid_t uid = INVALID_UID; - kgid_t gid = INVALID_GID; - umode_t fmask = (umode_t)-1, dmask = (umode_t)-1; - int mft_zone_multiplier = -1, on_errors = -1; - int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; - struct nls_table *nls_map = NULL, *old_nls; - - /* I am lazy... (-8 */ -#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value) \ - if (!strcmp(p, option)) { \ - if (!v || !*v) \ - variable = default_value; \ - else { \ - variable = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - } \ - } -#define NTFS_GETOPT(option, variable) \ - if (!strcmp(p, option)) { \ - if (!v || !*v) \ - goto needs_arg; \ - variable = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - } -#define NTFS_GETOPT_UID(option, variable) \ - if (!strcmp(p, option)) { \ - uid_t uid_value; \ - if (!v || !*v) \ - goto needs_arg; \ - uid_value = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - variable = make_kuid(current_user_ns(), uid_value); \ - if (!uid_valid(variable)) \ - goto needs_val; \ - } -#define NTFS_GETOPT_GID(option, variable) \ - if (!strcmp(p, option)) { \ - gid_t gid_value; \ - if (!v || !*v) \ - goto needs_arg; \ - gid_value = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - variable = make_kgid(current_user_ns(), gid_value); \ - if 
(!gid_valid(variable)) \ - goto needs_val; \ - } -#define NTFS_GETOPT_OCTAL(option, variable) \ - if (!strcmp(p, option)) { \ - if (!v || !*v) \ - goto needs_arg; \ - variable = simple_strtoul(ov = v, &v, 8); \ - if (*v) \ - goto needs_val; \ - } -#define NTFS_GETOPT_BOOL(option, variable) \ - if (!strcmp(p, option)) { \ - bool val; \ - if (!simple_getbool(v, &val)) \ - goto needs_bool; \ - variable = val; \ - } -#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array) \ - if (!strcmp(p, option)) { \ - int _i; \ - if (!v || !*v) \ - goto needs_arg; \ - ov = v; \ - if (variable == -1) \ - variable = 0; \ - for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \ - if (!strcmp(opt_array[_i].str, v)) { \ - variable |= opt_array[_i].val; \ - break; \ - } \ - if (!opt_array[_i].str || !*opt_array[_i].str) \ - goto needs_val; \ - } - if (!opt || !*opt) - goto no_mount_options; - ntfs_debug("Entering with mount options string: %s", opt); - while ((p = strsep(&opt, ","))) { - if ((v = strchr(p, '='))) - *v++ = 0; - NTFS_GETOPT_UID("uid", uid) - else NTFS_GETOPT_GID("gid", gid) - else NTFS_GETOPT_OCTAL("umask", fmask = dmask) - else NTFS_GETOPT_OCTAL("fmask", fmask) - else NTFS_GETOPT_OCTAL("dmask", dmask) - else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier) - else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true) - else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files) - else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive) - else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse) - else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors, - on_errors_arr) - else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes")) - ntfs_warning(vol->sb, "Ignoring obsolete option %s.", - p); - else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) { - if (!strcmp(p, "iocharset")) - ntfs_warning(vol->sb, "Option iocharset is " - "deprecated. 
Please use " - "option nls=<charsetname> in " - "the future."); - if (!v || !*v) - goto needs_arg; -use_utf8: - old_nls = nls_map; - nls_map = load_nls(v); - if (!nls_map) { - if (!old_nls) { - ntfs_error(vol->sb, "NLS character set " - "%s not found.", v); - return false; - } - ntfs_error(vol->sb, "NLS character set %s not " - "found. Using previous one %s.", - v, old_nls->charset); - nls_map = old_nls; - } else /* nls_map */ { - unload_nls(old_nls); - } - } else if (!strcmp(p, "utf8")) { - bool val = false; - ntfs_warning(vol->sb, "Option utf8 is no longer " - "supported, using option nls=utf8. Please " - "use option nls=utf8 in the future and " - "make sure utf8 is compiled either as a " - "module or into the kernel."); - if (!v || !*v) - val = true; - else if (!simple_getbool(v, &val)) - goto needs_bool; - if (val) { - v = utf8; - goto use_utf8; - } - } else { - ntfs_error(vol->sb, "Unrecognized mount option %s.", p); - if (errors < INT_MAX) - errors++; - } -#undef NTFS_GETOPT_OPTIONS_ARRAY -#undef NTFS_GETOPT_BOOL -#undef NTFS_GETOPT -#undef NTFS_GETOPT_WITH_DEFAULT - } -no_mount_options: - if (errors && !sloppy) - return false; - if (sloppy) - ntfs_warning(vol->sb, "Sloppy option given. Ignoring " - "unrecognized mount option(s) and continuing."); - /* Keep this first! 
*/ - if (on_errors != -1) { - if (!on_errors) { - ntfs_error(vol->sb, "Invalid errors option argument " - "or bug in options parser."); - return false; - } - } - if (nls_map) { - if (vol->nls_map && vol->nls_map != nls_map) { - ntfs_error(vol->sb, "Cannot change NLS character set " - "on remount."); - return false; - } /* else (!vol->nls_map) */ - ntfs_debug("Using NLS character set %s.", nls_map->charset); - vol->nls_map = nls_map; - } else /* (!nls_map) */ { - if (!vol->nls_map) { - vol->nls_map = load_nls_default(); - if (!vol->nls_map) { - ntfs_error(vol->sb, "Failed to load default " - "NLS character set."); - return false; - } - ntfs_debug("Using default NLS character set (%s).", - vol->nls_map->charset); - } - } - if (mft_zone_multiplier != -1) { - if (vol->mft_zone_multiplier && vol->mft_zone_multiplier != - mft_zone_multiplier) { - ntfs_error(vol->sb, "Cannot change mft_zone_multiplier " - "on remount."); - return false; - } - if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) { - ntfs_error(vol->sb, "Invalid mft_zone_multiplier. " - "Using default value, i.e. 
1."); - mft_zone_multiplier = 1; - } - vol->mft_zone_multiplier = mft_zone_multiplier; - } - if (!vol->mft_zone_multiplier) - vol->mft_zone_multiplier = 1; - if (on_errors != -1) - vol->on_errors = on_errors; - if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER) - vol->on_errors |= ON_ERRORS_CONTINUE; - if (uid_valid(uid)) - vol->uid = uid; - if (gid_valid(gid)) - vol->gid = gid; - if (fmask != (umode_t)-1) - vol->fmask = fmask; - if (dmask != (umode_t)-1) - vol->dmask = dmask; - if (show_sys_files != -1) { - if (show_sys_files) - NVolSetShowSystemFiles(vol); - else - NVolClearShowSystemFiles(vol); - } - if (case_sensitive != -1) { - if (case_sensitive) - NVolSetCaseSensitive(vol); - else - NVolClearCaseSensitive(vol); - } - if (disable_sparse != -1) { - if (disable_sparse) - NVolClearSparseEnabled(vol); - else { - if (!NVolSparseEnabled(vol) && - vol->major_ver && vol->major_ver < 3) - ntfs_warning(vol->sb, "Not enabling sparse " - "support due to NTFS volume " - "version %i.%i (need at least " - "version 3.0).", vol->major_ver, - vol->minor_ver); - else - NVolSetSparseEnabled(vol); - } - } - return true; -needs_arg: - ntfs_error(vol->sb, "The %s option requires an argument.", p); - return false; -needs_bool: - ntfs_error(vol->sb, "The %s option requires a boolean argument.", p); - return false; -needs_val: - ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov); - return false; -} - -#ifdef NTFS_RW - -/** - * ntfs_write_volume_flags - write new flags to the volume information flags - * @vol: ntfs volume on which to modify the flags - * @flags: new flags value for the volume information flags - * - * Internal function. You probably want to use ntfs_{set,clear}_volume_flags() - * instead (see below). - * - * Replace the volume information flags on the volume @vol with the value - * supplied in @flags. 
Note, this overwrites the volume information flags, so - * make sure to combine the flags you want to modify with the old flags and use - * the result when calling ntfs_write_volume_flags(). - * - * Return 0 on success and -errno on error. - */ -static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags) -{ - ntfs_inode *ni = NTFS_I(vol->vol_ino); - MFT_RECORD *m; - VOLUME_INFORMATION *vi; - ntfs_attr_search_ctx *ctx; - int err; - - ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.", - le16_to_cpu(vol->vol_flags), le16_to_cpu(flags)); - if (vol->vol_flags == flags) - goto done; - BUG_ON(!ni); - m = map_mft_record(ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ni, m); - if (!ctx) { - err = -ENOMEM; - goto put_unm_err_out; - } - err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, - ctx); - if (err) - goto put_unm_err_out; - vi = (VOLUME_INFORMATION*)((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - vol->vol_flags = vi->flags = flags; - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); -done: - ntfs_debug("Done."); - return 0; -put_unm_err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); -err_out: - ntfs_error(vol->sb, "Failed with error code %i.", -err); - return err; -} - -/** - * ntfs_set_volume_flags - set bits in the volume information flags - * @vol: ntfs volume on which to modify the flags - * @flags: flags to set on the volume - * - * Set the bits in @flags in the volume information flags on the volume @vol. - * - * Return 0 on success and -errno on error. 
- */ -static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) -{ - flags &= VOLUME_FLAGS_MASK; - return ntfs_write_volume_flags(vol, vol->vol_flags | flags); -} - -/** - * ntfs_clear_volume_flags - clear bits in the volume information flags - * @vol: ntfs volume on which to modify the flags - * @flags: flags to clear on the volume - * - * Clear the bits in @flags in the volume information flags on the volume @vol. - * - * Return 0 on success and -errno on error. - */ -static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) -{ - flags &= VOLUME_FLAGS_MASK; - flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags)); - return ntfs_write_volume_flags(vol, flags); -} - -#endif /* NTFS_RW */ - -/** - * ntfs_remount - change the mount options of a mounted ntfs filesystem - * @sb: superblock of mounted ntfs filesystem - * @flags: remount flags - * @opt: remount options string - * - * Change the mount options of an already mounted ntfs filesystem. - * - * NOTE: The VFS sets the @sb->s_flags remount flags to @flags after - * ntfs_remount() returns successfully (i.e. returns 0). Otherwise, - * @sb->s_flags are not changed. - */ -static int ntfs_remount(struct super_block *sb, int *flags, char *opt) -{ - ntfs_volume *vol = NTFS_SB(sb); - - ntfs_debug("Entering with remount options string: %s", opt); - - sync_filesystem(sb); - -#ifndef NTFS_RW - /* For read-only compiled driver, enforce read-only flag. */ - *flags |= SB_RDONLY; -#else /* NTFS_RW */ - /* - * For the read-write compiled driver, if we are remounting read-write, - * make sure there are no volume errors and that no unsupported volume - * flags are set. Also, empty the logfile journal as it would become - * stale as soon as something is written to the volume and mark the - * volume dirty so that chkdsk is run if the volume is not umounted - * cleanly. Finally, mark the quotas out of date so Windows rescans - * the volume on boot and updates them. 
- * - * When remounting read-only, mark the volume clean if no volume errors - * have occurred. - */ - if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { - static const char *es = ". Cannot remount read-write."; - - /* Remounting read-write. */ - if (NVolErrors(vol)) { - ntfs_error(sb, "Volume has errors and is read-only%s", - es); - return -EROFS; - } - if (vol->vol_flags & VOLUME_IS_DIRTY) { - ntfs_error(sb, "Volume is dirty and read-only%s", es); - return -EROFS; - } - if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { - ntfs_error(sb, "Volume has been modified by chkdsk " - "and is read-only%s", es); - return -EROFS; - } - if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { - ntfs_error(sb, "Volume has unsupported flags set " - "(0x%x) and is read-only%s", - (unsigned)le16_to_cpu(vol->vol_flags), - es); - return -EROFS; - } - if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { - ntfs_error(sb, "Failed to set dirty bit in volume " - "information flags%s", es); - return -EROFS; - } -#if 0 - // TODO: Enable this code once we start modifying anything that - // is different between NTFS 1.2 and 3.x... - /* Set NT4 compatibility flag on newer NTFS version volumes. */ - if ((vol->major_ver > 1)) { - if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { - ntfs_error(sb, "Failed to set NT4 " - "compatibility flag%s", es); - NVolSetErrors(vol); - return -EROFS; - } - } -#endif - if (!ntfs_empty_logfile(vol->logfile_ino)) { - ntfs_error(sb, "Failed to empty journal $LogFile%s", - es); - NVolSetErrors(vol); - return -EROFS; - } - if (!ntfs_mark_quotas_out_of_date(vol)) { - ntfs_error(sb, "Failed to mark quotas out of date%s", - es); - NVolSetErrors(vol); - return -EROFS; - } - if (!ntfs_stamp_usnjrnl(vol)) { - ntfs_error(sb, "Failed to stamp transaction log " - "($UsnJrnl)%s", es); - NVolSetErrors(vol); - return -EROFS; - } - } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { - /* Remounting read-only. 
*/ - if (!NVolErrors(vol)) { - if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) - ntfs_warning(sb, "Failed to clear dirty bit " - "in volume information " - "flags. Run chkdsk."); - } - } -#endif /* NTFS_RW */ - - // TODO: Deal with *flags. - - if (!parse_options(vol, opt)) - return -EINVAL; - - ntfs_debug("Done."); - return 0; -} - -/** - * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector - * @sb: Super block of the device to which @b belongs. - * @b: Boot sector of device @sb to check. - * @silent: If 'true', all output will be silenced. - * - * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot - * sector. Returns 'true' if it is valid and 'false' if not. - * - * @sb is only needed for warning/error output, i.e. it can be NULL when silent - * is 'true'. - */ -static bool is_boot_sector_ntfs(const struct super_block *sb, - const NTFS_BOOT_SECTOR *b, const bool silent) -{ - /* - * Check that checksum == sum of u32 values from b to the checksum - * field. If checksum is zero, no checking is done. We will work when - * the checksum test fails, since some utilities update the boot sector - * ignoring the checksum which leaves the checksum out-of-date. We - * report a warning if this is the case. - */ - if ((void*)b < (void*)&b->checksum && b->checksum && !silent) { - le32 *u; - u32 i; - - for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u) - i += le32_to_cpup(u); - if (le32_to_cpu(b->checksum) != i) - ntfs_warning(sb, "Invalid boot sector checksum."); - } - /* Check OEMidentifier is "NTFS " */ - if (b->oem_id != magicNTFS) - goto not_ntfs; - /* Check bytes per sector value is between 256 and 4096. */ - if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 || - le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000) - goto not_ntfs; - /* Check sectors per cluster value is valid. 
*/ - switch (b->bpb.sectors_per_cluster) { - case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128: - break; - default: - goto not_ntfs; - } - /* Check the cluster size is not above the maximum (64kiB). */ - if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) * - b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE) - goto not_ntfs; - /* Check reserved/unused fields are really zero. */ - if (le16_to_cpu(b->bpb.reserved_sectors) || - le16_to_cpu(b->bpb.root_entries) || - le16_to_cpu(b->bpb.sectors) || - le16_to_cpu(b->bpb.sectors_per_fat) || - le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats) - goto not_ntfs; - /* Check clusters per file mft record value is valid. */ - if ((u8)b->clusters_per_mft_record < 0xe1 || - (u8)b->clusters_per_mft_record > 0xf7) - switch (b->clusters_per_mft_record) { - case 1: case 2: case 4: case 8: case 16: case 32: case 64: - break; - default: - goto not_ntfs; - } - /* Check clusters per index block value is valid. */ - if ((u8)b->clusters_per_index_record < 0xe1 || - (u8)b->clusters_per_index_record > 0xf7) - switch (b->clusters_per_index_record) { - case 1: case 2: case 4: case 8: case 16: case 32: case 64: - break; - default: - goto not_ntfs; - } - /* - * Check for valid end of sector marker. We will work without it, but - * many BIOSes will refuse to boot from a bootsector if the magic is - * incorrect, so we emit a warning. - */ - if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55)) - ntfs_warning(sb, "Invalid end of sector marker."); - return true; -not_ntfs: - return false; -} - -/** - * read_ntfs_boot_sector - read the NTFS boot sector of a device - * @sb: super block of device to read the boot sector from - * @silent: if true, suppress all output - * - * Reads the boot sector from the device and validates it. If that fails, tries - * to read the backup boot sector, first from the end of the device a-la NT4 and - * later and then from the middle of the device a-la NT3.51 and before. 
- * - * If a valid boot sector is found but it is not the primary boot sector, we - * repair the primary boot sector silently (unless the device is read-only or - * the primary boot sector is not accessible). - * - * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super - * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized - * to their respective values. - * - * Return the unlocked buffer head containing the boot sector or NULL on error. - */ -static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb, - const int silent) -{ - const char *read_err_str = "Unable to read %s boot sector."; - struct buffer_head *bh_primary, *bh_backup; - sector_t nr_blocks = NTFS_SB(sb)->nr_blocks; - - /* Try to read primary boot sector. */ - if ((bh_primary = sb_bread(sb, 0))) { - if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) - bh_primary->b_data, silent)) - return bh_primary; - if (!silent) - ntfs_error(sb, "Primary boot sector is invalid."); - } else if (!silent) - ntfs_error(sb, read_err_str, "primary"); - if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) { - if (bh_primary) - brelse(bh_primary); - if (!silent) - ntfs_error(sb, "Mount option errors=recover not used. " - "Aborting without trying to recover."); - return NULL; - } - /* Try to read NT4+ backup boot sector. */ - if ((bh_backup = sb_bread(sb, nr_blocks - 1))) { - if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) - bh_backup->b_data, silent)) - goto hotfix_primary_boot_sector; - brelse(bh_backup); - } else if (!silent) - ntfs_error(sb, read_err_str, "backup"); - /* Try to read NT3.51- backup boot sector. */ - if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) { - if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) - bh_backup->b_data, silent)) - goto hotfix_primary_boot_sector; - if (!silent) - ntfs_error(sb, "Could not find a valid backup boot " - "sector."); - brelse(bh_backup); - } else if (!silent) - ntfs_error(sb, read_err_str, "backup"); - /* We failed. 
Cleanup and return. */ - if (bh_primary) - brelse(bh_primary); - return NULL; -hotfix_primary_boot_sector: - if (bh_primary) { - /* - * If we managed to read sector zero and the volume is not - * read-only, copy the found, valid backup boot sector to the - * primary boot sector. Note we only copy the actual boot - * sector structure, not the actual whole device sector as that - * may be bigger and would potentially damage the $Boot system - * file (FIXME: Would be nice to know if the backup boot sector - * on a large sector device contains the whole boot loader or - * just the first 512 bytes). - */ - if (!sb_rdonly(sb)) { - ntfs_warning(sb, "Hot-fix: Recovering invalid primary " - "boot sector from backup copy."); - memcpy(bh_primary->b_data, bh_backup->b_data, - NTFS_BLOCK_SIZE); - mark_buffer_dirty(bh_primary); - sync_dirty_buffer(bh_primary); - if (buffer_uptodate(bh_primary)) { - brelse(bh_backup); - return bh_primary; - } - ntfs_error(sb, "Hot-fix: Device write error while " - "recovering primary boot sector."); - } else { - ntfs_warning(sb, "Hot-fix: Recovery of primary boot " - "sector failed: Read-only mount."); - } - brelse(bh_primary); - } - ntfs_warning(sb, "Using backup boot sector."); - return bh_backup; -} - -/** - * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol - * @vol: volume structure to initialise with data from boot sector - * @b: boot sector to parse - * - * Parse the ntfs boot sector @b and store all imporant information therein in - * the ntfs super block @vol. Return 'true' on success and 'false' on error. 
- */ -static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) -{ - unsigned int sectors_per_cluster_bits, nr_hidden_sects; - int clusters_per_mft_record, clusters_per_index_record; - s64 ll; - - vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector); - vol->sector_size_bits = ffs(vol->sector_size) - 1; - ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size, - vol->sector_size); - ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits, - vol->sector_size_bits); - if (vol->sector_size < vol->sb->s_blocksize) { - ntfs_error(vol->sb, "Sector size (%i) is smaller than the " - "device block size (%lu). This is not " - "supported. Sorry.", vol->sector_size, - vol->sb->s_blocksize); - return false; - } - ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster); - sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1; - ntfs_debug("sectors_per_cluster_bits = 0x%x", - sectors_per_cluster_bits); - nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors); - ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects); - vol->cluster_size = vol->sector_size << sectors_per_cluster_bits; - vol->cluster_size_mask = vol->cluster_size - 1; - vol->cluster_size_bits = ffs(vol->cluster_size) - 1; - ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size, - vol->cluster_size); - ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask); - ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits); - if (vol->cluster_size < vol->sector_size) { - ntfs_error(vol->sb, "Cluster size (%i) is smaller than the " - "sector size (%i). This is not supported. 
" - "Sorry.", vol->cluster_size, vol->sector_size); - return false; - } - clusters_per_mft_record = b->clusters_per_mft_record; - ntfs_debug("clusters_per_mft_record = %i (0x%x)", - clusters_per_mft_record, clusters_per_mft_record); - if (clusters_per_mft_record > 0) - vol->mft_record_size = vol->cluster_size << - (ffs(clusters_per_mft_record) - 1); - else - /* - * When mft_record_size < cluster_size, clusters_per_mft_record - * = -log2(mft_record_size) bytes. mft_record_size normaly is - * 1024 bytes, which is encoded as 0xF6 (-10 in decimal). - */ - vol->mft_record_size = 1 << -clusters_per_mft_record; - vol->mft_record_size_mask = vol->mft_record_size - 1; - vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1; - ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size, - vol->mft_record_size); - ntfs_debug("vol->mft_record_size_mask = 0x%x", - vol->mft_record_size_mask); - ntfs_debug("vol->mft_record_size_bits = %i (0x%x)", - vol->mft_record_size_bits, vol->mft_record_size_bits); - /* - * We cannot support mft record sizes above the PAGE_SIZE since - * we store $MFT/$DATA, the table of mft records in the page cache. - */ - if (vol->mft_record_size > PAGE_SIZE) { - ntfs_error(vol->sb, "Mft record size (%i) exceeds the " - "PAGE_SIZE on your system (%lu). " - "This is not supported. Sorry.", - vol->mft_record_size, PAGE_SIZE); - return false; - } - /* We cannot support mft record sizes below the sector size. */ - if (vol->mft_record_size < vol->sector_size) { - ntfs_error(vol->sb, "Mft record size (%i) is smaller than the " - "sector size (%i). This is not supported. 
" - "Sorry.", vol->mft_record_size, - vol->sector_size); - return false; - } - clusters_per_index_record = b->clusters_per_index_record; - ntfs_debug("clusters_per_index_record = %i (0x%x)", - clusters_per_index_record, clusters_per_index_record); - if (clusters_per_index_record > 0) - vol->index_record_size = vol->cluster_size << - (ffs(clusters_per_index_record) - 1); - else - /* - * When index_record_size < cluster_size, - * clusters_per_index_record = -log2(index_record_size) bytes. - * index_record_size normaly equals 4096 bytes, which is - * encoded as 0xF4 (-12 in decimal). - */ - vol->index_record_size = 1 << -clusters_per_index_record; - vol->index_record_size_mask = vol->index_record_size - 1; - vol->index_record_size_bits = ffs(vol->index_record_size) - 1; - ntfs_debug("vol->index_record_size = %i (0x%x)", - vol->index_record_size, vol->index_record_size); - ntfs_debug("vol->index_record_size_mask = 0x%x", - vol->index_record_size_mask); - ntfs_debug("vol->index_record_size_bits = %i (0x%x)", - vol->index_record_size_bits, - vol->index_record_size_bits); - /* We cannot support index record sizes below the sector size. */ - if (vol->index_record_size < vol->sector_size) { - ntfs_error(vol->sb, "Index record size (%i) is smaller than " - "the sector size (%i). This is not " - "supported. Sorry.", vol->index_record_size, - vol->sector_size); - return false; - } - /* - * Get the size of the volume in clusters and check for 64-bit-ness. - * Windows currently only uses 32 bits to save the clusters so we do - * the same as it is much faster on 32-bit CPUs. - */ - ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits; - if ((u64)ll >= 1ULL << 32) { - ntfs_error(vol->sb, "Cannot handle 64-bit clusters. Sorry."); - return false; - } - vol->nr_clusters = ll; - ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters); - /* - * On an architecture where unsigned long is 32-bits, we restrict the - * volume size to 2TiB (2^41). 
On a 64-bit architecture, the compiler - * will hopefully optimize the whole check away. - */ - if (sizeof(unsigned long) < 8) { - if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) { - ntfs_error(vol->sb, "Volume size (%lluTiB) is too " - "large for this architecture. " - "Maximum supported is 2TiB. Sorry.", - (unsigned long long)ll >> (40 - - vol->cluster_size_bits)); - return false; - } - } - ll = sle64_to_cpu(b->mft_lcn); - if (ll >= vol->nr_clusters) { - ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of " - "volume. Weird.", (unsigned long long)ll, - (unsigned long long)ll); - return false; - } - vol->mft_lcn = ll; - ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn); - ll = sle64_to_cpu(b->mftmirr_lcn); - if (ll >= vol->nr_clusters) { - ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end " - "of volume. Weird.", (unsigned long long)ll, - (unsigned long long)ll); - return false; - } - vol->mftmirr_lcn = ll; - ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn); -#ifdef NTFS_RW - /* - * Work out the size of the mft mirror in number of mft records. If the - * cluster size is less than or equal to the size taken by four mft - * records, the mft mirror stores the first four mft records. If the - * cluster size is bigger than the size taken by four mft records, the - * mft mirror contains as many mft records as will fit into one - * cluster. 
- */ - if (vol->cluster_size <= (4 << vol->mft_record_size_bits)) - vol->mftmirr_size = 4; - else - vol->mftmirr_size = vol->cluster_size >> - vol->mft_record_size_bits; - ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size); -#endif /* NTFS_RW */ - vol->serial_no = le64_to_cpu(b->volume_serial_number); - ntfs_debug("vol->serial_no = 0x%llx", - (unsigned long long)vol->serial_no); - return true; -} - -/** - * ntfs_setup_allocators - initialize the cluster and mft allocators - * @vol: volume structure for which to setup the allocators - * - * Setup the cluster (lcn) and mft allocators to the starting values. - */ -static void ntfs_setup_allocators(ntfs_volume *vol) -{ -#ifdef NTFS_RW - LCN mft_zone_size, mft_lcn; -#endif /* NTFS_RW */ - - ntfs_debug("vol->mft_zone_multiplier = 0x%x", - vol->mft_zone_multiplier); -#ifdef NTFS_RW - /* Determine the size of the MFT zone. */ - mft_zone_size = vol->nr_clusters; - switch (vol->mft_zone_multiplier) { /* % of volume size in clusters */ - case 4: - mft_zone_size >>= 1; /* 50% */ - break; - case 3: - mft_zone_size = (mft_zone_size + - (mft_zone_size >> 1)) >> 2; /* 37.5% */ - break; - case 2: - mft_zone_size >>= 2; /* 25% */ - break; - /* case 1: */ - default: - mft_zone_size >>= 3; /* 12.5% */ - break; - } - /* Setup the mft zone. */ - vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn; - ntfs_debug("vol->mft_zone_pos = 0x%llx", - (unsigned long long)vol->mft_zone_pos); - /* - * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs - * source) and if the actual mft_lcn is in the expected place or even - * further to the front of the volume, extend the mft_zone to cover the - * beginning of the volume as well. This is in order to protect the - * area reserved for the mft bitmap as well within the mft_zone itself. - * On non-standard volumes we do not protect it as the overhead would - * be higher than the speed increase we would get by doing it. 
- */ - mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size; - if (mft_lcn * vol->cluster_size < 16 * 1024) - mft_lcn = (16 * 1024 + vol->cluster_size - 1) / - vol->cluster_size; - if (vol->mft_zone_start <= mft_lcn) - vol->mft_zone_start = 0; - ntfs_debug("vol->mft_zone_start = 0x%llx", - (unsigned long long)vol->mft_zone_start); - /* - * Need to cap the mft zone on non-standard volumes so that it does - * not point outside the boundaries of the volume. We do this by - * halving the zone size until we are inside the volume. - */ - vol->mft_zone_end = vol->mft_lcn + mft_zone_size; - while (vol->mft_zone_end >= vol->nr_clusters) { - mft_zone_size >>= 1; - vol->mft_zone_end = vol->mft_lcn + mft_zone_size; - } - ntfs_debug("vol->mft_zone_end = 0x%llx", - (unsigned long long)vol->mft_zone_end); - /* - * Set the current position within each data zone to the start of the - * respective zone. - */ - vol->data1_zone_pos = vol->mft_zone_end; - ntfs_debug("vol->data1_zone_pos = 0x%llx", - (unsigned long long)vol->data1_zone_pos); - vol->data2_zone_pos = 0; - ntfs_debug("vol->data2_zone_pos = 0x%llx", - (unsigned long long)vol->data2_zone_pos); - - /* Set the mft data allocation position to mft record 24. */ - vol->mft_data_pos = 24; - ntfs_debug("vol->mft_data_pos = 0x%llx", - (unsigned long long)vol->mft_data_pos); -#endif /* NTFS_RW */ -} - -#ifdef NTFS_RW - -/** - * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume - * @vol: ntfs super block describing device whose mft mirror to load - * - * Return 'true' on success or 'false' on error. - */ -static bool load_and_init_mft_mirror(ntfs_volume *vol) -{ - struct inode *tmp_ino; - ntfs_inode *tmp_ni; - - ntfs_debug("Entering."); - /* Get mft mirror inode. */ - tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr); - if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { - if (!IS_ERR(tmp_ino)) - iput(tmp_ino); - /* Caller will display error message. 
*/ - return false; - } - /* - * Re-initialize some specifics about $MFTMirr's inode as - * ntfs_read_inode() will have set up the default ones. - */ - /* Set uid and gid to root. */ - tmp_ino->i_uid = GLOBAL_ROOT_UID; - tmp_ino->i_gid = GLOBAL_ROOT_GID; - /* Regular file. No access for anyone. */ - tmp_ino->i_mode = S_IFREG; - /* No VFS initiated operations allowed for $MFTMirr. */ - tmp_ino->i_op = &ntfs_empty_inode_ops; - tmp_ino->i_fop = &ntfs_empty_file_ops; - /* Put in our special address space operations. */ - tmp_ino->i_mapping->a_ops = &ntfs_mst_aops; - tmp_ni = NTFS_I(tmp_ino); - /* The $MFTMirr, like the $MFT is multi sector transfer protected. */ - NInoSetMstProtected(tmp_ni); - NInoSetSparseDisabled(tmp_ni); - /* - * Set up our little cheat allowing us to reuse the async read io - * completion handler for directories. - */ - tmp_ni->itype.index.block_size = vol->mft_record_size; - tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits; - vol->mftmirr_ino = tmp_ino; - ntfs_debug("Done."); - return true; -} - -/** - * check_mft_mirror - compare contents of the mft mirror with the mft - * @vol: ntfs super block describing device whose mft mirror to check - * - * Return 'true' on success or 'false' on error. - * - * Note, this function also results in the mft mirror runlist being completely - * mapped into memory. The mft mirror write code requires this and will BUG() - * should it find an unmapped runlist element. - */ -static bool check_mft_mirror(ntfs_volume *vol) -{ - struct super_block *sb = vol->sb; - ntfs_inode *mirr_ni; - struct page *mft_page, *mirr_page; - u8 *kmft, *kmirr; - runlist_element *rl, rl2[2]; - pgoff_t index; - int mrecs_per_page, i; - - ntfs_debug("Entering."); - /* Compare contents of $MFT and $MFTMirr. 
*/ - mrecs_per_page = PAGE_SIZE / vol->mft_record_size; - BUG_ON(!mrecs_per_page); - BUG_ON(!vol->mftmirr_size); - mft_page = mirr_page = NULL; - kmft = kmirr = NULL; - index = i = 0; - do { - u32 bytes; - - /* Switch pages if necessary. */ - if (!(i % mrecs_per_page)) { - if (index) { - ntfs_unmap_page(mft_page); - ntfs_unmap_page(mirr_page); - } - /* Get the $MFT page. */ - mft_page = ntfs_map_page(vol->mft_ino->i_mapping, - index); - if (IS_ERR(mft_page)) { - ntfs_error(sb, "Failed to read $MFT."); - return false; - } - kmft = page_address(mft_page); - /* Get the $MFTMirr page. */ - mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping, - index); - if (IS_ERR(mirr_page)) { - ntfs_error(sb, "Failed to read $MFTMirr."); - goto mft_unmap_out; - } - kmirr = page_address(mirr_page); - ++index; - } - /* Do not check the record if it is not in use. */ - if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) { - /* Make sure the record is ok. */ - if (ntfs_is_baad_recordp((le32*)kmft)) { - ntfs_error(sb, "Incomplete multi sector " - "transfer detected in mft " - "record %i.", i); -mm_unmap_out: - ntfs_unmap_page(mirr_page); -mft_unmap_out: - ntfs_unmap_page(mft_page); - return false; - } - } - /* Do not check the mirror record if it is not in use. */ - if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) { - if (ntfs_is_baad_recordp((le32*)kmirr)) { - ntfs_error(sb, "Incomplete multi sector " - "transfer detected in mft " - "mirror record %i.", i); - goto mm_unmap_out; - } - } - /* Get the amount of data in the current record. */ - bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use); - if (bytes < sizeof(MFT_RECORD_OLD) || - bytes > vol->mft_record_size || - ntfs_is_baad_recordp((le32*)kmft)) { - bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use); - if (bytes < sizeof(MFT_RECORD_OLD) || - bytes > vol->mft_record_size || - ntfs_is_baad_recordp((le32*)kmirr)) - bytes = vol->mft_record_size; - } - /* Compare the two records. 
*/ - if (memcmp(kmft, kmirr, bytes)) { - ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not " - "match. Run ntfsfix or chkdsk.", i); - goto mm_unmap_out; - } - kmft += vol->mft_record_size; - kmirr += vol->mft_record_size; - } while (++i < vol->mftmirr_size); - /* Release the last pages. */ - ntfs_unmap_page(mft_page); - ntfs_unmap_page(mirr_page); - - /* Construct the mft mirror runlist by hand. */ - rl2[0].vcn = 0; - rl2[0].lcn = vol->mftmirr_lcn; - rl2[0].length = (vol->mftmirr_size * vol->mft_record_size + - vol->cluster_size - 1) / vol->cluster_size; - rl2[1].vcn = rl2[0].length; - rl2[1].lcn = LCN_ENOENT; - rl2[1].length = 0; - /* - * Because we have just read all of the mft mirror, we know we have - * mapped the full runlist for it. - */ - mirr_ni = NTFS_I(vol->mftmirr_ino); - down_read(&mirr_ni->runlist.lock); - rl = mirr_ni->runlist.rl; - /* Compare the two runlists. They must be identical. */ - i = 0; - do { - if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn || - rl2[i].length != rl[i].length) { - ntfs_error(sb, "$MFTMirr location mismatch. " - "Run chkdsk."); - up_read(&mirr_ni->runlist.lock); - return false; - } - } while (rl2[i++].length); - up_read(&mirr_ni->runlist.lock); - ntfs_debug("Done."); - return true; -} - -/** - * load_and_check_logfile - load and check the logfile inode for a volume - * @vol: ntfs super block describing device whose logfile to load - * - * Return 'true' on success or 'false' on error. - */ -static bool load_and_check_logfile(ntfs_volume *vol, - RESTART_PAGE_HEADER **rp) -{ - struct inode *tmp_ino; - - ntfs_debug("Entering."); - tmp_ino = ntfs_iget(vol->sb, FILE_LogFile); - if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { - if (!IS_ERR(tmp_ino)) - iput(tmp_ino); - /* Caller will display error message. */ - return false; - } - if (!ntfs_check_logfile(tmp_ino, rp)) { - iput(tmp_ino); - /* ntfs_check_logfile() will have displayed error output. 
*/ - return false; - } - NInoSetSparseDisabled(NTFS_I(tmp_ino)); - vol->logfile_ino = tmp_ino; - ntfs_debug("Done."); - return true; -} - -#define NTFS_HIBERFIL_HEADER_SIZE 4096 - -/** - * check_windows_hibernation_status - check if Windows is suspended on a volume - * @vol: ntfs super block of device to check - * - * Check if Windows is hibernated on the ntfs volume @vol. This is done by - * looking for the file hiberfil.sys in the root directory of the volume. If - * the file is not present Windows is definitely not suspended. - * - * If hiberfil.sys exists and is less than 4kiB in size it means Windows is - * definitely suspended (this volume is not the system volume). Caveat: on a - * system with many volumes it is possible that the < 4kiB check is bogus but - * for now this should do fine. - * - * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the - * hiberfil header (which is the first 4kiB). If this begins with "hibr", - * Windows is definitely suspended. If it is completely full of zeroes, - * Windows is definitely not hibernated. Any other case is treated as if - * Windows is suspended. This caters for the above mentioned caveat of a - * system with many volumes where no "hibr" magic would be present and there is - * no zero header. - * - * Return 0 if Windows is not hibernated on the volume, >0 if Windows is - * hibernated on the volume, and -errno on error. 
- */ -static int check_windows_hibernation_status(ntfs_volume *vol) -{ - MFT_REF mref; - struct inode *vi; - struct page *page; - u32 *kaddr, *kend; - ntfs_name *name = NULL; - int ret = 1; - static const ntfschar hiberfil[13] = { cpu_to_le16('h'), - cpu_to_le16('i'), cpu_to_le16('b'), - cpu_to_le16('e'), cpu_to_le16('r'), - cpu_to_le16('f'), cpu_to_le16('i'), - cpu_to_le16('l'), cpu_to_le16('.'), - cpu_to_le16('s'), cpu_to_le16('y'), - cpu_to_le16('s'), 0 }; - - ntfs_debug("Entering."); - /* - * Find the inode number for the hibernation file by looking up the - * filename hiberfil.sys in the root directory. - */ - inode_lock(vol->root_ino); - mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, - &name); - inode_unlock(vol->root_ino); - if (IS_ERR_MREF(mref)) { - ret = MREF_ERR(mref); - /* If the file does not exist, Windows is not hibernated. */ - if (ret == -ENOENT) { - ntfs_debug("hiberfil.sys not present. Windows is not " - "hibernated on the volume."); - return 0; - } - /* A real error occurred. */ - ntfs_error(vol->sb, "Failed to find inode number for " - "hiberfil.sys."); - return ret; - } - /* We do not care for the type of match that was found. */ - kfree(name); - /* Get the inode. */ - vi = ntfs_iget(vol->sb, MREF(mref)); - if (IS_ERR(vi) || is_bad_inode(vi)) { - if (!IS_ERR(vi)) - iput(vi); - ntfs_error(vol->sb, "Failed to load hiberfil.sys."); - return IS_ERR(vi) ? PTR_ERR(vi) : -EIO; - } - if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) { - ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). " - "Windows is hibernated on the volume. This " - "is not the system volume.", i_size_read(vi)); - goto iput_out; - } - page = ntfs_map_page(vi->i_mapping, 0); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read from hiberfil.sys."); - ret = PTR_ERR(page); - goto iput_out; - } - kaddr = (u32*)page_address(page); - if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) { - ntfs_debug("Magic \"hibr\" found in hiberfil.sys. 
Windows is " - "hibernated on the volume. This is the " - "system volume."); - goto unm_iput_out; - } - kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr); - do { - if (unlikely(*kaddr)) { - ntfs_debug("hiberfil.sys is larger than 4kiB " - "(0x%llx), does not contain the " - "\"hibr\" magic, and does not have a " - "zero header. Windows is hibernated " - "on the volume. This is not the " - "system volume.", i_size_read(vi)); - goto unm_iput_out; - } - } while (++kaddr < kend); - ntfs_debug("hiberfil.sys contains a zero header. Windows is not " - "hibernated on the volume. This is the system " - "volume."); - ret = 0; -unm_iput_out: - ntfs_unmap_page(page); -iput_out: - iput(vi); - return ret; -} - -/** - * load_and_init_quota - load and setup the quota file for a volume if present - * @vol: ntfs super block describing device whose quota file to load - * - * Return 'true' on success or 'false' on error. If $Quota is not present, we - * leave vol->quota_ino as NULL and return success. - */ -static bool load_and_init_quota(ntfs_volume *vol) -{ - MFT_REF mref; - struct inode *tmp_ino; - ntfs_name *name = NULL; - static const ntfschar Quota[7] = { cpu_to_le16('$'), - cpu_to_le16('Q'), cpu_to_le16('u'), - cpu_to_le16('o'), cpu_to_le16('t'), - cpu_to_le16('a'), 0 }; - static ntfschar Q[3] = { cpu_to_le16('$'), - cpu_to_le16('Q'), 0 }; - - ntfs_debug("Entering."); - /* - * Find the inode number for the quota file by looking up the filename - * $Quota in the extended system files directory $Extend. - */ - inode_lock(vol->extend_ino); - mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, - &name); - inode_unlock(vol->extend_ino); - if (IS_ERR_MREF(mref)) { - /* - * If the file does not exist, quotas are disabled and have - * never been enabled on this volume, just return success. - */ - if (MREF_ERR(mref) == -ENOENT) { - ntfs_debug("$Quota not present. 
Volume does not have " - "quotas enabled."); - /* - * No need to try to set quotas out of date if they are - * not enabled. - */ - NVolSetQuotaOutOfDate(vol); - return true; - } - /* A real error occurred. */ - ntfs_error(vol->sb, "Failed to find inode number for $Quota."); - return false; - } - /* We do not care for the type of match that was found. */ - kfree(name); - /* Get the inode. */ - tmp_ino = ntfs_iget(vol->sb, MREF(mref)); - if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { - if (!IS_ERR(tmp_ino)) - iput(tmp_ino); - ntfs_error(vol->sb, "Failed to load $Quota."); - return false; - } - vol->quota_ino = tmp_ino; - /* Get the $Q index allocation attribute. */ - tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2); - if (IS_ERR(tmp_ino)) { - ntfs_error(vol->sb, "Failed to load $Quota/$Q index."); - return false; - } - vol->quota_q_ino = tmp_ino; - ntfs_debug("Done."); - return true; -} - -/** - * load_and_init_usnjrnl - load and setup the transaction log if present - * @vol: ntfs super block describing device whose usnjrnl file to load - * - * Return 'true' on success or 'false' on error. - * - * If $UsnJrnl is not present or in the process of being disabled, we set - * NVolUsnJrnlStamped() and return success. - * - * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn, - * i.e. transaction logging has only just been enabled or the journal has been - * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped() - * and return success. 
- */ -static bool load_and_init_usnjrnl(ntfs_volume *vol) -{ - MFT_REF mref; - struct inode *tmp_ino; - ntfs_inode *tmp_ni; - struct page *page; - ntfs_name *name = NULL; - USN_HEADER *uh; - static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'), - cpu_to_le16('U'), cpu_to_le16('s'), - cpu_to_le16('n'), cpu_to_le16('J'), - cpu_to_le16('r'), cpu_to_le16('n'), - cpu_to_le16('l'), 0 }; - static ntfschar Max[5] = { cpu_to_le16('$'), - cpu_to_le16('M'), cpu_to_le16('a'), - cpu_to_le16('x'), 0 }; - static ntfschar J[3] = { cpu_to_le16('$'), - cpu_to_le16('J'), 0 }; - - ntfs_debug("Entering."); - /* - * Find the inode number for the transaction log file by looking up the - * filename $UsnJrnl in the extended system files directory $Extend. - */ - inode_lock(vol->extend_ino); - mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, - &name); - inode_unlock(vol->extend_ino); - if (IS_ERR_MREF(mref)) { - /* - * If the file does not exist, transaction logging is disabled, - * just return success. - */ - if (MREF_ERR(mref) == -ENOENT) { - ntfs_debug("$UsnJrnl not present. Volume does not " - "have transaction logging enabled."); -not_enabled: - /* - * No need to try to stamp the transaction log if - * transaction logging is not enabled. - */ - NVolSetUsnJrnlStamped(vol); - return true; - } - /* A real error occurred. */ - ntfs_error(vol->sb, "Failed to find inode number for " - "$UsnJrnl."); - return false; - } - /* We do not care for the type of match that was found. */ - kfree(name); - /* Get the inode. */ - tmp_ino = ntfs_iget(vol->sb, MREF(mref)); - if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) { - if (!IS_ERR(tmp_ino)) - iput(tmp_ino); - ntfs_error(vol->sb, "Failed to load $UsnJrnl."); - return false; - } - vol->usnjrnl_ino = tmp_ino; - /* - * If the transaction log is in the process of being deleted, we can - * ignore it. - */ - if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) { - ntfs_debug("$UsnJrnl in the process of being disabled. 
" - "Volume does not have transaction logging " - "enabled."); - goto not_enabled; - } - /* Get the $DATA/$Max attribute. */ - tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4); - if (IS_ERR(tmp_ino)) { - ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max " - "attribute."); - return false; - } - vol->usnjrnl_max_ino = tmp_ino; - if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) { - ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max " - "attribute (size is 0x%llx but should be at " - "least 0x%zx bytes).", i_size_read(tmp_ino), - sizeof(USN_HEADER)); - return false; - } - /* Get the $DATA/$J attribute. */ - tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2); - if (IS_ERR(tmp_ino)) { - ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J " - "attribute."); - return false; - } - vol->usnjrnl_j_ino = tmp_ino; - /* Verify $J is non-resident and sparse. */ - tmp_ni = NTFS_I(vol->usnjrnl_j_ino); - if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) { - ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident " - "and/or not sparse."); - return false; - } - /* Read the USN_HEADER from $DATA/$Max. */ - page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max " - "attribute."); - return false; - } - uh = (USN_HEADER*)page_address(page); - /* Sanity check the $Max. */ - if (unlikely(sle64_to_cpu(uh->allocation_delta) > - sle64_to_cpu(uh->maximum_size))) { - ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds " - "maximum size (0x%llx). $UsnJrnl is corrupt.", - (long long)sle64_to_cpu(uh->allocation_delta), - (long long)sle64_to_cpu(uh->maximum_size)); - ntfs_unmap_page(page); - return false; - } - /* - * If the transaction log has been stamped and nothing has been written - * to it since, we do not need to stamp it. 
- */ - if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >= - i_size_read(vol->usnjrnl_j_ino))) { - if (likely(sle64_to_cpu(uh->lowest_valid_usn) == - i_size_read(vol->usnjrnl_j_ino))) { - ntfs_unmap_page(page); - ntfs_debug("$UsnJrnl is enabled but nothing has been " - "logged since it was last stamped. " - "Treating this as if the volume does " - "not have transaction logging " - "enabled."); - goto not_enabled; - } - ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) " - "which is out of bounds (0x%llx). $UsnJrnl " - "is corrupt.", - (long long)sle64_to_cpu(uh->lowest_valid_usn), - i_size_read(vol->usnjrnl_j_ino)); - ntfs_unmap_page(page); - return false; - } - ntfs_unmap_page(page); - ntfs_debug("Done."); - return true; -} - -/** - * load_and_init_attrdef - load the attribute definitions table for a volume - * @vol: ntfs super block describing device whose attrdef to load - * - * Return 'true' on success or 'false' on error. - */ -static bool load_and_init_attrdef(ntfs_volume *vol) -{ - loff_t i_size; - struct super_block *sb = vol->sb; - struct inode *ino; - struct page *page; - pgoff_t index, max_index; - unsigned int size; - - ntfs_debug("Entering."); - /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */ - ino = ntfs_iget(sb, FILE_AttrDef); - if (IS_ERR(ino) || is_bad_inode(ino)) { - if (!IS_ERR(ino)) - iput(ino); - goto failed; - } - NInoSetSparseDisabled(NTFS_I(ino)); - /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */ - i_size = i_size_read(ino); - if (i_size <= 0 || i_size > 0x7fffffff) - goto iput_failed; - vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size); - if (!vol->attrdef) - goto iput_failed; - index = 0; - max_index = i_size >> PAGE_SHIFT; - size = PAGE_SIZE; - while (index < max_index) { - /* Read the attrdef table and copy it into the linear buffer. 
*/ -read_partial_attrdef_page: - page = ntfs_map_page(ino->i_mapping, index); - if (IS_ERR(page)) - goto free_iput_failed; - memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT), - page_address(page), size); - ntfs_unmap_page(page); - } - if (size == PAGE_SIZE) { - size = i_size & ~PAGE_MASK; - if (size) - goto read_partial_attrdef_page; - } - vol->attrdef_size = i_size; - ntfs_debug("Read %llu bytes from $AttrDef.", i_size); - iput(ino); - return true; -free_iput_failed: - ntfs_free(vol->attrdef); - vol->attrdef = NULL; -iput_failed: - iput(ino); -failed: - ntfs_error(sb, "Failed to initialize attribute definition table."); - return false; -} - -#endif /* NTFS_RW */ - -/** - * load_and_init_upcase - load the upcase table for an ntfs volume - * @vol: ntfs super block describing device whose upcase to load - * - * Return 'true' on success or 'false' on error. - */ -static bool load_and_init_upcase(ntfs_volume *vol) -{ - loff_t i_size; - struct super_block *sb = vol->sb; - struct inode *ino; - struct page *page; - pgoff_t index, max_index; - unsigned int size; - int i, max; - - ntfs_debug("Entering."); - /* Read upcase table and setup vol->upcase and vol->upcase_len. */ - ino = ntfs_iget(sb, FILE_UpCase); - if (IS_ERR(ino) || is_bad_inode(ino)) { - if (!IS_ERR(ino)) - iput(ino); - goto upcase_failed; - } - /* - * The upcase size must not be above 64k Unicode characters, must not - * be zero and must be a multiple of sizeof(ntfschar). - */ - i_size = i_size_read(ino); - if (!i_size || i_size & (sizeof(ntfschar) - 1) || - i_size > 64ULL * 1024 * sizeof(ntfschar)) - goto iput_upcase_failed; - vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size); - if (!vol->upcase) - goto iput_upcase_failed; - index = 0; - max_index = i_size >> PAGE_SHIFT; - size = PAGE_SIZE; - while (index < max_index) { - /* Read the upcase table and copy it into the linear buffer. 
*/ -read_partial_upcase_page: - page = ntfs_map_page(ino->i_mapping, index); - if (IS_ERR(page)) - goto iput_upcase_failed; - memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT), - page_address(page), size); - ntfs_unmap_page(page); - } - if (size == PAGE_SIZE) { - size = i_size & ~PAGE_MASK; - if (size) - goto read_partial_upcase_page; - } - vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS; - ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).", - i_size, 64 * 1024 * sizeof(ntfschar)); - iput(ino); - mutex_lock(&ntfs_lock); - if (!default_upcase) { - ntfs_debug("Using volume specified $UpCase since default is " - "not present."); - mutex_unlock(&ntfs_lock); - return true; - } - max = default_upcase_len; - if (max > vol->upcase_len) - max = vol->upcase_len; - for (i = 0; i < max; i++) - if (vol->upcase[i] != default_upcase[i]) - break; - if (i == max) { - ntfs_free(vol->upcase); - vol->upcase = default_upcase; - vol->upcase_len = max; - ntfs_nr_upcase_users++; - mutex_unlock(&ntfs_lock); - ntfs_debug("Volume specified $UpCase matches default. Using " - "default."); - return true; - } - mutex_unlock(&ntfs_lock); - ntfs_debug("Using volume specified $UpCase since it does not match " - "the default."); - return true; -iput_upcase_failed: - iput(ino); - ntfs_free(vol->upcase); - vol->upcase = NULL; -upcase_failed: - mutex_lock(&ntfs_lock); - if (default_upcase) { - vol->upcase = default_upcase; - vol->upcase_len = default_upcase_len; - ntfs_nr_upcase_users++; - mutex_unlock(&ntfs_lock); - ntfs_error(sb, "Failed to load $UpCase from the volume. 
Using " - "default."); - return true; - } - mutex_unlock(&ntfs_lock); - ntfs_error(sb, "Failed to initialize upcase table."); - return false; -} - -/* - * The lcn and mft bitmap inodes are NTFS-internal inodes with - * their own special locking rules: - */ -static struct lock_class_key - lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key, - mftbmp_runlist_lock_key, mftbmp_mrec_lock_key; - -/** - * load_system_files - open the system files using normal functions - * @vol: ntfs super block describing device whose system files to load - * - * Open the system files with normal access functions and complete setting up - * the ntfs super block @vol. - * - * Return 'true' on success or 'false' on error. - */ -static bool load_system_files(ntfs_volume *vol) -{ - struct super_block *sb = vol->sb; - MFT_RECORD *m; - VOLUME_INFORMATION *vi; - ntfs_attr_search_ctx *ctx; -#ifdef NTFS_RW - RESTART_PAGE_HEADER *rp; - int err; -#endif /* NTFS_RW */ - - ntfs_debug("Entering."); -#ifdef NTFS_RW - /* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */ - if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) { - static const char *es1 = "Failed to load $MFTMirr"; - static const char *es2 = "$MFTMirr does not match $MFT"; - static const char *es3 = ". Run ntfsfix and/or chkdsk."; - - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - !vol->mftmirr_ino ? es1 : es2, - es3); - goto iput_mirr_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", - !vol->mftmirr_ino ? es1 : es2, es3); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", - !vol->mftmirr_ino ? es1 : es2, es3); - /* This will prevent a read-write remount. 
*/ - NVolSetErrors(vol); - } -#endif /* NTFS_RW */ - /* Get mft bitmap attribute inode. */ - vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0); - if (IS_ERR(vol->mftbmp_ino)) { - ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute."); - goto iput_mirr_err_out; - } - lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock, - &mftbmp_runlist_lock_key); - lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock, - &mftbmp_mrec_lock_key); - /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */ - if (!load_and_init_upcase(vol)) - goto iput_mftbmp_err_out; -#ifdef NTFS_RW - /* - * Read attribute definitions table and setup @vol->attrdef and - * @vol->attrdef_size. - */ - if (!load_and_init_attrdef(vol)) - goto iput_upcase_err_out; -#endif /* NTFS_RW */ - /* - * Get the cluster allocation bitmap inode and verify the size, no - * need for any locking at this stage as we are already running - * exclusively as we are mount in progress task. - */ - vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap); - if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) { - if (!IS_ERR(vol->lcnbmp_ino)) - iput(vol->lcnbmp_ino); - goto bitmap_failed; - } - lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock, - &lcnbmp_runlist_lock_key); - lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock, - &lcnbmp_mrec_lock_key); - - NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino)); - if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) { - iput(vol->lcnbmp_ino); -bitmap_failed: - ntfs_error(sb, "Failed to load $Bitmap."); - goto iput_attrdef_err_out; - } - /* - * Get the volume inode and setup our cache of the volume flags and - * version. 
- */ - vol->vol_ino = ntfs_iget(sb, FILE_Volume); - if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) { - if (!IS_ERR(vol->vol_ino)) - iput(vol->vol_ino); -volume_failed: - ntfs_error(sb, "Failed to load $Volume."); - goto iput_lcnbmp_err_out; - } - m = map_mft_record(NTFS_I(vol->vol_ino)); - if (IS_ERR(m)) { -iput_volume_failed: - iput(vol->vol_ino); - goto volume_failed; - } - if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) { - ntfs_error(sb, "Failed to get attribute search context."); - goto get_ctx_vol_failed; - } - if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, - ctx) || ctx->attr->non_resident || ctx->attr->flags) { -err_put_vol: - ntfs_attr_put_search_ctx(ctx); -get_ctx_vol_failed: - unmap_mft_record(NTFS_I(vol->vol_ino)); - goto iput_volume_failed; - } - vi = (VOLUME_INFORMATION*)((char*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - /* Some bounds checks. */ - if ((u8*)vi < (u8*)ctx->attr || (u8*)vi + - le32_to_cpu(ctx->attr->data.resident.value_length) > - (u8*)ctx->attr + le32_to_cpu(ctx->attr->length)) - goto err_put_vol; - /* Copy the volume flags and version to the ntfs_volume structure. */ - vol->vol_flags = vi->flags; - vol->major_ver = vi->major_ver; - vol->minor_ver = vi->minor_ver; - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(NTFS_I(vol->vol_ino)); - pr_info("volume version %i.%i.\n", vol->major_ver, - vol->minor_ver); - if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { - ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " - "volume version %i.%i (need at least version " - "3.0).", vol->major_ver, vol->minor_ver); - NVolClearSparseEnabled(vol); - } -#ifdef NTFS_RW - /* Make sure that no unsupported volume flags are set. 
*/ - if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { - static const char *es1a = "Volume is dirty"; - static const char *es1b = "Volume has been modified by chkdsk"; - static const char *es1c = "Volume has unsupported flags set"; - static const char *es2a = ". Run chkdsk and mount in Windows."; - static const char *es2b = ". Mount in Windows."; - const char *es1, *es2; - - es2 = es2a; - if (vol->vol_flags & VOLUME_IS_DIRTY) - es1 = es1a; - else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { - es1 = es1b; - es2 = es2b; - } else { - es1 = es1c; - ntfs_warning(sb, "Unsupported volume flags 0x%x " - "encountered.", - (unsigned)le16_to_cpu(vol->vol_flags)); - } - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_vol_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* - * Do not set NVolErrors() because ntfs_remount() re-checks the - * flags which we need to do in case any flags have changed. - */ - } - /* - * Get the inode for the logfile, check it and determine if the volume - * was shutdown cleanly. - */ - rp = NULL; - if (!load_and_check_logfile(vol, &rp) || - !ntfs_is_logfile_clean(vol->logfile_ino, rp)) { - static const char *es1a = "Failed to load $LogFile"; - static const char *es1b = "$LogFile is not clean"; - static const char *es2 = ". Mount in Windows."; - const char *es1; - - es1 = !vol->logfile_ino ? es1a : es1b; - /* If a read-write mount, convert it to a read-only mount. 
*/ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - if (vol->logfile_ino) { - BUG_ON(!rp); - ntfs_free(rp); - } - goto iput_logfile_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } - ntfs_free(rp); -#endif /* NTFS_RW */ - /* Get the root directory inode so we can do path lookups. */ - vol->root_ino = ntfs_iget(sb, FILE_root); - if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) { - if (!IS_ERR(vol->root_ino)) - iput(vol->root_ino); - ntfs_error(sb, "Failed to load root directory."); - goto iput_logfile_err_out; - } -#ifdef NTFS_RW - /* - * Check if Windows is suspended to disk on the target volume. If it - * is hibernated, we must not write *anything* to the disk so set - * NVolErrors() without setting the dirty volume flag and mount - * read-only. This will prevent read-write remounting and it will also - * prevent all writes. - */ - err = check_windows_hibernation_status(vol); - if (unlikely(err)) { - static const char *es1a = "Failed to determine if Windows is " - "hibernated"; - static const char *es1b = "Windows is hibernated"; - static const char *es2 = ". Run chkdsk."; - const char *es1; - - es1 = err < 0 ? es1a : es1b; - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. 
Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } - /* If (still) a read-write mount, mark the volume dirty. */ - if (!sb_rdonly(sb) && ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { - static const char *es1 = "Failed to set dirty bit in volume " - "information flags"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - /* - * Do not set NVolErrors() because ntfs_remount() might manage - * to set the dirty flag in which case all would be well. - */ - } -#if 0 - // TODO: Enable this code once we start modifying anything that is - // different between NTFS 1.2 and 3.x... - /* - * If (still) a read-write mount, set the NT4 compatibility flag on - * newer NTFS version volumes. - */ - if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) && - ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { - static const char *es1 = "Failed to set NT4 compatibility flag"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } -#endif - /* If (still) a read-write mount, empty the logfile. */ - if (!sb_rdonly(sb) && !ntfs_empty_logfile(vol->logfile_ino)) { - static const char *es1 = "Failed to empty $LogFile"; - static const char *es2 = ". Mount in Windows."; - - /* Convert to a read-only mount. 
*/ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } -#endif /* NTFS_RW */ - /* If on NTFS versions before 3.0, we are done. */ - if (unlikely(vol->major_ver < 3)) - return true; - /* NTFS 3.0+ specific initialization. */ - /* Get the security descriptors inode. */ - vol->secure_ino = ntfs_iget(sb, FILE_Secure); - if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) { - if (!IS_ERR(vol->secure_ino)) - iput(vol->secure_ino); - ntfs_error(sb, "Failed to load $Secure."); - goto iput_root_err_out; - } - // TODO: Initialize security. - /* Get the extended system files' directory inode. */ - vol->extend_ino = ntfs_iget(sb, FILE_Extend); - if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) || - !S_ISDIR(vol->extend_ino->i_mode)) { - if (!IS_ERR(vol->extend_ino)) - iput(vol->extend_ino); - ntfs_error(sb, "Failed to load $Extend."); - goto iput_sec_err_out; - } -#ifdef NTFS_RW - /* Find the quota file, load it if present, and set it up. */ - if (!load_and_init_quota(vol)) { - static const char *es1 = "Failed to load $Quota"; - static const char *es2 = ". Run chkdsk."; - - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_quota_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. 
*/ - NVolSetErrors(vol); - } - /* If (still) a read-write mount, mark the quotas out of date. */ - if (!sb_rdonly(sb) && !ntfs_mark_quotas_out_of_date(vol)) { - static const char *es1 = "Failed to mark quotas out of date"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_quota_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } - /* - * Find the transaction log file ($UsnJrnl), load it if present, check - * it, and set it up. - */ - if (!load_and_init_usnjrnl(vol)) { - static const char *es1 = "Failed to load $UsnJrnl"; - static const char *es2 = ". Run chkdsk."; - - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_usnjrnl_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } - /* If (still) a read-write mount, stamp the transaction log. */ - if (!sb_rdonly(sb) && !ntfs_stamp_usnjrnl(vol)) { - static const char *es1 = "Failed to stamp transaction log " - "($UsnJrnl)"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_usnjrnl_err_out; - } - ntfs_error(sb, "%s. 
Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } -#endif /* NTFS_RW */ - return true; -#ifdef NTFS_RW -iput_usnjrnl_err_out: - iput(vol->usnjrnl_j_ino); - iput(vol->usnjrnl_max_ino); - iput(vol->usnjrnl_ino); -iput_quota_err_out: - iput(vol->quota_q_ino); - iput(vol->quota_ino); - iput(vol->extend_ino); -#endif /* NTFS_RW */ -iput_sec_err_out: - iput(vol->secure_ino); -iput_root_err_out: - iput(vol->root_ino); -iput_logfile_err_out: -#ifdef NTFS_RW - iput(vol->logfile_ino); -iput_vol_err_out: -#endif /* NTFS_RW */ - iput(vol->vol_ino); -iput_lcnbmp_err_out: - iput(vol->lcnbmp_ino); -iput_attrdef_err_out: - vol->attrdef_size = 0; - if (vol->attrdef) { - ntfs_free(vol->attrdef); - vol->attrdef = NULL; - } -#ifdef NTFS_RW -iput_upcase_err_out: -#endif /* NTFS_RW */ - vol->upcase_len = 0; - mutex_lock(&ntfs_lock); - if (vol->upcase == default_upcase) { - ntfs_nr_upcase_users--; - vol->upcase = NULL; - } - mutex_unlock(&ntfs_lock); - if (vol->upcase) { - ntfs_free(vol->upcase); - vol->upcase = NULL; - } -iput_mftbmp_err_out: - iput(vol->mftbmp_ino); -iput_mirr_err_out: -#ifdef NTFS_RW - iput(vol->mftmirr_ino); -#endif /* NTFS_RW */ - return false; -} - -/** - * ntfs_put_super - called by the vfs to unmount a volume - * @sb: vfs superblock of volume to unmount - * - * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when - * the volume is being unmounted (umount system call has been invoked) and it - * releases all inodes and memory belonging to the NTFS specific part of the - * super block. - */ -static void ntfs_put_super(struct super_block *sb) -{ - ntfs_volume *vol = NTFS_SB(sb); - - ntfs_debug("Entering."); - -#ifdef NTFS_RW - /* - * Commit all inodes while they are still open in case some of them - * cause others to be dirtied. - */ - ntfs_commit_inode(vol->vol_ino); - - /* NTFS 3.0+ specific. 
*/ - if (vol->major_ver >= 3) { - if (vol->usnjrnl_j_ino) - ntfs_commit_inode(vol->usnjrnl_j_ino); - if (vol->usnjrnl_max_ino) - ntfs_commit_inode(vol->usnjrnl_max_ino); - if (vol->usnjrnl_ino) - ntfs_commit_inode(vol->usnjrnl_ino); - if (vol->quota_q_ino) - ntfs_commit_inode(vol->quota_q_ino); - if (vol->quota_ino) - ntfs_commit_inode(vol->quota_ino); - if (vol->extend_ino) - ntfs_commit_inode(vol->extend_ino); - if (vol->secure_ino) - ntfs_commit_inode(vol->secure_ino); - } - - ntfs_commit_inode(vol->root_ino); - - down_write(&vol->lcnbmp_lock); - ntfs_commit_inode(vol->lcnbmp_ino); - up_write(&vol->lcnbmp_lock); - - down_write(&vol->mftbmp_lock); - ntfs_commit_inode(vol->mftbmp_ino); - up_write(&vol->mftbmp_lock); - - if (vol->logfile_ino) - ntfs_commit_inode(vol->logfile_ino); - - if (vol->mftmirr_ino) - ntfs_commit_inode(vol->mftmirr_ino); - ntfs_commit_inode(vol->mft_ino); - - /* - * If a read-write mount and no volume errors have occurred, mark the - * volume clean. Also, re-commit all affected inodes. - */ - if (!sb_rdonly(sb)) { - if (!NVolErrors(vol)) { - if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) - ntfs_warning(sb, "Failed to clear dirty bit " - "in volume information " - "flags. Run chkdsk."); - ntfs_commit_inode(vol->vol_ino); - ntfs_commit_inode(vol->root_ino); - if (vol->mftmirr_ino) - ntfs_commit_inode(vol->mftmirr_ino); - ntfs_commit_inode(vol->mft_ino); - } else { - ntfs_warning(sb, "Volume has errors. Leaving volume " - "marked dirty. Run chkdsk."); - } - } -#endif /* NTFS_RW */ - - iput(vol->vol_ino); - vol->vol_ino = NULL; - - /* NTFS 3.0+ specific clean up. 
*/ - if (vol->major_ver >= 3) { -#ifdef NTFS_RW - if (vol->usnjrnl_j_ino) { - iput(vol->usnjrnl_j_ino); - vol->usnjrnl_j_ino = NULL; - } - if (vol->usnjrnl_max_ino) { - iput(vol->usnjrnl_max_ino); - vol->usnjrnl_max_ino = NULL; - } - if (vol->usnjrnl_ino) { - iput(vol->usnjrnl_ino); - vol->usnjrnl_ino = NULL; - } - if (vol->quota_q_ino) { - iput(vol->quota_q_ino); - vol->quota_q_ino = NULL; - } - if (vol->quota_ino) { - iput(vol->quota_ino); - vol->quota_ino = NULL; - } -#endif /* NTFS_RW */ - if (vol->extend_ino) { - iput(vol->extend_ino); - vol->extend_ino = NULL; - } - if (vol->secure_ino) { - iput(vol->secure_ino); - vol->secure_ino = NULL; - } - } - - iput(vol->root_ino); - vol->root_ino = NULL; - - down_write(&vol->lcnbmp_lock); - iput(vol->lcnbmp_ino); - vol->lcnbmp_ino = NULL; - up_write(&vol->lcnbmp_lock); - - down_write(&vol->mftbmp_lock); - iput(vol->mftbmp_ino); - vol->mftbmp_ino = NULL; - up_write(&vol->mftbmp_lock); - -#ifdef NTFS_RW - if (vol->logfile_ino) { - iput(vol->logfile_ino); - vol->logfile_ino = NULL; - } - if (vol->mftmirr_ino) { - /* Re-commit the mft mirror and mft just in case. */ - ntfs_commit_inode(vol->mftmirr_ino); - ntfs_commit_inode(vol->mft_ino); - iput(vol->mftmirr_ino); - vol->mftmirr_ino = NULL; - } - /* - * We should have no dirty inodes left, due to - * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as - * the underlying mft records are written out and cleaned. - */ - ntfs_commit_inode(vol->mft_ino); - write_inode_now(vol->mft_ino, 1); -#endif /* NTFS_RW */ - - iput(vol->mft_ino); - vol->mft_ino = NULL; - - /* Throw away the table of attribute definitions. */ - vol->attrdef_size = 0; - if (vol->attrdef) { - ntfs_free(vol->attrdef); - vol->attrdef = NULL; - } - vol->upcase_len = 0; - /* - * Destroy the global default upcase table if necessary. Also decrease - * the number of upcase users if we are a user. 
- */ - mutex_lock(&ntfs_lock); - if (vol->upcase == default_upcase) { - ntfs_nr_upcase_users--; - vol->upcase = NULL; - } - if (!ntfs_nr_upcase_users && default_upcase) { - ntfs_free(default_upcase); - default_upcase = NULL; - } - if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) - free_compression_buffers(); - mutex_unlock(&ntfs_lock); - if (vol->upcase) { - ntfs_free(vol->upcase); - vol->upcase = NULL; - } - - unload_nls(vol->nls_map); - - sb->s_fs_info = NULL; - kfree(vol); -} - -/** - * get_nr_free_clusters - return the number of free clusters on a volume - * @vol: ntfs volume for which to obtain free cluster count - * - * Calculate the number of free clusters on the mounted NTFS volume @vol. We - * actually calculate the number of clusters in use instead because this - * allows us to not care about partial pages as these will be just zero filled - * and hence not be counted as allocated clusters. - * - * The only particularity is that clusters beyond the end of the logical ntfs - * volume will be marked as allocated to prevent errors which means we have to - * discount those at the end. This is important as the cluster bitmap always - * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside - * the logical volume and marked in use when they are not as they do not exist. - * - * If any pages cannot be read we assume all clusters in the erroring pages are - * in use. This means we return an underestimate on errors which is better than - * an overestimate. - */ -static s64 get_nr_free_clusters(ntfs_volume *vol) -{ - s64 nr_free = vol->nr_clusters; - struct address_space *mapping = vol->lcnbmp_ino->i_mapping; - struct page *page; - pgoff_t index, max_index; - - ntfs_debug("Entering."); - /* Serialize accesses to the cluster bitmap. 
*/ - down_read(&vol->lcnbmp_lock); - /* - * Convert the number of bits into bytes rounded up, then convert into - * multiples of PAGE_SIZE, rounding up so that if we have one - * full and one partial page max_index = 2. - */ - max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >> - PAGE_SHIFT; - /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */ - ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", - max_index, PAGE_SIZE / 4); - for (index = 0; index < max_index; index++) { - unsigned long *kaddr; - - /* - * Read the page from page cache, getting it from backing store - * if necessary, and increment the use count. - */ - page = read_mapping_page(mapping, index, NULL); - /* Ignore pages which errored synchronously. */ - if (IS_ERR(page)) { - ntfs_debug("read_mapping_page() error. Skipping " - "page (index 0x%lx).", index); - nr_free -= PAGE_SIZE * 8; - continue; - } - kaddr = kmap_atomic(page); - /* - * Subtract the number of set bits. If this - * is the last page and it is partial we don't really care as - * it just means we do a little extra work but it won't affect - * the result as all out of range bytes are set to zero by - * ntfs_readpage(). - */ - nr_free -= bitmap_weight(kaddr, - PAGE_SIZE * BITS_PER_BYTE); - kunmap_atomic(kaddr); - put_page(page); - } - ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1); - /* - * Fixup for eventual bits outside logical ntfs volume (see function - * description above). - */ - if (vol->nr_clusters & 63) - nr_free += 64 - (vol->nr_clusters & 63); - up_read(&vol->lcnbmp_lock); - /* If errors occurred we may well have gone below zero, fix this. 
*/ - if (nr_free < 0) - nr_free = 0; - ntfs_debug("Exiting."); - return nr_free; -} - -/** - * __get_nr_free_mft_records - return the number of free inodes on a volume - * @vol: ntfs volume for which to obtain free inode count - * @nr_free: number of mft records in filesystem - * @max_index: maximum number of pages containing set bits - * - * Calculate the number of free mft records (inodes) on the mounted NTFS - * volume @vol. We actually calculate the number of mft records in use instead - * because this allows us to not care about partial pages as these will be just - * zero filled and hence not be counted as allocated mft record. - * - * If any pages cannot be read we assume all mft records in the erroring pages - * are in use. This means we return an underestimate on errors which is better - * than an overestimate. - * - * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing. - */ -static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, - s64 nr_free, const pgoff_t max_index) -{ - struct address_space *mapping = vol->mftbmp_ino->i_mapping; - struct page *page; - pgoff_t index; - - ntfs_debug("Entering."); - /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */ - ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " - "0x%lx.", max_index, PAGE_SIZE / 4); - for (index = 0; index < max_index; index++) { - unsigned long *kaddr; - - /* - * Read the page from page cache, getting it from backing store - * if necessary, and increment the use count. - */ - page = read_mapping_page(mapping, index, NULL); - /* Ignore pages which errored synchronously. */ - if (IS_ERR(page)) { - ntfs_debug("read_mapping_page() error. Skipping " - "page (index 0x%lx).", index); - nr_free -= PAGE_SIZE * 8; - continue; - } - kaddr = kmap_atomic(page); - /* - * Subtract the number of set bits. 
If this - * is the last page and it is partial we don't really care as - * it just means we do a little extra work but it won't affect - * the result as all out of range bytes are set to zero by - * ntfs_readpage(). - */ - nr_free -= bitmap_weight(kaddr, - PAGE_SIZE * BITS_PER_BYTE); - kunmap_atomic(kaddr); - put_page(page); - } - ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.", - index - 1); - /* If errors occurred we may well have gone below zero, fix this. */ - if (nr_free < 0) - nr_free = 0; - ntfs_debug("Exiting."); - return nr_free; -} - -/** - * ntfs_statfs - return information about mounted NTFS volume - * @dentry: dentry from mounted volume - * @sfs: statfs structure in which to return the information - * - * Return information about the mounted NTFS volume @dentry in the statfs structure - * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is - * called). We interpret the values to be correct of the moment in time at - * which we are called. Most values are variable otherwise and this isn't just - * the free values but the totals as well. For example we can increase the - * total number of file nodes if we run out and we can keep doing this until - * there is no more space on the volume left at all. - * - * Called from vfs_statfs which is used to handle the statfs, fstatfs, and - * ustat system calls. - * - * Return 0 on success or -errno on error. - */ -static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) -{ - struct super_block *sb = dentry->d_sb; - s64 size; - ntfs_volume *vol = NTFS_SB(sb); - ntfs_inode *mft_ni = NTFS_I(vol->mft_ino); - pgoff_t max_index; - unsigned long flags; - - ntfs_debug("Entering."); - /* Type of filesystem. */ - sfs->f_type = NTFS_SB_MAGIC; - /* Optimal transfer block size. */ - sfs->f_bsize = PAGE_SIZE; - /* - * Total data blocks in filesystem in units of f_bsize and since - * inodes are also stored in data blocs ($MFT is a file) this is just - * the total clusters. 
- */ - sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >> - PAGE_SHIFT; - /* Free data blocks in filesystem in units of f_bsize. */ - size = get_nr_free_clusters(vol) << vol->cluster_size_bits >> - PAGE_SHIFT; - if (size < 0LL) - size = 0LL; - /* Free blocks avail to non-superuser, same as above on NTFS. */ - sfs->f_bavail = sfs->f_bfree = size; - /* Serialize accesses to the inode bitmap. */ - down_read(&vol->mftbmp_lock); - read_lock_irqsave(&mft_ni->size_lock, flags); - size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits; - /* - * Convert the maximum number of set bits into bytes rounded up, then - * convert into multiples of PAGE_SIZE, rounding up so that if we - * have one full and one partial page max_index = 2. - */ - max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits) - + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT; - read_unlock_irqrestore(&mft_ni->size_lock, flags); - /* Number of inodes in filesystem (at this point in time). */ - sfs->f_files = size; - /* Free inodes in fs (based on current total count). */ - sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index); - up_read(&vol->mftbmp_lock); - /* - * File system id. This is extremely *nix flavour dependent and even - * within Linux itself all fs do their own thing. I interpret this to - * mean a unique id associated with the mounted fs and not the id - * associated with the filesystem driver, the latter is already given - * by the filesystem type in sfs->f_type. Thus we use the 64-bit - * volume serial number splitting it into two 32-bit parts. We enter - * the least significant 32-bits in f_fsid[0] and the most significant - * 32-bits in f_fsid[1]. - */ - sfs->f_fsid = u64_to_fsid(vol->serial_no); - /* Maximum length of filenames. 
*/ - sfs->f_namelen = NTFS_MAX_NAME_LEN; - return 0; -} - -#ifdef NTFS_RW -static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) -{ - return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL); -} -#endif - -/* - * The complete super operations. - */ -static const struct super_operations ntfs_sops = { - .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */ - .free_inode = ntfs_free_big_inode, /* VFS: Deallocate inode. */ -#ifdef NTFS_RW - .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to - disk. */ -#endif /* NTFS_RW */ - .put_super = ntfs_put_super, /* Syscall: umount. */ - .statfs = ntfs_statfs, /* Syscall: statfs */ - .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */ - .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is - removed from memory. */ - .show_options = ntfs_show_options, /* Show mount options in - proc. */ -}; - -/** - * ntfs_fill_super - mount an ntfs filesystem - * @sb: super block of ntfs filesystem to mount - * @opt: string containing the mount options - * @silent: silence error output - * - * ntfs_fill_super() is called by the VFS to mount the device described by @sb - * with the mount otions in @data with the NTFS filesystem. - * - * If @silent is true, remain silent even if errors are detected. This is used - * during bootup, when the kernel tries to mount the root filesystem with all - * registered filesystems one after the other until one succeeds. This implies - * that all filesystems except the correct one will quite correctly and - * expectedly return an error, but nobody wants to see error messages when in - * fact this is what is supposed to happen. - * - * NOTE: @sb->s_flags contains the mount options flags. 
- */ -static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) -{ - ntfs_volume *vol; - struct buffer_head *bh; - struct inode *tmp_ino; - int blocksize, result; - - /* - * We do a pretty difficult piece of bootstrap by reading the - * MFT (and other metadata) from disk into memory. We'll only - * release this metadata during umount, so the locking patterns - * observed during bootstrap do not count. So turn off the - * observation of locking patterns (strictly for this context - * only) while mounting NTFS. [The validator is still active - * otherwise, even for this context: it will for example record - * lock class registrations.] - */ - lockdep_off(); - ntfs_debug("Entering."); -#ifndef NTFS_RW - sb->s_flags |= SB_RDONLY; -#endif /* ! NTFS_RW */ - /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */ - sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); - vol = NTFS_SB(sb); - if (!vol) { - if (!silent) - ntfs_error(sb, "Allocation of NTFS volume structure " - "failed. Aborting mount..."); - lockdep_on(); - return -ENOMEM; - } - /* Initialize ntfs_volume structure. */ - *vol = (ntfs_volume) { - .sb = sb, - /* - * Default is group and other don't have any access to files or - * directories while owner has full access. Further, files by - * default are not executable but directories are of course - * browseable. - */ - .fmask = 0177, - .dmask = 0077, - }; - init_rwsem(&vol->mftbmp_lock); - init_rwsem(&vol->lcnbmp_lock); - - /* By default, enable sparse support. */ - NVolSetSparseEnabled(vol); - - /* Important to get the mount options dealt with now. */ - if (!parse_options(vol, (char*)opt)) - goto err_out_now; - - /* We support sector sizes up to the PAGE_SIZE. */ - if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) { - if (!silent) - ntfs_error(sb, "Device has unsupported sector size " - "(%i). 
The maximum supported sector " - "size on this architecture is %lu " - "bytes.", - bdev_logical_block_size(sb->s_bdev), - PAGE_SIZE); - goto err_out_now; - } - /* - * Setup the device access block size to NTFS_BLOCK_SIZE or the hard - * sector size, whichever is bigger. - */ - blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE); - if (blocksize < NTFS_BLOCK_SIZE) { - if (!silent) - ntfs_error(sb, "Unable to set device block size."); - goto err_out_now; - } - BUG_ON(blocksize != sb->s_blocksize); - ntfs_debug("Set device block size to %i bytes (block size bits %i).", - blocksize, sb->s_blocksize_bits); - /* Determine the size of the device in units of block_size bytes. */ - vol->nr_blocks = sb_bdev_nr_blocks(sb); - if (!vol->nr_blocks) { - if (!silent) - ntfs_error(sb, "Unable to determine device size."); - goto err_out_now; - } - /* Read the boot sector and return unlocked buffer head to it. */ - if (!(bh = read_ntfs_boot_sector(sb, silent))) { - if (!silent) - ntfs_error(sb, "Not an NTFS volume."); - goto err_out_now; - } - /* - * Extract the data from the boot sector and setup the ntfs volume - * using it. - */ - result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data); - brelse(bh); - if (!result) { - if (!silent) - ntfs_error(sb, "Unsupported NTFS filesystem."); - goto err_out_now; - } - /* - * If the boot sector indicates a sector size bigger than the current - * device block size, switch the device block size to the sector size. - * TODO: It may be possible to support this case even when the set - * below fails, we would just be breaking up the i/o for each sector - * into multiple blocks for i/o purposes but otherwise it should just - * work. However it is safer to leave disabled until someone hits this - * error message and then we can get them to try it without the setting - * so we know for sure that it works. 
- */ - if (vol->sector_size > blocksize) { - blocksize = sb_set_blocksize(sb, vol->sector_size); - if (blocksize != vol->sector_size) { - if (!silent) - ntfs_error(sb, "Unable to set device block " - "size to sector size (%i).", - vol->sector_size); - goto err_out_now; - } - BUG_ON(blocksize != sb->s_blocksize); - vol->nr_blocks = sb_bdev_nr_blocks(sb); - ntfs_debug("Changed device block size to %i bytes (block size " - "bits %i) to match volume sector size.", - blocksize, sb->s_blocksize_bits); - } - /* Initialize the cluster and mft allocators. */ - ntfs_setup_allocators(vol); - /* Setup remaining fields in the super block. */ - sb->s_magic = NTFS_SB_MAGIC; - /* - * Ntfs allows 63 bits for the file size, i.e. correct would be: - * sb->s_maxbytes = ~0ULL >> 1; - * But the kernel uses a long as the page cache page index which on - * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel - * defined to the maximum the page cache page index can cope with - * without overflowing the index or to 2^63 - 1, whichever is smaller. - */ - sb->s_maxbytes = MAX_LFS_FILESIZE; - /* Ntfs measures time in 100ns intervals. */ - sb->s_time_gran = 100; - /* - * Now load the metadata required for the page cache and our address - * space operations to function. We do this by setting up a specialised - * read_inode method and then just calling the normal iget() to obtain - * the inode for $MFT which is sufficient to allow our normal inode - * operations and associated address space operations to function. 
- */ - sb->s_op = &ntfs_sops; - tmp_ino = new_inode(sb); - if (!tmp_ino) { - if (!silent) - ntfs_error(sb, "Failed to load essential metadata."); - goto err_out_now; - } - tmp_ino->i_ino = FILE_MFT; - insert_inode_hash(tmp_ino); - if (ntfs_read_inode_mount(tmp_ino) < 0) { - if (!silent) - ntfs_error(sb, "Failed to load essential metadata."); - goto iput_tmp_ino_err_out_now; - } - mutex_lock(&ntfs_lock); - /* - * The current mount is a compression user if the cluster size is - * less than or equal 4kiB. - */ - if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) { - result = allocate_compression_buffers(); - if (result) { - ntfs_error(NULL, "Failed to allocate buffers " - "for compression engine."); - ntfs_nr_compression_users--; - mutex_unlock(&ntfs_lock); - goto iput_tmp_ino_err_out_now; - } - } - /* - * Generate the global default upcase table if necessary. Also - * temporarily increment the number of upcase users to avoid race - * conditions with concurrent (u)mounts. - */ - if (!default_upcase) - default_upcase = generate_default_upcase(); - ntfs_nr_upcase_users++; - mutex_unlock(&ntfs_lock); - /* - * From now on, ignore @silent parameter. If we fail below this line, - * it will be due to a corrupt fs or a system error, so we report it. - */ - /* - * Open the system files with normal access functions and complete - * setting up the ntfs super block. - */ - if (!load_system_files(vol)) { - ntfs_error(sb, "Failed to load system files."); - goto unl_upcase_iput_tmp_ino_err_out_now; - } - - /* We grab a reference, simulating an ntfs_iget(). */ - ihold(vol->root_ino); - if ((sb->s_root = d_make_root(vol->root_ino))) { - ntfs_debug("Exiting, status successful."); - /* Release the default upcase if it has no users. 
*/ - mutex_lock(&ntfs_lock); - if (!--ntfs_nr_upcase_users && default_upcase) { - ntfs_free(default_upcase); - default_upcase = NULL; - } - mutex_unlock(&ntfs_lock); - sb->s_export_op = &ntfs_export_ops; - lockdep_on(); - return 0; - } - ntfs_error(sb, "Failed to allocate root directory."); - /* Clean up after the successful load_system_files() call from above. */ - // TODO: Use ntfs_put_super() instead of repeating all this code... - // FIXME: Should mark the volume clean as the error is most likely - // -ENOMEM. - iput(vol->vol_ino); - vol->vol_ino = NULL; - /* NTFS 3.0+ specific clean up. */ - if (vol->major_ver >= 3) { -#ifdef NTFS_RW - if (vol->usnjrnl_j_ino) { - iput(vol->usnjrnl_j_ino); - vol->usnjrnl_j_ino = NULL; - } - if (vol->usnjrnl_max_ino) { - iput(vol->usnjrnl_max_ino); - vol->usnjrnl_max_ino = NULL; - } - if (vol->usnjrnl_ino) { - iput(vol->usnjrnl_ino); - vol->usnjrnl_ino = NULL; - } - if (vol->quota_q_ino) { - iput(vol->quota_q_ino); - vol->quota_q_ino = NULL; - } - if (vol->quota_ino) { - iput(vol->quota_ino); - vol->quota_ino = NULL; - } -#endif /* NTFS_RW */ - if (vol->extend_ino) { - iput(vol->extend_ino); - vol->extend_ino = NULL; - } - if (vol->secure_ino) { - iput(vol->secure_ino); - vol->secure_ino = NULL; - } - } - iput(vol->root_ino); - vol->root_ino = NULL; - iput(vol->lcnbmp_ino); - vol->lcnbmp_ino = NULL; - iput(vol->mftbmp_ino); - vol->mftbmp_ino = NULL; -#ifdef NTFS_RW - if (vol->logfile_ino) { - iput(vol->logfile_ino); - vol->logfile_ino = NULL; - } - if (vol->mftmirr_ino) { - iput(vol->mftmirr_ino); - vol->mftmirr_ino = NULL; - } -#endif /* NTFS_RW */ - /* Throw away the table of attribute definitions. 
*/ - vol->attrdef_size = 0; - if (vol->attrdef) { - ntfs_free(vol->attrdef); - vol->attrdef = NULL; - } - vol->upcase_len = 0; - mutex_lock(&ntfs_lock); - if (vol->upcase == default_upcase) { - ntfs_nr_upcase_users--; - vol->upcase = NULL; - } - mutex_unlock(&ntfs_lock); - if (vol->upcase) { - ntfs_free(vol->upcase); - vol->upcase = NULL; - } - if (vol->nls_map) { - unload_nls(vol->nls_map); - vol->nls_map = NULL; - } - /* Error exit code path. */ -unl_upcase_iput_tmp_ino_err_out_now: - /* - * Decrease the number of upcase users and destroy the global default - * upcase table if necessary. - */ - mutex_lock(&ntfs_lock); - if (!--ntfs_nr_upcase_users && default_upcase) { - ntfs_free(default_upcase); - default_upcase = NULL; - } - if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) - free_compression_buffers(); - mutex_unlock(&ntfs_lock); -iput_tmp_ino_err_out_now: - iput(tmp_ino); - if (vol->mft_ino && vol->mft_ino != tmp_ino) - iput(vol->mft_ino); - vol->mft_ino = NULL; - /* Errors at this stage are irrelevant. */ -err_out_now: - sb->s_fs_info = NULL; - kfree(vol); - ntfs_debug("Failed, returning -EINVAL."); - lockdep_on(); - return -EINVAL; -} - -/* - * This is a slab cache to optimize allocations and deallocations of Unicode - * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN - * (255) Unicode characters + a terminating NULL Unicode character. - */ -struct kmem_cache *ntfs_name_cache; - -/* Slab caches for efficient allocation/deallocation of inodes. */ -struct kmem_cache *ntfs_inode_cache; -struct kmem_cache *ntfs_big_inode_cache; - -/* Init once constructor for the inode slab cache. */ -static void ntfs_big_inode_init_once(void *foo) -{ - ntfs_inode *ni = (ntfs_inode *)foo; - - inode_init_once(VFS_I(ni)); -} - -/* - * Slab caches to optimize allocations and deallocations of attribute search - * contexts and index contexts, respectively. 
- */ -struct kmem_cache *ntfs_attr_ctx_cache; -struct kmem_cache *ntfs_index_ctx_cache; - -/* Driver wide mutex. */ -DEFINE_MUTEX(ntfs_lock); - -static struct dentry *ntfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); -} - -static struct file_system_type ntfs_fs_type = { - .owner = THIS_MODULE, - .name = "ntfs", - .mount = ntfs_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("ntfs"); - -/* Stable names for the slab caches. */ -static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache"; -static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache"; -static const char ntfs_name_cache_name[] = "ntfs_name_cache"; -static const char ntfs_inode_cache_name[] = "ntfs_inode_cache"; -static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache"; - -static int __init init_ntfs_fs(void) -{ - int err = 0; - - /* This may be ugly but it results in pretty output so who cares. 
(-8 */ - pr_info("driver " NTFS_VERSION " [Flags: R/" -#ifdef NTFS_RW - "W" -#else - "O" -#endif -#ifdef DEBUG - " DEBUG" -#endif -#ifdef MODULE - " MODULE" -#endif - "].\n"); - - ntfs_debug("Debug messages are enabled."); - - ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name, - sizeof(ntfs_index_context), 0 /* offset */, - SLAB_HWCACHE_ALIGN, NULL /* ctor */); - if (!ntfs_index_ctx_cache) { - pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name); - goto ictx_err_out; - } - ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, - sizeof(ntfs_attr_search_ctx), 0 /* offset */, - SLAB_HWCACHE_ALIGN, NULL /* ctor */); - if (!ntfs_attr_ctx_cache) { - pr_crit("NTFS: Failed to create %s!\n", - ntfs_attr_ctx_cache_name); - goto actx_err_out; - } - - ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name, - (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!ntfs_name_cache) { - pr_crit("Failed to create %s!\n", ntfs_name_cache_name); - goto name_err_out; - } - - ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name, - sizeof(ntfs_inode), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - if (!ntfs_inode_cache) { - pr_crit("Failed to create %s!\n", ntfs_inode_cache_name); - goto inode_err_out; - } - - ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, - sizeof(big_ntfs_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT, ntfs_big_inode_init_once); - if (!ntfs_big_inode_cache) { - pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); - goto big_inode_err_out; - } - - /* Register the ntfs sysctls. */ - err = ntfs_sysctl(1); - if (err) { - pr_crit("Failed to register NTFS sysctls!\n"); - goto sysctl_err_out; - } - - err = register_filesystem(&ntfs_fs_type); - if (!err) { - ntfs_debug("NTFS driver registered successfully."); - return 0; /* Success! */ - } - pr_crit("Failed to register NTFS filesystem driver!\n"); - - /* Unregister the ntfs sysctls. 
*/ - ntfs_sysctl(0); -sysctl_err_out: - kmem_cache_destroy(ntfs_big_inode_cache); -big_inode_err_out: - kmem_cache_destroy(ntfs_inode_cache); -inode_err_out: - kmem_cache_destroy(ntfs_name_cache); -name_err_out: - kmem_cache_destroy(ntfs_attr_ctx_cache); -actx_err_out: - kmem_cache_destroy(ntfs_index_ctx_cache); -ictx_err_out: - if (!err) { - pr_crit("Aborting NTFS filesystem driver registration...\n"); - err = -ENOMEM; - } - return err; -} - -static void __exit exit_ntfs_fs(void) -{ - ntfs_debug("Unregistering NTFS driver."); - - unregister_filesystem(&ntfs_fs_type); - - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(ntfs_big_inode_cache); - kmem_cache_destroy(ntfs_inode_cache); - kmem_cache_destroy(ntfs_name_cache); - kmem_cache_destroy(ntfs_attr_ctx_cache); - kmem_cache_destroy(ntfs_index_ctx_cache); - /* Unregister the ntfs sysctls. */ - ntfs_sysctl(0); -} - -MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>"); -MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc."); -MODULE_VERSION(NTFS_VERSION); -MODULE_LICENSE("GPL"); -#ifdef DEBUG -module_param(debug_msgs, bint, 0); -MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); -#endif - -module_init(init_ntfs_fs) -module_exit(exit_ntfs_fs) diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c deleted file mode 100644 index 4e980170d86a..000000000000 --- a/fs/ntfs/sysctl.c +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of - * the Linux-NTFS project. 
Adapted from the old NTFS driver, - * Copyright (C) 1997 Martin von Löwis, Régis Duchesne - * - * Copyright (c) 2002-2005 Anton Altaparmakov - */ - -#ifdef DEBUG - -#include <linux/module.h> - -#ifdef CONFIG_SYSCTL - -#include <linux/proc_fs.h> -#include <linux/sysctl.h> - -#include "sysctl.h" -#include "debug.h" - -/* Definition of the ntfs sysctl. */ -static struct ctl_table ntfs_sysctls[] = { - { - .procname = "ntfs-debug", - .data = &debug_msgs, /* Data pointer and size. */ - .maxlen = sizeof(debug_msgs), - .mode = 0644, /* Mode, proc handler. */ - .proc_handler = proc_dointvec - }, -}; - -/* Storage for the sysctls header. */ -static struct ctl_table_header *sysctls_root_table; - -/** - * ntfs_sysctl - add or remove the debug sysctl - * @add: add (1) or remove (0) the sysctl - * - * Add or remove the debug sysctl. Return 0 on success or -errno on error. - */ -int ntfs_sysctl(int add) -{ - if (add) { - BUG_ON(sysctls_root_table); - sysctls_root_table = register_sysctl("fs", ntfs_sysctls); - if (!sysctls_root_table) - return -ENOMEM; - } else { - BUG_ON(!sysctls_root_table); - unregister_sysctl_table(sysctls_root_table); - sysctls_root_table = NULL; - } - return 0; -} - -#endif /* CONFIG_SYSCTL */ -#endif /* DEBUG */ diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h deleted file mode 100644 index 96bb2299d2d5..000000000000 --- a/fs/ntfs/sysctl.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of - * the Linux-NTFS project. Adapted from the old NTFS driver, - * Copyright (C) 1997 Martin von Löwis, Régis Duchesne - * - * Copyright (c) 2002-2004 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_SYSCTL_H -#define _LINUX_NTFS_SYSCTL_H - - -#if defined(DEBUG) && defined(CONFIG_SYSCTL) - -extern int ntfs_sysctl(int add); - -#else - -/* Just return success. 
*/ -static inline int ntfs_sysctl(int add) -{ - return 0; -} - -#endif /* DEBUG && CONFIG_SYSCTL */ -#endif /* _LINUX_NTFS_SYSCTL_H */ diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h deleted file mode 100644 index 6b63261300cc..000000000000 --- a/fs/ntfs/time.h +++ /dev/null @@ -1,89 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * time.h - NTFS time conversion functions. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_TIME_H -#define _LINUX_NTFS_TIME_H - -#include <linux/time.h> /* For current_kernel_time(). */ -#include <asm/div64.h> /* For do_div(). */ - -#include "endian.h" - -#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000) - -/** - * utc2ntfs - convert Linux UTC time to NTFS time - * @ts: Linux UTC time to convert to NTFS time - * - * Convert the Linux UTC time @ts to its corresponding NTFS time and return - * that in little endian format. - * - * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec - * and a long tv_nsec where tv_sec is the number of 1-second intervals since - * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second - * intervals since the value of tv_sec. - * - * NTFS uses Microsoft's standard time format which is stored in a s64 and is - * measured as the number of 100-nano-second intervals since 1st January 1601, - * 00:00:00 UTC. - */ -static inline sle64 utc2ntfs(const struct timespec64 ts) -{ - /* - * Convert the seconds to 100ns intervals, add the nano-seconds - * converted to 100ns intervals, and then add the NTFS time offset. - */ - return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 + - NTFS_TIME_OFFSET); -} - -/** - * get_current_ntfs_time - get the current time in little endian NTFS format - * - * Get the current time from the Linux kernel, convert it to its corresponding - * NTFS time and return that in little endian format. 
- */ -static inline sle64 get_current_ntfs_time(void) -{ - struct timespec64 ts; - - ktime_get_coarse_real_ts64(&ts); - return utc2ntfs(ts); -} - -/** - * ntfs2utc - convert NTFS time to Linux time - * @time: NTFS time (little endian) to convert to Linux UTC - * - * Convert the little endian NTFS time @time to its corresponding Linux UTC - * time and return that in cpu format. - * - * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec - * and a long tv_nsec where tv_sec is the number of 1-second intervals since - * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second - * intervals since the value of tv_sec. - * - * NTFS uses Microsoft's standard time format which is stored in a s64 and is - * measured as the number of 100 nano-second intervals since 1st January 1601, - * 00:00:00 UTC. - */ -static inline struct timespec64 ntfs2utc(const sle64 time) -{ - struct timespec64 ts; - - /* Subtract the NTFS time offset. */ - u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET); - /* - * Convert the time to 1-second intervals and the remainder to - * 1-nano-second intervals. - */ - ts.tv_nsec = do_div(t, 10000000) * 100; - ts.tv_sec = t; - return ts; -} - -#endif /* _LINUX_NTFS_TIME_H */ diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h deleted file mode 100644 index 9a47859e7a06..000000000000 --- a/fs/ntfs/types.h +++ /dev/null @@ -1,55 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * types.h - Defines for NTFS Linux kernel driver specific types. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_TYPES_H -#define _LINUX_NTFS_TYPES_H - -#include <linux/types.h> - -typedef __le16 le16; -typedef __le32 le32; -typedef __le64 le64; -typedef __u16 __bitwise sle16; -typedef __u32 __bitwise sle32; -typedef __u64 __bitwise sle64; - -/* 2-byte Unicode character type. 
*/ -typedef le16 ntfschar; -#define UCHAR_T_SIZE_BITS 1 - -/* - * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN - * and VCN, to allow for type checking and better code readability. - */ -typedef s64 VCN; -typedef sle64 leVCN; -typedef s64 LCN; -typedef sle64 leLCN; - -/* - * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit - * values. We define our own type LSN, to allow for type checking and better - * code readability. - */ -typedef s64 LSN; -typedef sle64 leLSN; - -/* - * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values. - * We define our own type USN, to allow for type checking and better code - * readability. - */ -typedef s64 USN; -typedef sle64 leUSN; - -typedef enum { - CASE_SENSITIVE = 0, - IGNORE_CASE = 1, -} IGNORE_CASE_BOOL; - -#endif /* _LINUX_NTFS_TYPES_H */ diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c deleted file mode 100644 index a6b6c64f14a9..000000000000 --- a/fs/ntfs/unistr.c +++ /dev/null @@ -1,384 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project. - * - * Copyright (c) 2001-2006 Anton Altaparmakov - */ - -#include <linux/slab.h> - -#include "types.h" -#include "debug.h" -#include "ntfs.h" - -/* - * IMPORTANT - * ========= - * - * All these routines assume that the Unicode characters are in little endian - * encoding inside the strings!!! - */ - -/* - * This is used by the name collation functions to quickly determine what - * characters are (in)valid. 
- */ -static const u8 legal_ansi_char_array[0x40] = { - 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, - 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, - - 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, - 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, - - 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17, - 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00, - - 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, - 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18, -}; - -/** - * ntfs_are_names_equal - compare two Unicode names for equality - * @s1: name to compare to @s2 - * @s1_len: length in Unicode characters of @s1 - * @s2: name to compare to @s1 - * @s2_len: length in Unicode characters of @s2 - * @ic: ignore case bool - * @upcase: upcase table (only if @ic == IGNORE_CASE) - * @upcase_size: length in Unicode characters of @upcase (if present) - * - * Compare the names @s1 and @s2 and return 'true' (1) if the names are - * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE, - * the @upcase table is used to performa a case insensitive comparison. 
- */ -bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, - const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_size) -{ - if (s1_len != s2_len) - return false; - if (ic == CASE_SENSITIVE) - return !ntfs_ucsncmp(s1, s2, s1_len); - return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size); -} - -/** - * ntfs_collate_names - collate two Unicode names - * @name1: first Unicode name to compare - * @name2: second Unicode name to compare - * @err_val: if @name1 contains an invalid character return this value - * @ic: either CASE_SENSITIVE or IGNORE_CASE - * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) - * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) - * - * ntfs_collate_names collates two Unicode names and returns: - * - * -1 if the first name collates before the second one, - * 0 if the names match, - * 1 if the second name collates before the first one, or - * @err_val if an invalid character is found in @name1 during the comparison. - * - * The following characters are considered invalid: '"', '*', '<', '>' and '?'. 
- */ -int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, - const ntfschar *name2, const u32 name2_len, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len) -{ - u32 cnt, min_len; - u16 c1, c2; - - min_len = name1_len; - if (name1_len > name2_len) - min_len = name2_len; - for (cnt = 0; cnt < min_len; ++cnt) { - c1 = le16_to_cpu(*name1++); - c2 = le16_to_cpu(*name2++); - if (ic) { - if (c1 < upcase_len) - c1 = le16_to_cpu(upcase[c1]); - if (c2 < upcase_len) - c2 = le16_to_cpu(upcase[c2]); - } - if (c1 < 64 && legal_ansi_char_array[c1] & 8) - return err_val; - if (c1 < c2) - return -1; - if (c1 > c2) - return 1; - } - if (name1_len < name2_len) - return -1; - if (name1_len == name2_len) - return 0; - /* name1_len > name2_len */ - c1 = le16_to_cpu(*name1); - if (c1 < 64 && legal_ansi_char_array[c1] & 8) - return err_val; - return 1; -} - -/** - * ntfs_ucsncmp - compare two little endian Unicode strings - * @s1: first string - * @s2: second string - * @n: maximum unicode characters to compare - * - * Compare the first @n characters of the Unicode strings @s1 and @s2, - * The strings in little endian format and appropriate le16_to_cpu() - * conversion is performed on non-little endian machines. - * - * The function returns an integer less than, equal to, or greater than zero - * if @s1 (or the first @n Unicode characters thereof) is found, respectively, - * to be less than, to match, or be greater than @s2. 
- */ -int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) -{ - u16 c1, c2; - size_t i; - - for (i = 0; i < n; ++i) { - c1 = le16_to_cpu(s1[i]); - c2 = le16_to_cpu(s2[i]); - if (c1 < c2) - return -1; - if (c1 > c2) - return 1; - if (!c1) - break; - } - return 0; -} - -/** - * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case - * @s1: first string - * @s2: second string - * @n: maximum unicode characters to compare - * @upcase: upcase table - * @upcase_size: upcase table size in Unicode characters - * - * Compare the first @n characters of the Unicode strings @s1 and @s2, - * ignoring case. The strings in little endian format and appropriate - * le16_to_cpu() conversion is performed on non-little endian machines. - * - * Each character is uppercased using the @upcase table before the comparison. - * - * The function returns an integer less than, equal to, or greater than zero - * if @s1 (or the first @n Unicode characters thereof) is found, respectively, - * to be less than, to match, or be greater than @s2. 
- */ -int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, - const ntfschar *upcase, const u32 upcase_size) -{ - size_t i; - u16 c1, c2; - - for (i = 0; i < n; ++i) { - if ((c1 = le16_to_cpu(s1[i])) < upcase_size) - c1 = le16_to_cpu(upcase[c1]); - if ((c2 = le16_to_cpu(s2[i])) < upcase_size) - c2 = le16_to_cpu(upcase[c2]); - if (c1 < c2) - return -1; - if (c1 > c2) - return 1; - if (!c1) - break; - } - return 0; -} - -void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase, - const u32 upcase_len) -{ - u32 i; - u16 u; - - for (i = 0; i < name_len; i++) - if ((u = le16_to_cpu(name[i])) < upcase_len) - name[i] = upcase[u]; -} - -void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, - const ntfschar *upcase, const u32 upcase_len) -{ - ntfs_upcase_name((ntfschar*)&file_name_attr->file_name, - file_name_attr->file_name_length, upcase, upcase_len); -} - -int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, - FILE_NAME_ATTR *file_name_attr2, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len) -{ - return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name, - file_name_attr1->file_name_length, - (ntfschar*)&file_name_attr2->file_name, - file_name_attr2->file_name_length, - err_val, ic, upcase, upcase_len); -} - -/** - * ntfs_nlstoucs - convert NLS string to little endian Unicode string - * @vol: ntfs volume which we are working with - * @ins: input NLS string buffer - * @ins_len: length of input string in bytes - * @outs: on return contains the allocated output Unicode string buffer - * - * Convert the input string @ins, which is in whatever format the loaded NLS - * map dictates, into a little endian, 2-byte Unicode string. - * - * This function allocates the string and the caller is responsible for - * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it. 
- * - * On success the function returns the number of Unicode characters written to - * the output string *@outs (>= 0), not counting the terminating Unicode NULL - * character. *@outs is set to the allocated output string buffer. - * - * On error, a negative number corresponding to the error code is returned. In - * that case the output string is not allocated. Both *@outs and *@outs_len - * are then undefined. - * - * This might look a bit odd due to fast path optimization... - */ -int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, - const int ins_len, ntfschar **outs) -{ - struct nls_table *nls = vol->nls_map; - ntfschar *ucs; - wchar_t wc; - int i, o, wc_len; - - /* We do not trust outside sources. */ - if (likely(ins)) { - ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS); - if (likely(ucs)) { - for (i = o = 0; i < ins_len; i += wc_len) { - wc_len = nls->char2uni(ins + i, ins_len - i, - &wc); - if (likely(wc_len >= 0 && - o < NTFS_MAX_NAME_LEN)) { - if (likely(wc)) { - ucs[o++] = cpu_to_le16(wc); - continue; - } /* else if (!wc) */ - break; - } /* else if (wc_len < 0 || - o >= NTFS_MAX_NAME_LEN) */ - goto name_err; - } - ucs[o] = 0; - *outs = ucs; - return o; - } /* else if (!ucs) */ - ntfs_error(vol->sb, "Failed to allocate buffer for converted " - "name from ntfs_name_cache."); - return -ENOMEM; - } /* else if (!ins) */ - ntfs_error(vol->sb, "Received NULL pointer."); - return -EINVAL; -name_err: - kmem_cache_free(ntfs_name_cache, ucs); - if (wc_len < 0) { - ntfs_error(vol->sb, "Name using character set %s contains " - "characters that cannot be converted to " - "Unicode.", nls->charset); - i = -EILSEQ; - } else /* if (o >= NTFS_MAX_NAME_LEN) */ { - ntfs_error(vol->sb, "Name is too long (maximum length for a " - "name on NTFS is %d Unicode characters.", - NTFS_MAX_NAME_LEN); - i = -ENAMETOOLONG; - } - return i; -} - -/** - * ntfs_ucstonls - convert little endian Unicode string to NLS string - * @vol: ntfs volume which we are working with - * @ins: 
input Unicode string buffer - * @ins_len: length of input string in Unicode characters - * @outs: on return contains the (allocated) output NLS string buffer - * @outs_len: length of output string buffer in bytes - * - * Convert the input little endian, 2-byte Unicode string @ins, of length - * @ins_len into the string format dictated by the loaded NLS. - * - * If *@outs is NULL, this function allocates the string and the caller is - * responsible for calling kfree(*@outs); when finished with it. In this case - * @outs_len is ignored and can be 0. - * - * On success the function returns the number of bytes written to the output - * string *@outs (>= 0), not counting the terminating NULL byte. If the output - * string buffer was allocated, *@outs is set to it. - * - * On error, a negative number corresponding to the error code is returned. In - * that case the output string is not allocated. The contents of *@outs are - * then undefined. - * - * This might look a bit odd due to fast path optimization... - */ -int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, - const int ins_len, unsigned char **outs, int outs_len) -{ - struct nls_table *nls = vol->nls_map; - unsigned char *ns; - int i, o, ns_len, wc; - - /* We don't trust outside sources. */ - if (ins) { - ns = *outs; - ns_len = outs_len; - if (ns && !ns_len) { - wc = -ENAMETOOLONG; - goto conversion_err; - } - if (!ns) { - ns_len = ins_len * NLS_MAX_CHARSET_SIZE; - ns = kmalloc(ns_len + 1, GFP_NOFS); - if (!ns) - goto mem_err_out; - } - for (i = o = 0; i < ins_len; i++) { -retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, - ns_len - o); - if (wc > 0) { - o += wc; - continue; - } else if (!wc) - break; - else if (wc == -ENAMETOOLONG && ns != *outs) { - unsigned char *tc; - /* Grow in multiples of 64 bytes. 
*/ - tc = kmalloc((ns_len + 64) & - ~63, GFP_NOFS); - if (tc) { - memcpy(tc, ns, ns_len); - ns_len = ((ns_len + 64) & ~63) - 1; - kfree(ns); - ns = tc; - goto retry; - } /* No memory so goto conversion_error; */ - } /* wc < 0, real error. */ - goto conversion_err; - } - ns[o] = 0; - *outs = ns; - return o; - } /* else (!ins) */ - ntfs_error(vol->sb, "Received NULL pointer."); - return -EINVAL; -conversion_err: - ntfs_error(vol->sb, "Unicode name contains characters that cannot be " - "converted to character set %s. You might want to " - "try to use the mount option nls=utf8.", nls->charset); - if (ns != *outs) - kfree(ns); - if (wc != -ENAMETOOLONG) - wc = -EILSEQ; - return wc; -mem_err_out: - ntfs_error(vol->sb, "Failed to allocate name!"); - return -ENOMEM; -} diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c deleted file mode 100644 index 4ebe84a78dea..000000000000 --- a/fs/ntfs/upcase.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * upcase.c - Generate the full NTFS Unicode upcase table in little endian. - * Part of the Linux-NTFS project. 
- * - * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org> - * Copyright (c) 2001-2006 Anton Altaparmakov - */ - -#include "malloc.h" -#include "ntfs.h" - -ntfschar *generate_default_upcase(void) -{ - static const int uc_run_table[][3] = { /* Start, End, Add */ - {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, - {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, - {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, - {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, - {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, - {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, - {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, - {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, - {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, - {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, - {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, - {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, - {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, - {0} - }; - - static const int uc_dup_table[][2] = { /* Start, End */ - {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, - {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, - {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, - {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, - {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, - {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, - {0} - }; - - static const int uc_word_table[][2] = { /* Offset, Value */ - {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196}, - {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, - {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, - {0x0188, 0x0187}, {0x01BD, 
0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, - {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, - {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, - {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, - {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, - {0} - }; - - int i, r; - ntfschar *uc; - - uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar)); - if (!uc) - return uc; - memset(uc, 0, default_upcase_len * sizeof(ntfschar)); - /* Generate the little endian Unicode upcase table used by ntfs. */ - for (i = 0; i < default_upcase_len; i++) - uc[i] = cpu_to_le16(i); - for (r = 0; uc_run_table[r][0]; r++) - for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) - le16_add_cpu(&uc[i], uc_run_table[r][2]); - for (r = 0; uc_dup_table[r][0]; r++) - for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) - le16_add_cpu(&uc[i + 1], -1); - for (r = 0; uc_word_table[r][0]; r++) - uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]); - return uc; -} diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c deleted file mode 100644 index 9097a0b4ef25..000000000000 --- a/fs/ntfs/usnjrnl.c +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling. Part of the - * Linux-NTFS project. - * - * Copyright (c) 2005 Anton Altaparmakov - */ - -#ifdef NTFS_RW - -#include <linux/fs.h> -#include <linux/highmem.h> -#include <linux/mm.h> - -#include "aops.h" -#include "debug.h" -#include "endian.h" -#include "time.h" -#include "types.h" -#include "usnjrnl.h" -#include "volume.h" - -/** - * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume - * @vol: ntfs volume on which to stamp the transaction log - * - * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return - * 'true' on success and 'false' on error. 
- * - * This function assumes that the transaction log has already been loaded and - * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl(). - */ -bool ntfs_stamp_usnjrnl(ntfs_volume *vol) -{ - ntfs_debug("Entering."); - if (likely(!NVolUsnJrnlStamped(vol))) { - sle64 stamp; - struct page *page; - USN_HEADER *uh; - - page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read from " - "$UsnJrnl/$DATA/$Max attribute."); - return false; - } - uh = (USN_HEADER*)page_address(page); - stamp = get_current_ntfs_time(); - ntfs_debug("Stamping transaction log ($UsnJrnl): old " - "journal_id 0x%llx, old lowest_valid_usn " - "0x%llx, new journal_id 0x%llx, new " - "lowest_valid_usn 0x%llx.", - (long long)sle64_to_cpu(uh->journal_id), - (long long)sle64_to_cpu(uh->lowest_valid_usn), - (long long)sle64_to_cpu(stamp), - i_size_read(vol->usnjrnl_j_ino)); - uh->lowest_valid_usn = - cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino)); - uh->journal_id = stamp; - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); - /* Set the flag so we do not have to do it again on remount. */ - NVolSetUsnJrnlStamped(vol); - } - ntfs_debug("Done."); - return true; -} - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h deleted file mode 100644 index 85f531b59395..000000000000 --- a/fs/ntfs/usnjrnl.h +++ /dev/null @@ -1,191 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling. - * Part of the Linux-NTFS project. - * - * Copyright (c) 2005 Anton Altaparmakov - */ - -#ifndef _LINUX_NTFS_USNJRNL_H -#define _LINUX_NTFS_USNJRNL_H - -#ifdef NTFS_RW - -#include "types.h" -#include "endian.h" -#include "layout.h" -#include "volume.h" - -/* - * Transaction log ($UsnJrnl) organization: - * - * The transaction log records whenever a file is modified in any way. 
So for - * example it will record that file "blah" was written to at a particular time - * but not what was written. If will record that a file was deleted or - * created, that a file was truncated, etc. See below for all the reason - * codes used. - * - * The transaction log is in the $Extend directory which is in the root - * directory of each volume. If it is not present it means transaction - * logging is disabled. If it is present it means transaction logging is - * either enabled or in the process of being disabled in which case we can - * ignore it as it will go away as soon as Windows gets its hands on it. - * - * To determine whether the transaction logging is enabled or in the process - * of being disabled, need to check the volume flags in the - * $VOLUME_INFORMATION attribute in the $Volume system file (which is present - * in the root directory and has a fixed mft record number, see layout.h). - * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log - * is in the process of being disabled and if this flag is clear it means the - * transaction log is enabled. - * - * The transaction log consists of two parts; the $DATA/$Max attribute as well - * as the $DATA/$J attribute. $Max is a header describing the transaction - * log whilst $J is the transaction log data itself as a sequence of variable - * sized USN_RECORDs (see below for all the structures). - * - * We do not care about transaction logging at this point in time but we still - * need to let windows know that the transaction log is out of date. To do - * this we need to stamp the transaction log. This involves setting the - * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used - * for the next added USN_RECORD to the $DATA/$J attribute as well as - * generating a new journal_id in $DATA/$Max. 
- * - * The journal_id is as of the current version (2.0) of the transaction log - * simply the 64-bit timestamp of when the journal was either created or last - * stamped. - * - * To determine the next usn there are two ways. The first is to parse - * $DATA/$J and to find the last USN_RECORD in it and to add its record_length - * to its usn (which is the byte offset in the $DATA/$J attribute). The - * second is simply to take the data size of the attribute. Since the usns - * are simply byte offsets into $DATA/$J, this is exactly the next usn. For - * obvious reasons we use the second method as it is much simpler and faster. - * - * As an aside, note that to actually disable the transaction log, one would - * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go - * through all the mft records on the volume and set the usn field in their - * $STANDARD_INFORMATION attribute to zero. Once that is done, one would need - * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally, - * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag. - * - * Note that if a volume is unmounted whilst the transaction log is being - * disabled, the process will continue the next time the volume is mounted. - * This is why we can safely mount read-write when we see a transaction log - * in the process of being deleted. - */ - -/* Some $UsnJrnl related constants. */ -#define UsnJrnlMajorVer 2 -#define UsnJrnlMinorVer 0 - -/* - * $DATA/$Max attribute. This is (always?) resident and has a fixed size of - * 32 bytes. It contains the header describing the transaction log. - */ -typedef struct { -/*Ofs*/ -/* 0*/sle64 maximum_size; /* The maximum on-disk size of the $DATA/$J - attribute. */ -/* 8*/sle64 allocation_delta; /* Number of bytes by which to increase the - size of the $DATA/$J attribute. */ -/*0x10*/sle64 journal_id; /* Current id of the transaction log. 
*/ -/*0x18*/leUSN lowest_valid_usn; /* Lowest valid usn in $DATA/$J for the - current journal_id. */ -/* sizeof() = 32 (0x20) bytes */ -} __attribute__ ((__packed__)) USN_HEADER; - -/* - * Reason flags (32-bit). Cumulative flags describing the change(s) to the - * file since it was last opened. I think the names speak for themselves but - * if you disagree check out the descriptions in the Linux NTFS project NTFS - * documentation: http://www.linux-ntfs.org/ - */ -enum { - USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001), - USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002), - USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004), - USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010), - USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020), - USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040), - USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100), - USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200), - USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400), - USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800), - USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000), - USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000), - USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000), - USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000), - USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000), - USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000), - USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000), - USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000), - USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000), - USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000), - USN_REASON_CLOSE = cpu_to_le32(0x80000000), -}; - -typedef le32 USN_REASON_FLAGS; - -/* - * Source info flags (32-bit). Information about the source of the change(s) - * to the file. 
For detailed descriptions of what these mean, see the Linux - * NTFS project NTFS documentation: - * http://www.linux-ntfs.org/ - */ -enum { - USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001), - USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002), - USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004), -}; - -typedef le32 USN_SOURCE_INFO_FLAGS; - -/* - * $DATA/$J attribute. This is always non-resident, is marked as sparse, and - * is of variabled size. It consists of a sequence of variable size - * USN_RECORDS. The minimum allocated_size is allocation_delta as - * specified in $DATA/$Max. When the maximum_size specified in $DATA/$Max is - * exceeded by more than allocation_delta bytes, allocation_delta bytes are - * allocated and appended to the $DATA/$J attribute and an equal number of - * bytes at the beginning of the attribute are freed and made sparse. Note the - * making sparse only happens at volume checkpoints and hence the actual - * $DATA/$J size can exceed maximum_size + allocation_delta temporarily. - */ -typedef struct { -/*Ofs*/ -/* 0*/le32 length; /* Byte size of this record (8-byte - aligned). */ -/* 4*/le16 major_ver; /* Major version of the transaction log used - for this record. */ -/* 6*/le16 minor_ver; /* Minor version of the transaction log used - for this record. */ -/* 8*/leMFT_REF mft_reference;/* The mft reference of the file (or - directory) described by this record. */ -/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent - directory of the file described by this - record. */ -/*0x18*/leUSN usn; /* The usn of this record. Equals the offset - within the $DATA/$J attribute. */ -/*0x20*/sle64 time; /* Time when this record was created. */ -/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */ -/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */ -/*0x30*/le32 security_id; /* File security_id copied from - $STANDARD_INFORMATION. 
*/ -/*0x34*/FILE_ATTR_FLAGS file_attributes; /* File attributes copied from - $STANDARD_INFORMATION or $FILE_NAME (not - sure which). */ -/*0x38*/le16 file_name_size; /* Size of the file name in bytes. */ -/*0x3a*/le16 file_name_offset; /* Offset to the file name in bytes from the - start of this record. */ -/*0x3c*/ntfschar file_name[0]; /* Use when creating only. When reading use - file_name_offset to determine the location - of the name. */ -/* sizeof() = 60 (0x3c) bytes */ -} __attribute__ ((__packed__)) USN_RECORD; - -extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol); - -#endif /* NTFS_RW */ - -#endif /* _LINUX_NTFS_USNJRNL_H */ diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h deleted file mode 100644 index 930a9ae8a053..000000000000 --- a/fs/ntfs/volume.h +++ /dev/null @@ -1,164 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part - * of the Linux-NTFS project. - * - * Copyright (c) 2001-2006 Anton Altaparmakov - * Copyright (c) 2002 Richard Russon - */ - -#ifndef _LINUX_NTFS_VOLUME_H -#define _LINUX_NTFS_VOLUME_H - -#include <linux/rwsem.h> -#include <linux/uidgid.h> - -#include "types.h" -#include "layout.h" - -/* - * The NTFS in memory super block structure. - */ -typedef struct { - /* - * FIXME: Reorder to have commonly used together element within the - * same cache line, aiming at a cache line size of 32 bytes. Aim for - * 64 bytes for less commonly used together elements. Put most commonly - * used elements to front of structure. Obviously do this only when the - * structure has stabilized... (AIA) - */ - /* Device specifics. */ - struct super_block *sb; /* Pointer back to the super_block. */ - LCN nr_blocks; /* Number of sb->s_blocksize bytes - sized blocks on the device. */ - /* Configuration provided by user at mount time. */ - unsigned long flags; /* Miscellaneous flags, see below. */ - kuid_t uid; /* uid that files will be mounted as. 
*/ - kgid_t gid; /* gid that files will be mounted as. */ - umode_t fmask; /* The mask for file permissions. */ - umode_t dmask; /* The mask for directory - permissions. */ - u8 mft_zone_multiplier; /* Initial mft zone multiplier. */ - u8 on_errors; /* What to do on filesystem errors. */ - /* NTFS bootsector provided information. */ - u16 sector_size; /* in bytes */ - u8 sector_size_bits; /* log2(sector_size) */ - u32 cluster_size; /* in bytes */ - u32 cluster_size_mask; /* cluster_size - 1 */ - u8 cluster_size_bits; /* log2(cluster_size) */ - u32 mft_record_size; /* in bytes */ - u32 mft_record_size_mask; /* mft_record_size - 1 */ - u8 mft_record_size_bits; /* log2(mft_record_size) */ - u32 index_record_size; /* in bytes */ - u32 index_record_size_mask; /* index_record_size - 1 */ - u8 index_record_size_bits; /* log2(index_record_size) */ - LCN nr_clusters; /* Volume size in clusters == number of - bits in lcn bitmap. */ - LCN mft_lcn; /* Cluster location of mft data. */ - LCN mftmirr_lcn; /* Cluster location of copy of mft. */ - u64 serial_no; /* The volume serial number. */ - /* Mount specific NTFS information. */ - u32 upcase_len; /* Number of entries in upcase[]. */ - ntfschar *upcase; /* The upcase table. */ - - s32 attrdef_size; /* Size of the attribute definition - table in bytes. */ - ATTR_DEF *attrdef; /* Table of attribute definitions. - Obtained from FILE_AttrDef. */ - -#ifdef NTFS_RW - /* Variables used by the cluster and mft allocators. */ - s64 mft_data_pos; /* Mft record number at which to - allocate the next mft record. */ - LCN mft_zone_start; /* First cluster of the mft zone. */ - LCN mft_zone_end; /* First cluster beyond the mft zone. */ - LCN mft_zone_pos; /* Current position in the mft zone. */ - LCN data1_zone_pos; /* Current position in the first data - zone. */ - LCN data2_zone_pos; /* Current position in the second data - zone. */ -#endif /* NTFS_RW */ - - struct inode *mft_ino; /* The VFS inode of $MFT. 
*/ - - struct inode *mftbmp_ino; /* Attribute inode for $MFT/$BITMAP. */ - struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the - mft record bitmap ($MFT/$BITMAP). */ -#ifdef NTFS_RW - struct inode *mftmirr_ino; /* The VFS inode of $MFTMirr. */ - int mftmirr_size; /* Size of mft mirror in mft records. */ - - struct inode *logfile_ino; /* The VFS inode of $LogFile. */ -#endif /* NTFS_RW */ - - struct inode *lcnbmp_ino; /* The VFS inode of $Bitmap. */ - struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the - cluster bitmap ($Bitmap/$DATA). */ - - struct inode *vol_ino; /* The VFS inode of $Volume. */ - VOLUME_FLAGS vol_flags; /* Volume flags. */ - u8 major_ver; /* Ntfs major version of volume. */ - u8 minor_ver; /* Ntfs minor version of volume. */ - - struct inode *root_ino; /* The VFS inode of the root - directory. */ - struct inode *secure_ino; /* The VFS inode of $Secure (NTFS3.0+ - only, otherwise NULL). */ - struct inode *extend_ino; /* The VFS inode of $Extend (NTFS3.0+ - only, otherwise NULL). */ -#ifdef NTFS_RW - /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ - struct inode *quota_ino; /* The VFS inode of $Quota. */ - struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */ - /* $UsnJrnl stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ - struct inode *usnjrnl_ino; /* The VFS inode of $UsnJrnl. */ - struct inode *usnjrnl_max_ino; /* Attribute inode for $UsnJrnl/$Max. */ - struct inode *usnjrnl_j_ino; /* Attribute inode for $UsnJrnl/$J. */ -#endif /* NTFS_RW */ - struct nls_table *nls_map; -} ntfs_volume; - -/* - * Defined bits for the flags field in the ntfs_volume structure. - */ -typedef enum { - NV_Errors, /* 1: Volume has errors, prevent remount rw. */ - NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */ - NV_CaseSensitive, /* 1: Treat file names as case sensitive and - create filenames in the POSIX namespace. 
- Otherwise be case insensitive but still - create file names in POSIX namespace. */ - NV_LogFileEmpty, /* 1: $LogFile journal is empty. */ - NV_QuotaOutOfDate, /* 1: $Quota is out of date. */ - NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */ - NV_SparseEnabled, /* 1: May create sparse files. */ -} ntfs_volume_flags; - -/* - * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo() - * functions. - */ -#define DEFINE_NVOL_BIT_OPS(flag) \ -static inline int NVol##flag(ntfs_volume *vol) \ -{ \ - return test_bit(NV_##flag, &(vol)->flags); \ -} \ -static inline void NVolSet##flag(ntfs_volume *vol) \ -{ \ - set_bit(NV_##flag, &(vol)->flags); \ -} \ -static inline void NVolClear##flag(ntfs_volume *vol) \ -{ \ - clear_bit(NV_##flag, &(vol)->flags); \ -} - -/* Emit the ntfs volume bitops functions. */ -DEFINE_NVOL_BIT_OPS(Errors) -DEFINE_NVOL_BIT_OPS(ShowSystemFiles) -DEFINE_NVOL_BIT_OPS(CaseSensitive) -DEFINE_NVOL_BIT_OPS(LogFileEmpty) -DEFINE_NVOL_BIT_OPS(QuotaOutOfDate) -DEFINE_NVOL_BIT_OPS(UsnJrnlStamped) -DEFINE_NVOL_BIT_OPS(SparseEnabled) - -#endif /* _LINUX_NTFS_VOLUME_H */ diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 63f70259edc0..7aadf5010999 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -886,7 +886,7 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi; u8 cluster_bits; - struct ATTRIB *attr = NULL, *attr_b; + struct ATTRIB *attr, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end, vcn0, alen; @@ -904,12 +904,8 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, *len = 0; up_read(&ni->file.run_lock); - if (*len) { - if (*lcn != SPARSE_LCN || !new) - return 0; /* Fast normal way without allocation. 
*/ - else if (clen > *len) - clen = *len; - } + if (*len && (*lcn != SPARSE_LCN || !new)) + return 0; /* Fast normal way without allocation. */ /* No cluster in cache or we need to allocate cluster in hole. */ sbi = ni->mi.sbi; @@ -918,6 +914,17 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, ni_lock(ni); down_write(&ni->file.run_lock); + /* Repeat the code above (under write lock). */ + if (!run_lookup_entry(run, vcn, lcn, len, NULL)) + *len = 0; + + if (*len) { + if (*lcn != SPARSE_LCN || !new) + goto out; /* normal way without allocation. */ + if (clen > *len) + clen = *len; + } + le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { @@ -1736,8 +1743,10 @@ repack: le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); - if (!attr_b) - return -ENOENT; + if (!attr_b) { + err = -ENOENT; + goto out; + } attr = attr_b; le = le_b; @@ -1818,13 +1827,15 @@ ins_ext: ok: run_truncate_around(run, vcn); out: - if (new_valid > data_size) - new_valid = data_size; + if (attr_b) { + if (new_valid > data_size) + new_valid = data_size; - valid_size = le64_to_cpu(attr_b->nres.valid_size); - if (new_valid != valid_size) { - attr_b->nres.valid_size = cpu_to_le64(valid_size); - mi_b->dirty = true; + valid_size = le64_to_cpu(attr_b->nres.valid_size); + if (new_valid != valid_size) { + attr_b->nres.valid_size = cpu_to_le64(valid_size); + mi_b->dirty = true; + } } return err; @@ -2073,7 +2084,7 @@ next_attr: /* Update inode size. 
*/ ni->i_valid = valid_size; - ni->vfs_inode.i_size = data_size; + i_size_write(&ni->vfs_inode, data_size); inode_set_bytes(&ni->vfs_inode, total_size); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); @@ -2488,7 +2499,7 @@ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) mi_b->dirty = true; done: - ni->vfs_inode.i_size += bytes; + i_size_write(&ni->vfs_inode, ni->vfs_inode.i_size + bytes); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index 7c01735d1219..9f4bd8d26090 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -29,7 +29,7 @@ static inline bool al_is_valid_le(const struct ntfs_inode *ni, void al_destroy(struct ntfs_inode *ni) { run_close(&ni->attr_list.run); - kfree(ni->attr_list.le); + kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.size = 0; ni->attr_list.dirty = false; @@ -127,12 +127,13 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, { size_t off; u16 sz; + const unsigned le_min_size = le_size(0); if (!le) { le = ni->attr_list.le; } else { sz = le16_to_cpu(le->size); - if (sz < sizeof(struct ATTR_LIST_ENTRY)) { + if (sz < le_min_size) { /* Impossible 'cause we should not return such le. */ return NULL; } @@ -141,7 +142,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, /* Check boundary. */ off = PtrOffset(ni->attr_list.le, le); - if (off + sizeof(struct ATTR_LIST_ENTRY) > ni->attr_list.size) { + if (off + le_min_size > ni->attr_list.size) { /* The regular end of list. */ return NULL; } @@ -149,8 +150,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, sz = le16_to_cpu(le->size); /* Check le for errors. 
*/ - if (sz < sizeof(struct ATTR_LIST_ENTRY) || - off + sz > ni->attr_list.size || + if (sz < le_min_size || off + sz > ni->attr_list.size || sz < le->name_off + le->name_len * sizeof(short)) { return NULL; } @@ -318,7 +318,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, memcpy(ptr, al->le, off); memcpy(Add2Ptr(ptr, off + sz), le, old_size - off); le = Add2Ptr(ptr, off); - kfree(al->le); + kvfree(al->le); al->le = ptr; } else { memmove(Add2Ptr(le, sz), le, old_size - off); diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 63f14a0232f6..845f9b22deef 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -124,7 +124,7 @@ void wnd_close(struct wnd_bitmap *wnd) { struct rb_node *node, *next; - kfree(wnd->free_bits); + kvfree(wnd->free_bits); wnd->free_bits = NULL; run_close(&wnd->run); @@ -1360,7 +1360,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short)); memset(new_free + wnd->nwnd, 0, (new_wnd - wnd->nwnd) * sizeof(short)); - kfree(wnd->free_bits); + kvfree(wnd->free_bits); wnd->free_bits = new_free; } diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index ec0566b322d5..5cf3d9decf64 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -309,11 +309,31 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, return 0; } - /* NTFS: symlinks are "dir + reparse" or "file + reparse" */ - if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) - dt_type = DT_LNK; - else - dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; + /* + * NTFS: symlinks are "dir + reparse" or "file + reparse" + * Unfortunately reparse attribute is used for many purposes (several dozens). + * It is not possible here to know is this name symlink or not. + * To get exactly the type of name we should to open inode (read mft). + * getattr for opened file (fstat) correctly returns symlink. + */ + dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? 
DT_DIR : DT_REG; + + /* + * It is not reliable to detect the type of name using duplicated information + * stored in parent directory. + * The only correct way to get the type of name - read MFT record and find ATTR_STD. + * The code below is not good idea. + * It does additional locks/reads just to get the type of name. + * Should we use additional mount option to enable branch below? + */ + if ((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) && + ino != ni->mi.rno) { + struct inode *inode = ntfs_iget5(sbi->sb, &e->ref, NULL); + if (!IS_ERR_OR_NULL(inode)) { + dt_type = fs_umode_to_dtype(inode->i_mode); + iput(inode); + } + } return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type); } @@ -495,11 +515,9 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, struct INDEX_HDR *hdr; const struct ATTR_FILE_NAME *fname; u32 e_size, off, end; - u64 vbo = 0; size_t drs = 0, fles = 0, bit = 0; - loff_t i_size = ni->vfs_inode.i_size; struct indx_node *node = NULL; - u8 index_bits = ni->dir.index_bits; + size_t max_indx = i_size_read(&ni->vfs_inode) >> ni->dir.index_bits; if (is_empty) *is_empty = true; @@ -518,8 +536,10 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, e = Add2Ptr(hdr, off); e_size = le16_to_cpu(e->size); if (e_size < sizeof(struct NTFS_DE) || - off + e_size > end) + off + e_size > end) { + /* Looks like corruption. 
*/ break; + } if (de_is_last(e)) break; @@ -543,7 +563,7 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, fles += 1; } - if (vbo >= i_size) + if (bit >= max_indx) goto out; err = indx_used_bit(&ni->dir, ni, &bit); @@ -553,8 +573,7 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, if (bit == MINUS_ONE_T) goto out; - vbo = (u64)bit << index_bits; - if (vbo >= i_size) + if (bit >= max_indx) goto out; err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits, @@ -564,7 +583,6 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, hdr = &node->index->ihdr; bit += 1; - vbo = (u64)bit << ni->dir.idx2vbn_bits; } out: @@ -593,5 +611,9 @@ const struct file_operations ntfs_dir_operations = { .iterate_shared = ntfs_readdir, .fsync = generic_file_fsync, .open = ntfs_file_open, + .unlocked_ioctl = ntfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ntfs_compat_ioctl, +#endif }; // clang-format on diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index a5a30a24ce5d..5418662c80d8 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -48,7 +48,7 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) return 0; } -static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) +long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; @@ -61,7 +61,7 @@ static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) } #ifdef CONFIG_COMPAT -static long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg) +long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg) { return ntfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); @@ -188,6 +188,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) u32 bh_next, bh_off, to; sector_t iblock; struct folio *folio; + bool dirty = false; for (; idx < idx_end; idx += 1, from = 0) { page_off = 
(loff_t)idx << PAGE_SHIFT; @@ -223,29 +224,27 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) /* Ok, it's mapped. Make sure it's up-to-date. */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); - - if (!buffer_uptodate(bh)) { - err = bh_read(bh, 0); - if (err < 0) { - folio_unlock(folio); - folio_put(folio); - goto out; - } + else if (bh_read(bh, 0) < 0) { + err = -EIO; + folio_unlock(folio); + folio_put(folio); + goto out; } mark_buffer_dirty(bh); - } while (bh_off = bh_next, iblock += 1, head != (bh = bh->b_this_page)); folio_zero_segment(folio, from, to); + dirty = true; folio_unlock(folio); folio_put(folio); cond_resched(); } out: - mark_inode_dirty(inode); + if (dirty) + mark_inode_dirty(inode); return err; } @@ -261,6 +260,9 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma) bool rw = vma->vm_flags & VM_WRITE; int err; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "mmap encrypted not supported"); return -EOPNOTSUPP; @@ -499,10 +501,14 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_lock(ni); err = attr_punch_hole(ni, vbo, len, &frame_size); ni_unlock(ni); + if (!err) + goto ok; + if (err != E_NTFS_NOTALIGNED) goto out; /* Process not aligned punch. */ + err = 0; mask = frame_size - 1; vbo_a = (vbo + mask) & ~mask; end_a = end & ~mask; @@ -525,6 +531,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_lock(ni); err = attr_punch_hole(ni, vbo_a, end_a - vbo_a, NULL); ni_unlock(ni); + if (err) + goto out; } } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { /* @@ -564,6 +572,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_lock(ni); err = attr_insert_range(ni, vbo, len); ni_unlock(ni); + if (err) + goto out; } else { /* Check new size. 
*/ u8 cluster_bits = sbi->cluster_bits; @@ -633,11 +643,18 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) &ni->file.run, i_size, &ni->i_valid, true, NULL); ni_unlock(ni); + if (err) + goto out; } else if (new_size > i_size) { - inode->i_size = new_size; + i_size_write(inode, new_size); } } +ok: + err = file_modified(file); + if (err) + goto out; + out: if (map_locked) filemap_invalidate_unlock(mapping); @@ -663,6 +680,9 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode = inode->i_mode; int err; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + err = setattr_prepare(idmap, dentry, attr); if (err) goto out; @@ -676,7 +696,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, goto out; } inode_dio_wait(inode); - oldsize = inode->i_size; + oldsize = i_size_read(inode); newsize = attr->ia_size; if (newsize <= oldsize) @@ -688,7 +708,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, goto out; ni->ni_flags |= NI_FLAG_UPDATE_PARENT; - inode->i_size = newsize; + i_size_write(inode, newsize); } setattr_copy(idmap, inode, attr); @@ -718,6 +738,9 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = file->f_mapping->host; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "encrypted i/o not supported"); return -EOPNOTSUPP; @@ -752,6 +775,9 @@ static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos, struct inode *inode = in->f_mapping->host; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "encrypted i/o not supported"); return -EOPNOTSUPP; @@ -821,7 +847,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) size_t count = iov_iter_count(from); loff_t pos = 
iocb->ki_pos; struct inode *inode = file_inode(file); - loff_t i_size = inode->i_size; + loff_t i_size = i_size_read(inode); struct address_space *mapping = inode->i_mapping; struct ntfs_inode *ni = ntfs_i(inode); u64 valid = ni->i_valid; @@ -1028,6 +1054,8 @@ out: iocb->ki_pos += written; if (iocb->ki_pos > ni->i_valid) ni->i_valid = iocb->ki_pos; + if (iocb->ki_pos > i_size) + i_size_write(inode, iocb->ki_pos); return written; } @@ -1041,8 +1069,12 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; + int err; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "encrypted i/o not supported"); return -EOPNOTSUPP; @@ -1068,6 +1100,12 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret <= 0) goto out; + err = file_modified(iocb->ki_filp); + if (err) { + ret = err; + goto out; + } + if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) { /* Should never be here, see ntfs_file_open(). 
*/ ret = -EOPNOTSUPP; @@ -1097,6 +1135,9 @@ int ntfs_file_open(struct inode *inode, struct file *file) { struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (unlikely((is_compressed(ni) || is_encrypted(ni)) && (file->f_flags & O_DIRECT))) { return -EOPNOTSUPP; @@ -1138,7 +1179,8 @@ static int ntfs_file_release(struct inode *inode, struct file *file) down_write(&ni->file.run_lock); err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, - inode->i_size, &ni->i_valid, false, NULL); + i_size_read(inode), &ni->i_valid, false, + NULL); up_write(&ni->file.run_lock); ni_unlock(ni); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 3df2d9e34b91..7f27382e0ce2 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -778,7 +778,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) run_deallocate(sbi, &ni->attr_list.run, true); run_close(&ni->attr_list.run); ni->attr_list.size = 0; - kfree(ni->attr_list.le); + kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.dirty = false; @@ -927,7 +927,7 @@ int ni_create_attr_list(struct ntfs_inode *ni) return 0; out: - kfree(ni->attr_list.le); + kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.size = 0; return err; @@ -2099,7 +2099,7 @@ int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page) gfp_t gfp_mask; struct page *pg; - if (vbo >= ni->vfs_inode.i_size) { + if (vbo >= i_size_read(&ni->vfs_inode)) { SetPageUptodate(page); err = 0; goto out; @@ -2173,7 +2173,7 @@ int ni_decompress_file(struct ntfs_inode *ni) { struct ntfs_sb_info *sbi = ni->mi.sbi; struct inode *inode = &ni->vfs_inode; - loff_t i_size = inode->i_size; + loff_t i_size = i_size_read(inode); struct address_space *mapping = inode->i_mapping; gfp_t gfp_mask = mapping_gfp_mask(mapping); struct page **pages = NULL; @@ -2508,6 +2508,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, err = -EOPNOTSUPP; goto out1; #else + loff_t 
i_size = i_size_read(&ni->vfs_inode); u32 frame_bits = ni_ext_compress_bits(ni); u64 frame64 = frame_vbo >> frame_bits; u64 frames, vbo_data; @@ -2548,7 +2549,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, } } - frames = (ni->vfs_inode.i_size - 1) >> frame_bits; + frames = (i_size - 1) >> frame_bits; err = attr_wof_frame_info(ni, attr, run, frame64, frames, frame_bits, &ondisk_size, &vbo_data); @@ -2556,8 +2557,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, goto out2; if (frame64 == frames) { - unc_size = 1 + ((ni->vfs_inode.i_size - 1) & - (frame_size - 1)); + unc_size = 1 + ((i_size - 1) & (frame_size - 1)); ondisk_size = attr_size(attr) - vbo_data; } else { unc_size = frame_size; @@ -3259,6 +3259,9 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) if (is_bad_inode(inode) || sb_rdonly(sb)) return 0; + if (unlikely(ntfs3_forced_shutdown(sb))) + return -EIO; + if (!ni_trylock(ni)) { /* 'ni' is under modification, skip for now. */ mark_inode_dirty_sync(inode); @@ -3288,7 +3291,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) modified = true; } - ts = inode_get_mtime(inode); + ts = inode_get_ctime(inode); dup.c_time = kernel2nt(&ts); if (std->c_time != dup.c_time) { std->c_time = dup.c_time; diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index 98ccb6650858..855519713bf7 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -465,7 +465,7 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr) { const struct RESTART_AREA *ra; u16 cl, fl, ul; - u32 off, l_size, file_dat_bits, file_size_round; + u32 off, l_size, seq_bits; u16 ro = le16_to_cpu(rhdr->ra_off); u32 sys_page = le32_to_cpu(rhdr->sys_page_size); @@ -511,13 +511,15 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr) /* Make sure the sequence number bits match the log file size. 
*/ l_size = le64_to_cpu(ra->l_size); - file_dat_bits = sizeof(u64) * 8 - le32_to_cpu(ra->seq_num_bits); - file_size_round = 1u << (file_dat_bits + 3); - if (file_size_round != l_size && - (file_size_round < l_size || (file_size_round / 2) > l_size)) { - return false; + seq_bits = sizeof(u64) * 8 + 3; + while (l_size) { + l_size >>= 1; + seq_bits -= 1; } + if (seq_bits != ra->seq_num_bits) + return false; + /* The log page data offset and record header length must be quad-aligned. */ if (!IS_ALIGNED(le16_to_cpu(ra->data_off), 8) || !IS_ALIGNED(le16_to_cpu(ra->rec_hdr_len), 8)) @@ -974,6 +976,16 @@ skip_looking: return e; } +struct restart_info { + u64 last_lsn; + struct RESTART_HDR *r_page; + u32 vbo; + bool chkdsk_was_run; + bool valid_page; + bool initialized; + bool restart; +}; + #define RESTART_SINGLE_PAGE_IO cpu_to_le16(0x0001) #define NTFSLOG_WRAPPED 0x00000001 @@ -987,6 +999,7 @@ struct ntfs_log { struct ntfs_inode *ni; u32 l_size; + u32 orig_file_size; u32 sys_page_size; u32 sys_page_mask; u32 page_size; @@ -1040,6 +1053,8 @@ struct ntfs_log { struct CLIENT_ID client_id; u32 client_undo_commit; + + struct restart_info rst_info, rst_info2; }; static inline u32 lsn_to_vbo(struct ntfs_log *log, const u64 lsn) @@ -1105,16 +1120,6 @@ static inline bool verify_client_lsn(struct ntfs_log *log, lsn <= le64_to_cpu(log->ra->current_lsn) && lsn; } -struct restart_info { - u64 last_lsn; - struct RESTART_HDR *r_page; - u32 vbo; - bool chkdsk_was_run; - bool valid_page; - bool initialized; - bool restart; -}; - static int read_log_page(struct ntfs_log *log, u32 vbo, struct RECORD_PAGE_HDR **buffer, bool *usa_error) { @@ -1176,7 +1181,7 @@ out: * restart page header. It will stop the first time we find a * valid page header. 
*/ -static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first, +static int log_read_rst(struct ntfs_log *log, bool first, struct restart_info *info) { u32 skip, vbo; @@ -1192,7 +1197,7 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first, } /* Loop continuously until we succeed. */ - for (; vbo < l_size; vbo = 2 * vbo + skip, skip = 0) { + for (; vbo < log->l_size; vbo = 2 * vbo + skip, skip = 0) { bool usa_error; bool brst, bchk; struct RESTART_AREA *ra; @@ -1285,22 +1290,17 @@ check_result: /* * Ilog_init_pg_hdr - Init @log from restart page header. */ -static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size, - u32 page_size, u16 major_ver, u16 minor_ver) +static void log_init_pg_hdr(struct ntfs_log *log, u16 major_ver, u16 minor_ver) { - log->sys_page_size = sys_page_size; - log->sys_page_mask = sys_page_size - 1; - log->page_size = page_size; - log->page_mask = page_size - 1; - log->page_bits = blksize_bits(page_size); + log->sys_page_size = log->page_size; + log->sys_page_mask = log->page_mask; log->clst_per_page = log->page_size >> log->ni->mi.sbi->cluster_bits; if (!log->clst_per_page) log->clst_per_page = 1; - log->first_page = major_ver >= 2 ? - 0x22 * page_size : - ((sys_page_size << 1) + (page_size << 1)); + log->first_page = major_ver >= 2 ? 0x22 * log->page_size : + 4 * log->page_size; log->major_ver = major_ver; log->minor_ver = minor_ver; } @@ -1308,12 +1308,11 @@ static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size, /* * log_create - Init @log in cases when we don't have a restart area to use. */ -static void log_create(struct ntfs_log *log, u32 l_size, const u64 last_lsn, +static void log_create(struct ntfs_log *log, const u64 last_lsn, u32 open_log_count, bool wrapped, bool use_multi_page) { - log->l_size = l_size; /* All file offsets must be quadword aligned. 
*/ - log->file_data_bits = blksize_bits(l_size) - 3; + log->file_data_bits = blksize_bits(log->l_size) - 3; log->seq_num_mask = (8 << log->file_data_bits) - 1; log->seq_num_bits = sizeof(u64) * 8 - log->file_data_bits; log->seq_num = (last_lsn >> log->file_data_bits) + 2; @@ -3720,10 +3719,8 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) struct ntfs_sb_info *sbi = ni->mi.sbi; struct ntfs_log *log; - struct restart_info rst_info, rst_info2; - u64 rec_lsn, ra_lsn, checkpt_lsn = 0, rlsn = 0; + u64 rec_lsn, checkpt_lsn = 0, rlsn = 0; struct ATTR_NAME_ENTRY *attr_names = NULL; - struct ATTR_NAME_ENTRY *ane; struct RESTART_TABLE *dptbl = NULL; struct RESTART_TABLE *trtbl = NULL; const struct RESTART_TABLE *rt; @@ -3741,9 +3738,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) struct TRANSACTION_ENTRY *tr; struct DIR_PAGE_ENTRY *dp; u32 i, bytes_per_attr_entry; - u32 l_size = ni->vfs_inode.i_size; - u32 orig_file_size = l_size; - u32 page_size, vbo, tail, off, dlen; + u32 vbo, tail, off, dlen; u32 saved_len, rec_len, transact_id; bool use_second_page; struct RESTART_AREA *ra2, *ra = NULL; @@ -3758,52 +3753,50 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) u16 t16; u32 t32; - /* Get the size of page. NOTE: To replay we can use default page. */ -#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2 - page_size = norm_file_page(PAGE_SIZE, &l_size, true); -#else - page_size = norm_file_page(PAGE_SIZE, &l_size, false); -#endif - if (!page_size) - return -EINVAL; - log = kzalloc(sizeof(struct ntfs_log), GFP_NOFS); if (!log) return -ENOMEM; log->ni = ni; - log->l_size = l_size; - log->one_page_buf = kmalloc(page_size, GFP_NOFS); + log->l_size = log->orig_file_size = ni->vfs_inode.i_size; + /* Get the size of page. NOTE: To replay we can use default page. 
*/ +#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2 + log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, true); +#else + log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, false); +#endif + if (!log->page_size) { + err = -EINVAL; + goto out; + } + + log->one_page_buf = kmalloc(log->page_size, GFP_NOFS); if (!log->one_page_buf) { err = -ENOMEM; goto out; } - log->page_size = page_size; - log->page_mask = page_size - 1; - log->page_bits = blksize_bits(page_size); + log->page_mask = log->page_size - 1; + log->page_bits = blksize_bits(log->page_size); /* Look for a restart area on the disk. */ - memset(&rst_info, 0, sizeof(struct restart_info)); - err = log_read_rst(log, l_size, true, &rst_info); + err = log_read_rst(log, true, &log->rst_info); if (err) goto out; /* remember 'initialized' */ - *initialized = rst_info.initialized; + *initialized = log->rst_info.initialized; - if (!rst_info.restart) { - if (rst_info.initialized) { + if (!log->rst_info.restart) { + if (log->rst_info.initialized) { /* No restart area but the file is not initialized. */ err = -EINVAL; goto out; } - log_init_pg_hdr(log, page_size, page_size, 1, 1); - log_create(log, l_size, 0, get_random_u32(), false, false); - - log->ra = ra; + log_init_pg_hdr(log, 1, 1); + log_create(log, 0, get_random_u32(), false, false); ra = log_create_ra(log); if (!ra) { @@ -3820,25 +3813,26 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) * If the restart offset above wasn't zero then we won't * look for a second restart. */ - if (rst_info.vbo) + if (log->rst_info.vbo) goto check_restart_area; - memset(&rst_info2, 0, sizeof(struct restart_info)); - err = log_read_rst(log, l_size, false, &rst_info2); + err = log_read_rst(log, false, &log->rst_info2); if (err) goto out; /* Determine which restart area to use. 
*/ - if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn) + if (!log->rst_info2.restart || + log->rst_info2.last_lsn <= log->rst_info.last_lsn) goto use_first_page; use_second_page = true; - if (rst_info.chkdsk_was_run && page_size != rst_info.vbo) { + if (log->rst_info.chkdsk_was_run && + log->page_size != log->rst_info.vbo) { struct RECORD_PAGE_HDR *sp = NULL; bool usa_error; - if (!read_log_page(log, page_size, &sp, &usa_error) && + if (!read_log_page(log, log->page_size, &sp, &usa_error) && sp->rhdr.sign == NTFS_CHKD_SIGNATURE) { use_second_page = false; } @@ -3846,52 +3840,43 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) } if (use_second_page) { - kfree(rst_info.r_page); - memcpy(&rst_info, &rst_info2, sizeof(struct restart_info)); - rst_info2.r_page = NULL; + kfree(log->rst_info.r_page); + memcpy(&log->rst_info, &log->rst_info2, + sizeof(struct restart_info)); + log->rst_info2.r_page = NULL; } use_first_page: - kfree(rst_info2.r_page); + kfree(log->rst_info2.r_page); check_restart_area: /* * If the restart area is at offset 0, we want * to write the second restart area first. */ - log->init_ra = !!rst_info.vbo; + log->init_ra = !!log->rst_info.vbo; /* If we have a valid page then grab a pointer to the restart area. */ - ra2 = rst_info.valid_page ? - Add2Ptr(rst_info.r_page, - le16_to_cpu(rst_info.r_page->ra_off)) : + ra2 = log->rst_info.valid_page ? + Add2Ptr(log->rst_info.r_page, + le16_to_cpu(log->rst_info.r_page->ra_off)) : NULL; - if (rst_info.chkdsk_was_run || + if (log->rst_info.chkdsk_was_run || (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) { bool wrapped = false; bool use_multi_page = false; u32 open_log_count; /* Do some checks based on whether we have a valid log page. 
*/ - if (!rst_info.valid_page) { - open_log_count = get_random_u32(); - goto init_log_instance; - } - open_log_count = le32_to_cpu(ra2->open_log_count); - - /* - * If the restart page size isn't changing then we want to - * check how much work we need to do. - */ - if (page_size != le32_to_cpu(rst_info.r_page->sys_page_size)) - goto init_log_instance; + open_log_count = log->rst_info.valid_page ? + le32_to_cpu(ra2->open_log_count) : + get_random_u32(); -init_log_instance: - log_init_pg_hdr(log, page_size, page_size, 1, 1); + log_init_pg_hdr(log, 1, 1); - log_create(log, l_size, rst_info.last_lsn, open_log_count, - wrapped, use_multi_page); + log_create(log, log->rst_info.last_lsn, open_log_count, wrapped, + use_multi_page); ra = log_create_ra(log); if (!ra) { @@ -3916,28 +3901,27 @@ init_log_instance: * use the log file. We must use the system page size instead of the * default size if there is not a clean shutdown. */ - t32 = le32_to_cpu(rst_info.r_page->sys_page_size); - if (page_size != t32) { - l_size = orig_file_size; - page_size = - norm_file_page(t32, &l_size, t32 == DefaultLogPageSize); + t32 = le32_to_cpu(log->rst_info.r_page->sys_page_size); + if (log->page_size != t32) { + log->l_size = log->orig_file_size; + log->page_size = norm_file_page(t32, &log->l_size, + t32 == DefaultLogPageSize); } - if (page_size != t32 || - page_size != le32_to_cpu(rst_info.r_page->page_size)) { + if (log->page_size != t32 || + log->page_size != le32_to_cpu(log->rst_info.r_page->page_size)) { err = -EINVAL; goto out; } /* If the file size has shrunk then we won't mount it. 
*/ - if (l_size < le64_to_cpu(ra2->l_size)) { + if (log->l_size < le64_to_cpu(ra2->l_size)) { err = -EINVAL; goto out; } - log_init_pg_hdr(log, page_size, page_size, - le16_to_cpu(rst_info.r_page->major_ver), - le16_to_cpu(rst_info.r_page->minor_ver)); + log_init_pg_hdr(log, le16_to_cpu(log->rst_info.r_page->major_ver), + le16_to_cpu(log->rst_info.r_page->minor_ver)); log->l_size = le64_to_cpu(ra2->l_size); log->seq_num_bits = le32_to_cpu(ra2->seq_num_bits); @@ -3945,7 +3929,7 @@ init_log_instance: log->seq_num_mask = (8 << log->file_data_bits) - 1; log->last_lsn = le64_to_cpu(ra2->current_lsn); log->seq_num = log->last_lsn >> log->file_data_bits; - log->ra_off = le16_to_cpu(rst_info.r_page->ra_off); + log->ra_off = le16_to_cpu(log->rst_info.r_page->ra_off); log->restart_size = log->sys_page_size - log->ra_off; log->record_header_len = le16_to_cpu(ra2->rec_hdr_len); log->ra_size = le16_to_cpu(ra2->ra_len); @@ -4045,7 +4029,7 @@ find_oldest: log->current_avail = current_log_avail(log); /* Remember which restart area to write first. */ - log->init_ra = rst_info.vbo; + log->init_ra = log->rst_info.vbo; process_log: /* 1.0, 1.1, 2.0 log->major_ver/minor_ver - short values. */ @@ -4105,7 +4089,7 @@ process_log: log->client_id.seq_num = cr->seq_num; log->client_id.client_idx = client; - err = read_rst_area(log, &rst, &ra_lsn); + err = read_rst_area(log, &rst, &checkpt_lsn); if (err) goto out; @@ -4114,9 +4098,8 @@ process_log: bytes_per_attr_entry = !rst->major_ver ? 0x2C : 0x28; - checkpt_lsn = le64_to_cpu(rst->check_point_start); - if (!checkpt_lsn) - checkpt_lsn = ra_lsn; + if (rst->check_point_start) + checkpt_lsn = le64_to_cpu(rst->check_point_start); /* Allocate and Read the Transaction Table. 
*/ if (!rst->transact_table_len) @@ -4330,23 +4313,20 @@ check_attr_table: lcb = NULL; check_attribute_names2: - if (!rst->attr_names_len) - goto trace_attribute_table; - - ane = attr_names; - if (!oatbl) - goto trace_attribute_table; - while (ane->off) { - /* TODO: Clear table on exit! */ - oe = Add2Ptr(oatbl, le16_to_cpu(ane->off)); - t16 = le16_to_cpu(ane->name_bytes); - oe->name_len = t16 / sizeof(short); - oe->ptr = ane->name; - oe->is_attr_name = 2; - ane = Add2Ptr(ane, sizeof(struct ATTR_NAME_ENTRY) + t16); - } - -trace_attribute_table: + if (rst->attr_names_len && oatbl) { + struct ATTR_NAME_ENTRY *ane = attr_names; + while (ane->off) { + /* TODO: Clear table on exit! */ + oe = Add2Ptr(oatbl, le16_to_cpu(ane->off)); + t16 = le16_to_cpu(ane->name_bytes); + oe->name_len = t16 / sizeof(short); + oe->ptr = ane->name; + oe->is_attr_name = 2; + ane = Add2Ptr(ane, + sizeof(struct ATTR_NAME_ENTRY) + t16); + } + } + /* * If the checkpt_lsn is zero, then this is a freshly * formatted disk and we have no work to do. @@ -5189,7 +5169,7 @@ out: kfree(oatbl); kfree(dptbl); kfree(attr_names); - kfree(rst_info.r_page); + kfree(log->rst_info.r_page); kfree(ra); kfree(log->one_page_buf); diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index fbfe21dbb425..ae2ef5c11868 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -853,7 +853,8 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) /* * sb can be NULL here. In this case sbi->flags should be 0 too. */ - if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR)) + if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR) || + unlikely(ntfs3_forced_shutdown(sb))) return; blocksize = sb->s_blocksize; @@ -1006,6 +1007,30 @@ static inline __le32 security_hash(const void *sd, size_t bytes) return cpu_to_le32(hash); } +/* + * simple wrapper for sb_bread_unmovable. 
+ */ +struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block) +{ + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct buffer_head *bh; + + if (unlikely(block >= sbi->volume.blocks)) { + /* prevent generic message "attempt to access beyond end of device" */ + ntfs_err(sb, "try to read out of volume at offset 0x%llx", + (u64)block << sb->s_blocksize_bits); + return NULL; + } + + bh = sb_bread_unmovable(sb, block); + if (bh) + return bh; + + ntfs_err(sb, "failed to read volume at offset 0x%llx", + (u64)block << sb->s_blocksize_bits); + return NULL; +} + int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer) { struct block_device *bdev = sb->s_bdev; @@ -2128,8 +2153,8 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, if (le32_to_cpu(d_security->size) == new_sec_size && d_security->key.hash == hash_key.hash && !memcmp(d_security + 1, sd, size_sd)) { - *security_id = d_security->key.sec_id; /* Such security already exists. */ + *security_id = d_security->key.sec_id; err = 0; goto out; } diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index cf92b2433f7a..daabaad63aaf 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -1462,7 +1462,7 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, goto out2; if (in->name == I30_NAME) { - ni->vfs_inode.i_size = data_size; + i_size_write(&ni->vfs_inode, data_size); inode_set_bytes(&ni->vfs_inode, alloc_size); } @@ -1544,7 +1544,7 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, } if (in->name == I30_NAME) - ni->vfs_inode.i_size = data_size; + i_size_write(&ni->vfs_inode, data_size); *vbn = bit << indx->idx2vbn_bits; @@ -2090,7 +2090,7 @@ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni, return err; if (in->name == I30_NAME) - ni->vfs_inode.i_size = new_data; + i_size_write(&ni->vfs_inode, new_data); bpb = bitmap_size(bit); if (bpb * 8 == nbits) @@ -2576,7 +2576,7 @@ int indx_delete_entry(struct ntfs_index 
*indx, struct ntfs_inode *ni, err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, &indx->alloc_run, 0, NULL, false, NULL); if (in->name == I30_NAME) - ni->vfs_inode.i_size = 0; + i_size_write(&ni->vfs_inode, 0); err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len, false, NULL); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 5e3d71374918..eb7a8c9fba01 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -345,9 +345,7 @@ next_attr: inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer .PrintNameLength) / sizeof(u16); - ni->i_valid = inode->i_size; - /* Clear directory bit. */ if (ni->ni_flags & NI_FLAG_DIR) { indx_clear(&ni->dir); @@ -412,7 +410,6 @@ end_enum: goto out; if (!is_match && name) { - /* Reuse rec as buffer for ascii name. */ err = -ENOENT; goto out; } @@ -427,6 +424,7 @@ end_enum: if (names != le16_to_cpu(rec->hard_links)) { /* Correct minor error on the fly. Do not mark inode as dirty. */ + ntfs_inode_warn(inode, "Correct links count -> %u.", names); rec->hard_links = cpu_to_le16(names); ni->mi.dirty = true; } @@ -653,9 +651,10 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, off = vbo & (PAGE_SIZE - 1); folio_set_bh(bh, folio, off); - err = bh_read(bh, 0); - if (err < 0) + if (bh_read(bh, 0) < 0) { + err = -EIO; goto out; + } folio_zero_segment(folio, off + voff, off + block_size); } } @@ -853,9 +852,13 @@ static int ntfs_resident_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; - struct ntfs_inode *ni = ntfs_i(mapping->host); + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); int ret; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + ni_lock(ni); ret = attr_data_write_resident(ni, &folio->page); ni_unlock(ni); @@ -869,7 +872,12 @@ static int ntfs_resident_writepage(struct folio *folio, static int ntfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - if 
(is_resident(ntfs_i(mapping->host))) + struct inode *inode = mapping->host; + + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + + if (is_resident(ntfs_i(inode))) return write_cache_pages(mapping, wbc, ntfs_resident_writepage, mapping); return mpage_writepages(mapping, wbc, ntfs_get_block); @@ -889,6 +897,9 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + *pagep = NULL; if (is_resident(ni)) { struct page *page = @@ -974,7 +985,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, } if (pos + err > inode->i_size) { - inode->i_size = pos + err; + i_size_write(inode, pos + err); dirty = true; } @@ -1306,6 +1317,11 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, goto out1; } + if (unlikely(ntfs3_forced_shutdown(sb))) { + err = -EIO; + goto out2; + } + /* Mark rw ntfs as dirty. it will be cleared at umount. 
*/ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index ee3093be5170..084d19d78397 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -181,6 +181,9 @@ static int ntfs_unlink(struct inode *dir, struct dentry *dentry) struct ntfs_inode *ni = ntfs_i(dir); int err; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) + return -EIO; + ni_lock_dir(ni); err = ntfs_unlink_inode(dir, dentry); @@ -199,6 +202,9 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir, u32 size = strlen(symname); struct inode *inode; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) + return -EIO; + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0, symname, size, NULL); @@ -227,6 +233,9 @@ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry) struct ntfs_inode *ni = ntfs_i(dir); int err; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) + return -EIO; + ni_lock_dir(ni); err = ntfs_unlink_inode(dir, dentry); @@ -264,6 +273,9 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir, 1024); static_assert(PATH_MAX >= 4 * 1024); + if (unlikely(ntfs3_forced_shutdown(sb))) + return -EIO; + if (flags & ~RENAME_NOREPLACE) return -EINVAL; @@ -419,7 +431,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, * fnd contains tree's path to insert to. * If fnd is not NULL then dir is locked. */ - inode = ntfs_create_inode(mnt_idmap(file->f_path.mnt), dir, dentry, uni, + inode = ntfs_create_inode(file_mnt_idmap(file), dir, dentry, uni, mode, 0, NULL, 0, fnd); err = IS_ERR(inode) ? PTR_ERR(inode) : finish_open(file, dentry, ntfs_file_open); diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 86aecbb01a92..9c7478150a03 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -523,12 +523,10 @@ struct ATTR_LIST_ENTRY { __le64 vcn; // 0x08: Starting VCN of this attribute. struct MFT_REF ref; // 0x10: MFT record number with attribute. __le16 id; // 0x18: struct ATTRIB ID. 
- __le16 name[3]; // 0x1A: Just to align. To get real name can use bNameOffset. + __le16 name[]; // 0x1A: To get real name use name_off. }; // sizeof(0x20) -static_assert(sizeof(struct ATTR_LIST_ENTRY) == 0x20); - static inline u32 le_size(u8 name_len) { return ALIGN(offsetof(struct ATTR_LIST_ENTRY, name) + diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index f6706143d14b..79356fd29a14 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -61,6 +61,8 @@ enum utf16_endian; /* sbi->flags */ #define NTFS_FLAGS_NODISCARD 0x00000001 +/* ntfs in shutdown state. */ +#define NTFS_FLAGS_SHUTDOWN_BIT 0x00000002 /* == 4*/ /* Set when LogFile is replaying. */ #define NTFS_FLAGS_LOG_REPLAYING 0x00000008 /* Set when we changed first MFT's which copy must be updated in $MftMirr. */ @@ -226,7 +228,7 @@ struct ntfs_sb_info { u64 maxbytes; // Maximum size for normal files. u64 maxbytes_sparse; // Maximum size for sparse file. - u32 flags; // See NTFS_FLAGS_XXX. + unsigned long flags; // See NTFS_FLAGS_ CLST zone_max; // Maximum MFT zone length in clusters CLST bad_clusters; // The count of marked bad clusters. 
@@ -473,7 +475,7 @@ bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, int al_update(struct ntfs_inode *ni, int sync); static inline size_t al_aligned(size_t size) { - return (size + 1023) & ~(size_t)1023; + return size_add(size, 1023) & ~(size_t)1023; } /* Globals from bitfunc.c */ @@ -500,6 +502,8 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg); +long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg); extern const struct inode_operations ntfs_special_inode_operations; extern const struct inode_operations ntfs_file_inode_operations; extern const struct file_operations ntfs_file_operations; @@ -584,6 +588,7 @@ bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes); int log_replay(struct ntfs_inode *ni, bool *initialized); /* Globals from fsntfs.c */ +struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block); bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes); int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, bool simple); @@ -872,7 +877,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); -extern const struct xattr_handler * const ntfs_xattr_handlers[]; +extern const struct xattr_handler *const ntfs_xattr_handlers[]; int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size); void ntfs_get_wsl_perm(struct inode *inode); @@ -999,6 +1004,11 @@ static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb) return sb->s_fs_info; } +static inline int ntfs3_forced_shutdown(struct super_block *sb) +{ + return test_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags); +} + /* * 
ntfs_up_cluster - Align up on cluster boundary. */ @@ -1025,19 +1035,6 @@ static inline u64 bytes_to_block(const struct super_block *sb, u64 size) return (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; } -static inline struct buffer_head *ntfs_bread(struct super_block *sb, - sector_t block) -{ - struct buffer_head *bh = sb_bread(sb, block); - - if (bh) - return bh; - - ntfs_err(sb, "failed to read volume at offset 0x%llx", - (u64)block << sb->s_blocksize_bits); - return NULL; -} - static inline struct ntfs_inode *ntfs_i(struct inode *inode) { return container_of(inode, struct ntfs_inode, vfs_inode); diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 53629b1f65e9..6aa3a9d44df1 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -279,7 +279,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (t16 > asize) return NULL; - if (t16 + le32_to_cpu(attr->res.data_size) > asize) + if (le32_to_cpu(attr->res.data_size) > asize - t16) return NULL; t32 = sizeof(short) * attr->name_len; @@ -535,8 +535,20 @@ bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, return false; if (ni && is_attr_indexed(attr)) { - le16_add_cpu(&ni->mi.mrec->hard_links, -1); - ni->mi.dirty = true; + u16 links = le16_to_cpu(ni->mi.mrec->hard_links); + struct ATTR_FILE_NAME *fname = + attr->type != ATTR_NAME ? + NULL : + resident_data_ex(attr, + SIZEOF_ATTRIBUTE_FILENAME); + if (fname && fname->type == FILE_NAME_DOS) { + /* Do not decrease links count deleting DOS name. */ + } else if (!links) { + /* minor error. Not critical. */ + } else { + ni->mi.mrec->hard_links = cpu_to_le16(links - 1); + ni->mi.dirty = true; + } } used -= asize; diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 9153dffde950..cef5467fd928 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -122,13 +122,12 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) 
if (name) { struct dentry *de = d_find_alias(inode); - const u32 name_len = ARRAY_SIZE(s_name_buf) - 1; if (de) { spin_lock(&de->d_lock); - snprintf(name, name_len, " \"%s\"", de->d_name.name); + snprintf(name, sizeof(s_name_buf), " \"%s\"", + de->d_name.name); spin_unlock(&de->d_lock); - name[name_len] = 0; /* To be sure. */ } else { name[0] = 0; } @@ -625,7 +624,7 @@ static void ntfs3_free_sbi(struct ntfs_sb_info *sbi) { kfree(sbi->new_rec); kvfree(ntfs_put_shared(sbi->upcase)); - kfree(sbi->def_table); + kvfree(sbi->def_table); kfree(sbi->compress.lznt); #ifdef CONFIG_NTFS3_LZX_XPRESS xpress_free_decompressor(sbi->compress.xpress); @@ -715,6 +714,14 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) } /* + * ntfs_shutdown - super_operations::shutdown + */ +static void ntfs_shutdown(struct super_block *sb) +{ + set_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags); +} + +/* * ntfs_sync_fs - super_operations::sync_fs */ static int ntfs_sync_fs(struct super_block *sb, int wait) @@ -724,6 +731,9 @@ static int ntfs_sync_fs(struct super_block *sb, int wait) struct ntfs_inode *ni; struct inode *inode; + if (unlikely(ntfs3_forced_shutdown(sb))) + return -EIO; + ni = sbi->security.ni; if (ni) { inode = &ni->vfs_inode; @@ -763,6 +773,7 @@ static const struct super_operations ntfs_sops = { .put_super = ntfs_put_super, .statfs = ntfs_statfs, .show_options = ntfs_show_options, + .shutdown = ntfs_shutdown, .sync_fs = ntfs_sync_fs, .write_inode = ntfs3_write_inode, }; @@ -866,6 +877,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, u16 fn, ao; u8 cluster_bits; u32 boot_off = 0; + sector_t boot_block = 0; const char *hint = "Primary boot"; /* Save original dev_size. Used with alternative boot. 
*/ @@ -873,11 +885,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, sbi->volume.blocks = dev_size >> PAGE_SHIFT; - bh = ntfs_bread(sb, 0); +read_boot: + bh = ntfs_bread(sb, boot_block); if (!bh) - return -EIO; + return boot_block ? -EINVAL : -EIO; -check_boot: err = -EINVAL; /* Corrupted image; do not read OOB */ @@ -1108,26 +1120,24 @@ check_boot: } out: - if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) { + brelse(bh); + + if (err == -EINVAL && !boot_block && dev_size0 > PAGE_SHIFT) { u32 block_size = min_t(u32, sector_size, PAGE_SIZE); u64 lbo = dev_size0 - sizeof(*boot); - /* - * Try alternative boot (last sector) - */ - brelse(bh); - - sb_set_blocksize(sb, block_size); - bh = ntfs_bread(sb, lbo >> blksize_bits(block_size)); - if (!bh) - return -EINVAL; - + boot_block = lbo >> blksize_bits(block_size); boot_off = lbo & (block_size - 1); - hint = "Alternative boot"; - dev_size = dev_size0; /* restore original size. */ - goto check_boot; + if (boot_block && block_size >= boot_off + sizeof(*boot)) { + /* + * Try alternative boot (last sector) + */ + sb_set_blocksize(sb, block_size); + hint = "Alternative boot"; + dev_size = dev_size0; /* restore original size. */ + goto read_boot; + } } - brelse(bh); return err; } diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 4274b6f31cfa..53e7d1fa036a 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -219,6 +219,9 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, if (!ea->name_len) break; + if (ea->name_len > ea_size) + break; + if (buffer) { /* Check if we can use field ea->name */ if (off + ea_size > size) @@ -744,6 +747,9 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, int err; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + /* Dispatch request. 
*/ if (!strcmp(name, SYSTEM_DOS_ATTRIB)) { /* system.dos_attrib */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4d7efefa98c5..1bde1281d514 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -213,7 +213,7 @@ struct o2hb_region { unsigned int hr_num_pages; struct page **hr_slot_data; - struct bdev_handle *hr_bdev_handle; + struct file *hr_bdev_file; struct o2hb_disk_slot *hr_slots; /* live node map of this region */ @@ -263,7 +263,7 @@ struct o2hb_region { static inline struct block_device *reg_bdev(struct o2hb_region *reg) { - return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL; + return reg->hr_bdev_file ? file_bdev(reg->hr_bdev_file) : NULL; } struct o2hb_bio_wait_ctxt { @@ -1509,8 +1509,8 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slot_data); } - if (reg->hr_bdev_handle) - bdev_release(reg->hr_bdev_handle); + if (reg->hr_bdev_file) + fput(reg->hr_bdev_file); kfree(reg->hr_slots); @@ -1569,7 +1569,7 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item, unsigned long block_bytes; unsigned int block_bits; - if (reg->hr_bdev_handle) + if (reg->hr_bdev_file) return -EINVAL; status = o2hb_read_block_input(reg, page, &block_bytes, @@ -1598,7 +1598,7 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item, char *p = (char *)page; ssize_t ret; - if (reg->hr_bdev_handle) + if (reg->hr_bdev_file) return -EINVAL; ret = kstrtoull(p, 0, &tmp); @@ -1623,7 +1623,7 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item, unsigned long tmp; char *p = (char *)page; - if (reg->hr_bdev_handle) + if (reg->hr_bdev_file) return -EINVAL; tmp = simple_strtoul(p, &p, 0); @@ -1642,7 +1642,7 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) { unsigned int ret = 0; - if (to_o2hb_region(item)->hr_bdev_handle) + if (to_o2hb_region(item)->hr_bdev_file) ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item))); 
return ret; @@ -1753,7 +1753,7 @@ out: } /* - * this is acting as commit; we set up all of hr_bdev_handle and hr_task or + * this is acting as commit; we set up all of hr_bdev_file and hr_task or * nothing */ static ssize_t o2hb_region_dev_store(struct config_item *item, @@ -1769,7 +1769,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, ssize_t ret = -EINVAL; int live_threshold; - if (reg->hr_bdev_handle) + if (reg->hr_bdev_file) goto out; /* We can't heartbeat without having had our node number @@ -1795,11 +1795,11 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, if (!S_ISBLK(f.file->f_mapping->host->i_mode)) goto out2; - reg->hr_bdev_handle = bdev_open_by_dev(f.file->f_mapping->host->i_rdev, + reg->hr_bdev_file = bdev_file_open_by_dev(f.file->f_mapping->host->i_rdev, BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL); - if (IS_ERR(reg->hr_bdev_handle)) { - ret = PTR_ERR(reg->hr_bdev_handle); - reg->hr_bdev_handle = NULL; + if (IS_ERR(reg->hr_bdev_file)) { + ret = PTR_ERR(reg->hr_bdev_file); + reg->hr_bdev_file = NULL; goto out2; } @@ -1903,8 +1903,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, out3: if (ret < 0) { - bdev_release(reg->hr_bdev_handle); - reg->hr_bdev_handle = NULL; + fput(reg->hr_bdev_file); + reg->hr_bdev_file = NULL; } out2: fdput(f); diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index f37174e79fad..6de944818c56 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -27,7 +27,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, struct ocfs2_file_private *fp = file->private_data; struct ocfs2_lock_res *lockres = &fp->fp_flock; - if (fl->fl_type == F_WRLCK) + if (lock_is_write(fl)) level = 1; if (!IS_SETLKW(cmd)) trylock = 1; @@ -53,8 +53,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, */ locks_init_lock(&request); - request.fl_type = F_UNLCK; - request.fl_flags = FL_FLOCK; + request.c.flc_type = F_UNLCK; + request.c.flc_flags = FL_FLOCK; 
locks_lock_file_wait(file, &request); ocfs2_file_unlock(file); @@ -100,14 +100,14 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || ocfs2_mount_local(osb)) return locks_lock_file_wait(file, fl); - if (fl->fl_type == F_UNLCK) + if (lock_is_unlock(fl)) return ocfs2_do_funlock(file, cmd, fl); else return ocfs2_do_flock(file, inode, cmd, fl); @@ -118,7 +118,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!(fl->fl_flags & FL_POSIX)) + if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 9b76ee66aeb2..c11406cd87a8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -744,7 +744,7 @@ static int user_plock(struct ocfs2_cluster_connection *conn, return dlm_posix_cancel(conn->cc_lockspace, ino, file, fl); else if (IS_GETLK(cmd)) return dlm_posix_get(conn->cc_lockspace, ino, file, fl); - else if (fl->fl_type == F_UNLCK) + else if (lock_is_unlock(fl)) return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); else return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 6b906424902b..a70aff17d455 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2027,8 +2027,8 @@ static int ocfs2_initialize_super(struct super_block *sb, cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); - memcpy(&sb->s_uuid, di->id2.i_super.s_uuid, - sizeof(di->id2.i_super.s_uuid)); + super_set_uuid(sb, 
di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid)); osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; diff --git a/fs/open.c b/fs/open.c index a84d21e55c39..a7d4bb2c725f 100644 --- a/fs/open.c +++ b/fs/open.c @@ -154,49 +154,52 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length } #endif -long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +long do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; - struct fd f; int error; - error = -EINVAL; - if (length < 0) - goto out; - error = -EBADF; - f = fdget(fd); - if (!f.file) - goto out; - /* explicitly opened as large or we are on 64-bit box */ - if (f.file->f_flags & O_LARGEFILE) + if (file->f_flags & O_LARGEFILE) small = 0; - dentry = f.file->f_path.dentry; + dentry = file->f_path.dentry; inode = dentry->d_inode; - error = -EINVAL; - if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) - goto out_putf; + if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) + return -EINVAL; - error = -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) - goto out_putf; + return -EINVAL; - error = -EPERM; /* Check IS_APPEND on real upper inode */ - if (IS_APPEND(file_inode(f.file))) - goto out_putf; + if (IS_APPEND(file_inode(file))) + return -EPERM; sb_start_write(inode->i_sb); - error = security_file_truncate(f.file); + error = security_file_truncate(file); if (!error) - error = do_truncate(file_mnt_idmap(f.file), dentry, length, - ATTR_MTIME | ATTR_CTIME, f.file); + error = do_truncate(file_mnt_idmap(file), dentry, length, + ATTR_MTIME | ATTR_CTIME, file); sb_end_write(inode->i_sb); -out_putf: + + return error; +} + +long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +{ + struct fd f; + int error; + + if (length < 0) + return -EINVAL; + f = fdget(fd); + if (!f.file) + return -EBADF; + + error = do_ftruncate(f.file, length, small); + fdput(f); 
-out: return error; } @@ -1364,7 +1367,7 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) { struct filename *name = getname_kernel(filename); struct file *file = ERR_CAST(name); - + if (!IS_ERR(name)) { file = file_open_name(name, flags, mode); putname(name); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index c4b65a6d41cc..4a0779e3ef79 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -446,7 +446,7 @@ static int __init init_openprom_fs(void) sizeof(struct op_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | SLAB_ACCOUNT), + SLAB_ACCOUNT), op_inode_init_once); if (!op_inode_cachep) return -ENOMEM; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index b8e25ca51016..8586e2f5d243 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -265,20 +265,18 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, if (IS_ERR(old_file)) return PTR_ERR(old_file); + /* Try to use clone_file_range to clone up within the same fs */ + cloned = vfs_clone_file_range(old_file, 0, new_file, 0, len, 0); + if (cloned == len) + goto out_fput; + + /* Couldn't clone, so now we try to copy the data */ error = rw_verify_area(READ, old_file, &old_pos, len); if (!error) error = rw_verify_area(WRITE, new_file, &new_pos, len); if (error) goto out_fput; - /* Try to use clone_file_range to clone up within the same fs */ - ovl_start_write(dentry); - cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); - ovl_end_write(dentry); - if (cloned == len) - goto out_fput; - /* Couldn't clone, so now we try to copy the data */ - /* Check if lower fs supports seek operation */ if (old_file->f_mode & FMODE_LSEEK) skip_hole = true; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 984ffdaeed6c..5764f91d283e 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -18,10 +18,11 @@ struct ovl_lookup_data { struct super_block *sb; - struct vfsmount *mnt; + const struct ovl_layer 
*layer; struct qstr name; bool is_dir; bool opaque; + bool xwhiteouts; bool stop; bool last; char *redirect; @@ -201,17 +202,13 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, return real; } -static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path) -{ - return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE); -} - static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, const char *name, struct dentry *base, int len, bool drop_negative) { - struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->mnt), name, base, len); + struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name, + base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { @@ -232,10 +229,13 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, size_t prelen, const char *post, struct dentry **ret, bool drop_negative) { + struct ovl_fs *ofs = OVL_FS(d->sb); struct dentry *this; struct path path; int err; bool last_element = !post[0]; + bool is_upper = d->layer->idx == 0; + char val; this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative); if (IS_ERR(this)) { @@ -253,8 +253,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, } path.dentry = this; - path.mnt = d->mnt; - if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) { + path.mnt = d->layer->mnt; + if (ovl_path_is_whiteout(ofs, &path)) { d->stop = d->opaque = true; goto put_and_out; } @@ -272,7 +272,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, d->stop = true; goto put_and_out; } - err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path, NULL); + err = ovl_check_metacopy_xattr(ofs, &path, NULL); if (err < 0) goto out_err; @@ -292,7 +292,12 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, if (d->last) goto out; - if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) { + 
/* overlay.opaque=x means xwhiteouts directory */ + val = ovl_get_opaquedir_val(ofs, &path); + if (last_element && !is_upper && val == 'x') { + d->xwhiteouts = true; + ovl_layer_set_xwhiteouts(ofs, d->layer); + } else if (val == 'y') { d->stop = true; if (last_element) d->opaque = true; @@ -863,7 +868,8 @@ fail: * Returns next layer in stack starting from top. * Returns -1 if this is the last layer. */ -int ovl_path_next(int idx, struct dentry *dentry, struct path *path) +int ovl_path_next(int idx, struct dentry *dentry, struct path *path, + const struct ovl_layer **layer) { struct ovl_entry *oe = OVL_E(dentry); struct ovl_path *lowerstack = ovl_lowerstack(oe); @@ -871,13 +877,16 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) BUG_ON(idx < 0); if (idx == 0) { ovl_path_upper(dentry, path); - if (path->dentry) + if (path->dentry) { + *layer = &OVL_FS(dentry->d_sb)->layers[0]; return ovl_numlower(oe) ? 1 : -1; + } idx++; } BUG_ON(idx > ovl_numlower(oe)); path->dentry = lowerstack[idx - 1].dentry; - path->mnt = lowerstack[idx - 1].layer->mnt; + *layer = lowerstack[idx - 1].layer; + path->mnt = (*layer)->mnt; return (idx < ovl_numlower(oe)) ? 
idx + 1 : -1; } @@ -1055,7 +1064,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_dentry_upper(dentry->d_parent); if (upperdir) { - d.mnt = ovl_upper_mnt(ofs); + d.layer = &ofs->layers[0]; err = ovl_lookup_layer(upperdir, &d, &upperdentry, true); if (err) goto out; @@ -1111,7 +1120,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, else if (d.is_dir || !ofs->numdatalayer) d.last = lower.layer->idx == ovl_numlower(roe); - d.mnt = lower.layer->mnt; + d.layer = lower.layer; err = ovl_lookup_layer(lower.dentry, &d, &this, false); if (err) goto out_put; @@ -1278,6 +1287,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (upperopaque) ovl_dentry_set_opaque(dentry); + if (d.xwhiteouts) + ovl_dentry_set_xwhiteouts(dentry); if (upperdentry) ovl_dentry_set_upper_alias(dentry); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 5ba11eb43767..ee949f3e7c77 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -50,7 +50,6 @@ enum ovl_xattr { OVL_XATTR_METACOPY, OVL_XATTR_PROTATTR, OVL_XATTR_XWHITEOUT, - OVL_XATTR_XWHITEOUTS, }; enum ovl_inode_flag { @@ -70,6 +69,8 @@ enum ovl_entry_flag { OVL_E_UPPER_ALIAS, OVL_E_OPAQUE, OVL_E_CONNECTED, + /* Lower stack may contain xwhiteout entries */ + OVL_E_XWHITEOUTS, }; enum { @@ -477,6 +478,10 @@ bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); +bool ovl_dentry_has_xwhiteouts(struct dentry *dentry); +void ovl_dentry_set_xwhiteouts(struct dentry *dentry); +void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs, + const struct ovl_layer *layer); bool ovl_dentry_has_upper_alias(struct dentry *dentry); void ovl_dentry_set_upper_alias(struct dentry *dentry); bool ovl_dentry_needs_data_copy_up(struct dentry 
*dentry, int flags); @@ -494,11 +499,10 @@ struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); bool ovl_already_copied_up(struct dentry *dentry, int flags); -bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, - enum ovl_xattr ox); +char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path, + enum ovl_xattr ox); bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path); -bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, const struct path *upperpath); @@ -573,7 +577,13 @@ static inline bool ovl_is_impuredir(struct super_block *sb, .mnt = ovl_upper_mnt(ofs), }; - return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE); + return ovl_get_dir_xattr_val(ofs, &upperpath, OVL_XATTR_IMPURE) == 'y'; +} + +static inline char ovl_get_opaquedir_val(struct ovl_fs *ofs, + const struct path *path) +{ + return ovl_get_dir_xattr_val(ofs, path, OVL_XATTR_OPAQUE); } static inline bool ovl_redirect_follow(struct ovl_fs *ofs) @@ -680,7 +690,8 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool verify); -int ovl_path_next(int idx, struct dentry *dentry, struct path *path); +int ovl_path_next(int idx, struct dentry *dentry, struct path *path, + const struct ovl_layer **layer); int ovl_verify_lowerdata(struct dentry *dentry); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 5fa9c58af65f..cb449ab310a7 100644 --- 
a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -40,6 +40,8 @@ struct ovl_layer { int idx; /* One fsid per unique underlying sb (upper fsid == 0) */ int fsid; + /* xwhiteouts were found on this layer */ + bool has_xwhiteouts; }; struct ovl_path { @@ -59,7 +61,7 @@ struct ovl_fs { unsigned int numfs; /* Number of data-only lower layers */ unsigned int numdatalayer; - const struct ovl_layer *layers; + struct ovl_layer *layers; struct ovl_sb *fs; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 112b4b12f825..36dcc530ac28 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -280,12 +280,20 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path, { struct ovl_fs_context *ctx = fc->fs_private; - if (ovl_dentry_weird(path->dentry)) - return invalfc(fc, "filesystem on %s not supported", name); - if (!d_is_dir(path->dentry)) return invalfc(fc, "%s is not a directory", name); + /* + * Root dentries of case-insensitive capable filesystems might + * not have the dentry operations set, but still be incompatible + * with overlayfs. Check explicitly to prevent post-mount + * failures. 
+ */ + if (sb_has_encoding(path->mnt->mnt_sb)) + return invalfc(fc, "case-insensitive capable filesystem on %s not supported", name); + + if (ovl_dentry_weird(path->dentry)) + return invalfc(fc, "filesystem on %s not supported", name); /* * Check whether upper path is read-only here to report failures diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index e71156baa7bc..0ca8af060b0c 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -305,8 +305,6 @@ static inline int ovl_dir_read(const struct path *realpath, if (IS_ERR(realfile)) return PTR_ERR(realfile); - rdd->in_xwhiteouts_dir = rdd->dentry && - ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath); rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { @@ -359,10 +357,13 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, .is_lowest = false, }; int idx, next; + const struct ovl_layer *layer; for (idx = 0; idx != -1; idx = next) { - next = ovl_path_next(idx, dentry, &realpath); + next = ovl_path_next(idx, dentry, &realpath, &layer); rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; + rdd.in_xwhiteouts_dir = layer->has_xwhiteouts && + ovl_dentry_has_xwhiteouts(dentry); if (next != -1) { err = ovl_dir_read(&realpath, &rdd); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 4ab66e3d4cff..36d4b8b1f784 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -28,41 +28,38 @@ MODULE_LICENSE("GPL"); struct ovl_dir_cache; -static struct dentry *ovl_d_real(struct dentry *dentry, - const struct inode *inode) +static struct dentry *ovl_d_real(struct dentry *dentry, enum d_real_type type) { - struct dentry *real = NULL, *lower; + struct dentry *upper, *lower; int err; - /* - * vfs is only expected to call d_real() with NULL from d_real_inode() - * and with overlay inode from file_dentry() on an overlay file. 
- * - * TODO: remove @inode argument from d_real() API, remove code in this - * function that deals with non-NULL @inode and remove d_real() call - * from file_dentry(). - */ - if (inode && d_inode(dentry) == inode) - return dentry; - else if (inode) + switch (type) { + case D_REAL_DATA: + case D_REAL_METADATA: + break; + default: goto bug; + } if (!d_is_reg(dentry)) { /* d_real_inode() is only relevant for regular files */ return dentry; } - real = ovl_dentry_upper(dentry); - if (real && (inode == d_inode(real))) - return real; + upper = ovl_dentry_upper(dentry); + if (upper && (type == D_REAL_METADATA || + ovl_has_upperdata(d_inode(dentry)))) + return upper; - if (real && !inode && ovl_has_upperdata(d_inode(dentry))) - return real; + if (type == D_REAL_METADATA) { + lower = ovl_dentry_lower(dentry); + goto real_lower; + } /* - * Best effort lazy lookup of lowerdata for !inode case to return + * Best effort lazy lookup of lowerdata for D_REAL_DATA case to return * the real lowerdata dentry. The only current caller of d_real() with - * NULL inode is d_real_inode() from trace_uprobe and this caller is + * D_REAL_DATA is d_real_inode() from trace_uprobe and this caller is * likely going to be followed reading from the file, before placing * uprobes on offset within the file, so lowerdata should be available * when setting the uprobe. @@ -73,18 +70,13 @@ static struct dentry *ovl_d_real(struct dentry *dentry, lower = ovl_dentry_lowerdata(dentry); if (!lower) goto bug; - real = lower; - /* Handle recursion */ - real = d_real(real, inode); +real_lower: + /* Handle recursion into stacked lower fs */ + return d_real(lower, type); - if (!inode || inode == d_inode(real)) - return real; bug: - WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n", - __func__, dentry, inode ? inode->i_sb->s_id : "NULL", - inode ? inode->i_ino : 0, real, - real && d_inode(real) ? 
d_inode(real)->i_ino : 0); + WARN(1, "%s(%pd4, %d): real dentry not found\n", __func__, dentry, type); return dentry; } @@ -1249,6 +1241,7 @@ static struct dentry *ovl_get_root(struct super_block *sb, struct ovl_entry *oe) { struct dentry *root; + struct ovl_fs *ofs = OVL_FS(sb); struct ovl_path *lowerpath = ovl_lowerstack(oe); unsigned long ino = d_inode(lowerpath->dentry)->i_ino; int fsid = lowerpath->layer->fsid; @@ -1270,6 +1263,20 @@ static struct dentry *ovl_get_root(struct super_block *sb, ovl_set_flag(OVL_IMPURE, d_inode(root)); } + /* Look for xwhiteouts marker except in the lowermost layer */ + for (int i = 0; i < ovl_numlower(oe) - 1; i++, lowerpath++) { + struct path path = { + .mnt = lowerpath->layer->mnt, + .dentry = lowerpath->dentry, + }; + + /* overlay.opaque=x means xwhiteouts directory */ + if (ovl_get_opaquedir_val(ofs, &path) == 'x') { + ovl_layer_set_xwhiteouts(ofs, lowerpath->layer); + ovl_dentry_set_xwhiteouts(root); + } + } + /* Root is always merge -> can have whiteouts */ ovl_set_flag(OVL_WHITEOUTS, d_inode(root)); ovl_dentry_set_flag(OVL_E_CONNECTED, root); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 0217094c23ea..d285d1d7baad 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -461,6 +461,33 @@ void ovl_dentry_set_opaque(struct dentry *dentry) ovl_dentry_set_flag(OVL_E_OPAQUE, dentry); } +bool ovl_dentry_has_xwhiteouts(struct dentry *dentry) +{ + return ovl_dentry_test_flag(OVL_E_XWHITEOUTS, dentry); +} + +void ovl_dentry_set_xwhiteouts(struct dentry *dentry) +{ + ovl_dentry_set_flag(OVL_E_XWHITEOUTS, dentry); +} + +/* + * ovl_layer_set_xwhiteouts() is called before adding the overlay dir + * dentry to dcache, while readdir of that same directory happens after + * the overlay dir dentry is in dcache, so if some cpu observes that + * ovl_dentry_is_xwhiteouts(), it will also observe layer->has_xwhiteouts + * for the layers where xwhiteouts marker was found in that merge dir. 
+ */ +void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs, + const struct ovl_layer *layer) +{ + if (layer->has_xwhiteouts) + return; + + /* Write once to read-mostly layer properties */ + ofs->layers[layer->idx].has_xwhiteouts = true; +} + /* * For hard links and decoded file handles, it's possible for ovl_dentry_upper() * to return positive, while there's no actual upper alias for the inode. @@ -739,19 +766,6 @@ bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path) return res >= 0; } -bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path) -{ - struct dentry *dentry = path->dentry; - int res; - - /* xattr.whiteouts must be a directory */ - if (!d_is_dir(dentry)) - return false; - - res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0); - return res >= 0; -} - /* * Load persistent uuid from xattr into s_uuid if found, or store a new * random generated value in s_uuid and in xattr. @@ -760,13 +774,14 @@ bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, const struct path *upperpath) { bool set = false; + uuid_t uuid; int res; /* Try to load existing persistent uuid */ - res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, sb->s_uuid.b, + res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, uuid.b, UUID_SIZE); if (res == UUID_SIZE) - return true; + goto set_uuid; if (res != -ENODATA) goto fail; @@ -794,37 +809,37 @@ bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, } /* Generate overlay instance uuid */ - uuid_gen(&sb->s_uuid); + uuid_gen(&uuid); /* Try to store persistent uuid */ set = true; - res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, sb->s_uuid.b, + res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, uuid.b, UUID_SIZE); - if (res == 0) - return true; + if (res) + goto fail; + +set_uuid: + super_set_uuid(sb, uuid.b, sizeof(uuid)); + return true; fail: - memset(sb->s_uuid.b, 0, UUID_SIZE); ofs->config.uuid = OVL_UUID_NULL; pr_warn("failed 
to %s uuid (%pd2, err=%i); falling back to uuid=null.\n", set ? "set" : "get", upperpath->dentry, res); return false; } -bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, - enum ovl_xattr ox) +char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path, + enum ovl_xattr ox) { int res; char val; if (!d_is_dir(path->dentry)) - return false; + return 0; res = ovl_path_getxattr(ofs, path, ox, &val, 1); - if (res == 1 && val == 'y') - return true; - - return false; + return res == 1 ? val : 0; } #define OVL_XATTR_OPAQUE_POSTFIX "opaque" @@ -837,7 +852,6 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, #define OVL_XATTR_METACOPY_POSTFIX "metacopy" #define OVL_XATTR_PROTATTR_POSTFIX "protattr" #define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout" -#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts" #define OVL_XATTR_TAB_ENTRY(x) \ [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \ @@ -854,7 +868,6 @@ const char *const ovl_xattr_table[][2] = { OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY), OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR), OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT), - OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS), }; int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, diff --git a/fs/pidfs.c b/fs/pidfs.c new file mode 100644 index 000000000000..8fd71a00be9c --- /dev/null +++ b/fs/pidfs.c @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/magic.h> +#include <linux/mount.h> +#include <linux/pid.h> +#include <linux/pidfs.h> +#include <linux/pid_namespace.h> +#include <linux/poll.h> +#include <linux/proc_fs.h> +#include <linux/proc_ns.h> +#include <linux/pseudo_fs.h> +#include <linux/seq_file.h> +#include <uapi/linux/pidfd.h> + +#include "internal.h" + +static int pidfd_release(struct inode *inode, struct file *file) +{ +#ifndef CONFIG_FS_PID + struct pid *pid = file->private_data; + + 
file->private_data = NULL; + put_pid(pid); +#endif + return 0; +} + +#ifdef CONFIG_PROC_FS +/** + * pidfd_show_fdinfo - print information about a pidfd + * @m: proc fdinfo file + * @f: file referencing a pidfd + * + * Pid: + * This function will print the pid that a given pidfd refers to in the + * pid namespace of the procfs instance. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its pid. This is + * similar to calling getppid() on a process whose parent is outside of + * its pid namespace. + * + * NSpid: + * If pid namespaces are supported then this function will also print + * the pid of a given pidfd refers to for all descendant pid namespaces + * starting from the current pid namespace of the instance, i.e. the + * Pid field and the first entry in the NSpid field will be identical. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its first NSpid + * entry and no others will be shown. + * Note that this differs from the Pid and NSpid fields in + * /proc/<pid>/status where Pid and NSpid are always shown relative to + * the pid namespace of the procfs instance. The difference becomes + * obvious when sending around a pidfd between pid namespaces from a + * different branch of the tree, i.e. 
where no ancestral relation is + * present between the pid namespaces: + * - create two new pid namespaces ns1 and ns2 in the initial pid + * namespace (also take care to create new mount namespaces in the + * new pid namespace and mount procfs) + * - create a process with a pidfd in ns1 + * - send pidfd from ns1 to ns2 + * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid + * have exactly one entry, which is 0 + */ +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct pid *pid = pidfd_pid(f); + struct pid_namespace *ns; + pid_t nr = -1; + + if (likely(pid_has_task(pid, PIDTYPE_PID))) { + ns = proc_pid_ns(file_inode(m->file)->i_sb); + nr = pid_nr_ns(pid, ns); + } + + seq_put_decimal_ll(m, "Pid:\t", nr); + +#ifdef CONFIG_PID_NS + seq_put_decimal_ll(m, "\nNSpid:\t", nr); + if (nr > 0) { + int i; + + /* If nr is non-zero it means that 'pid' is valid and that + * ns, i.e. the pid namespace associated with the procfs + * instance, is in the pid namespace hierarchy of pid. + * Start at one below the already printed level. + */ + for (i = ns->level + 1; i <= pid->level; i++) + seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); + } +#endif + seq_putc(m, '\n'); +} +#endif + +/* + * Poll support for process exit notification. + */ +static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) +{ + struct pid *pid = pidfd_pid(file); + bool thread = file->f_flags & PIDFD_THREAD; + struct task_struct *task; + __poll_t poll_flags = 0; + + poll_wait(file, &pid->wait_pidfd, pts); + /* + * Depending on PIDFD_THREAD, inform pollers when the thread + * or the whole thread-group exits. 
+ */ + guard(rcu)(); + task = pid_task(pid, PIDTYPE_PID); + if (!task) + poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; + else if (task->exit_state && (thread || thread_group_empty(task))) + poll_flags = EPOLLIN | EPOLLRDNORM; + + return poll_flags; +} + +static const struct file_operations pidfs_file_operations = { + .release = pidfd_release, + .poll = pidfd_poll, +#ifdef CONFIG_PROC_FS + .show_fdinfo = pidfd_show_fdinfo, +#endif +}; + +struct pid *pidfd_pid(const struct file *file) +{ + if (file->f_op != &pidfs_file_operations) + return ERR_PTR(-EBADF); +#ifdef CONFIG_FS_PID + return file_inode(file)->i_private; +#else + return file->private_data; +#endif +} + +#ifdef CONFIG_FS_PID +static struct vfsmount *pidfs_mnt __ro_after_init; + +/* + * The vfs falls back to simple_setattr() if i_op->setattr() isn't + * implemented. Let's reject it completely until we have a clean + * permission concept for pidfds. + */ +static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + return -EOPNOTSUPP; +} + +static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + return 0; +} + +static const struct inode_operations pidfs_inode_operations = { + .getattr = pidfs_getattr, + .setattr = pidfs_setattr, +}; + +static void pidfs_evict_inode(struct inode *inode) +{ + struct pid *pid = inode->i_private; + + clear_inode(inode); + put_pid(pid); +} + +static const struct super_operations pidfs_sops = { + .drop_inode = generic_delete_inode, + .evict_inode = pidfs_evict_inode, + .statfs = simple_statfs, +}; + +static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(buffer, buflen, "pidfd:[%lu]", + d_inode(dentry)->i_ino); +} + +static const struct dentry_operations pidfs_dentry_operations = { + .d_delete = 
always_delete_dentry, + .d_dname = pidfs_dname, + .d_prune = stashed_dentry_prune, +}; + +static void pidfs_init_inode(struct inode *inode, void *data) +{ + inode->i_private = data; + inode->i_flags |= S_PRIVATE; + inode->i_mode |= S_IRWXU; + inode->i_op = &pidfs_inode_operations; + inode->i_fop = &pidfs_file_operations; +} + +static void pidfs_put_data(void *data) +{ + struct pid *pid = data; + put_pid(pid); +} + +static const struct stashed_operations pidfs_stashed_ops = { + .init_inode = pidfs_init_inode, + .put_data = pidfs_put_data, +}; + +static int pidfs_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx; + + ctx = init_pseudo(fc, PID_FS_MAGIC); + if (!ctx) + return -ENOMEM; + + ctx->ops = &pidfs_sops; + ctx->dops = &pidfs_dentry_operations; + fc->s_fs_info = (void *)&pidfs_stashed_ops; + return 0; +} + +static struct file_system_type pidfs_type = { + .name = "pidfs", + .init_fs_context = pidfs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + + struct file *pidfd_file; + struct path path; + int ret; + + /* + * Inode numbering for pidfs start at RESERVED_PIDS + 1. + * This avoids collisions with the root inode which is 1 + * for pseudo filesystems. 
+ */ + ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt, + get_pid(pid), &path); + if (ret < 0) + return ERR_PTR(ret); + + pidfd_file = dentry_open(&path, flags, current_cred()); + path_put(&path); + return pidfd_file; +} + +void __init pidfs_init(void) +{ + pidfs_mnt = kern_mount(&pidfs_type); + if (IS_ERR(pidfs_mnt)) + panic("Failed to mount pidfs pseudo filesystem"); +} + +bool is_pidfs_sb(const struct super_block *sb) +{ + return sb == pidfs_mnt->mnt_sb; +} + +#else /* !CONFIG_FS_PID */ + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + struct file *pidfd_file; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid, + flags | O_RDWR); + if (IS_ERR(pidfd_file)) + return pidfd_file; + + get_pid(pid); + return pidfd_file; +} + +void __init pidfs_init(void) { } +bool is_pidfs_sb(const struct super_block *sb) +{ + return false; +} +#endif diff --git a/fs/pipe.c b/fs/pipe.c index f1adbfe743d4..50c8a8596b52 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -76,18 +76,20 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 */ -static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) +#define cmp_int(l, r) ((l > r) - (l < r)) + +#ifdef CONFIG_PROVE_LOCKING +static int pipe_lock_cmp_fn(const struct lockdep_map *a, + const struct lockdep_map *b) { - if (pipe->files) - mutex_lock_nested(&pipe->mutex, subclass); + return cmp_int((unsigned long) a, (unsigned long) b); } +#endif void pipe_lock(struct pipe_inode_info *pipe) { - /* - * pipe_lock() nests non-pipe inode locks (for writing to a file) - */ - pipe_lock_nested(pipe, I_MUTEX_PARENT); + if (pipe->files) + mutex_lock(&pipe->mutex); } EXPORT_SYMBOL(pipe_lock); @@ -98,28 +100,16 @@ void pipe_unlock(struct pipe_inode_info *pipe) } EXPORT_SYMBOL(pipe_unlock); -static inline void __pipe_lock(struct pipe_inode_info *pipe) -{ - mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); -} - 
-static inline void __pipe_unlock(struct pipe_inode_info *pipe) -{ - mutex_unlock(&pipe->mutex); -} - void pipe_double_lock(struct pipe_inode_info *pipe1, struct pipe_inode_info *pipe2) { BUG_ON(pipe1 == pipe2); - if (pipe1 < pipe2) { - pipe_lock_nested(pipe1, I_MUTEX_PARENT); - pipe_lock_nested(pipe2, I_MUTEX_CHILD); - } else { - pipe_lock_nested(pipe2, I_MUTEX_PARENT); - pipe_lock_nested(pipe1, I_MUTEX_CHILD); - } + if (pipe1 > pipe2) + swap(pipe1, pipe2); + + pipe_lock(pipe1); + pipe_lock(pipe2); } static void anon_pipe_buf_release(struct pipe_inode_info *pipe, @@ -271,7 +261,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) return 0; ret = 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); /* * We only wake up writers if the pipe was full when we started @@ -368,7 +358,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) ret = -EAGAIN; break; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); /* * We only get here if we didn't actually read anything. @@ -400,13 +390,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); wake_next_reader = true; } if (pipe_empty(pipe->head, pipe->tail)) wake_next_reader = false; - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); if (was_full) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); @@ -462,7 +452,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) if (unlikely(total_len == 0)) return 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (!pipe->readers) { send_sig(SIGPIPE, current, 0); @@ -582,19 +572,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * after waiting we need to re-check whether the pipe * become empty while we dropped the lock. 
*/ - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); if (was_empty) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); was_empty = pipe_empty(pipe->head, pipe->tail); wake_next_writer = true; } out: if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) wake_next_writer = false; - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); /* * If we do do a wakeup event, we do a 'sync' wakeup, because we @@ -629,7 +619,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FIONREAD: - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); count = 0; head = pipe->head; tail = pipe->tail; @@ -639,16 +629,16 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) count += pipe->bufs[tail & mask].len; tail++; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return put_user(count, (int __user *)arg); #ifdef CONFIG_WATCH_QUEUE case IOC_WATCH_QUEUE_SET_SIZE: { int ret; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); ret = watch_queue_set_size(pipe, arg); - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return ret; } @@ -734,7 +724,7 @@ pipe_release(struct inode *inode, struct file *file) { struct pipe_inode_info *pipe = file->private_data; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (file->f_mode & FMODE_READ) pipe->readers--; if (file->f_mode & FMODE_WRITE) @@ -747,7 +737,7 @@ pipe_release(struct inode *inode, struct file *file) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); put_pipe_info(inode, pipe); return 0; @@ -759,7 +749,7 @@ pipe_fasync(int fd, struct file *filp, int on) struct pipe_inode_info *pipe = filp->private_data; int retval = 0; - __pipe_lock(pipe); + 
mutex_lock(&pipe->mutex); if (filp->f_mode & FMODE_READ) retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { @@ -768,7 +758,7 @@ pipe_fasync(int fd, struct file *filp, int on) /* this can happen only if on == T */ fasync_helper(-1, filp, 0, &pipe->fasync_readers); } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return retval; } @@ -834,6 +824,7 @@ struct pipe_inode_info *alloc_pipe_info(void) pipe->nr_accounted = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); + lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL); return pipe; } @@ -1144,7 +1135,7 @@ static int fifo_open(struct inode *inode, struct file *filp) filp->private_data = pipe; /* OK, we have a pipe and it's pinned down */ - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); /* We can only do regular read/write on fifos */ stream_open(inode, filp); @@ -1214,7 +1205,7 @@ static int fifo_open(struct inode *inode, struct file *filp) } /* Ok! */ - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return 0; err_rd: @@ -1230,7 +1221,7 @@ err_wr: goto err; err: - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); put_pipe_info(inode, pipe); return ret; @@ -1411,7 +1402,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) if (!pipe) return -EBADF; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); switch (cmd) { case F_SETPIPE_SZ: @@ -1425,7 +1416,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) break; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return ret; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index e1af20893ebe..6bf587d1a9b8 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -786,12 +786,12 @@ struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns, return ERR_PTR(count); if (count == 0) return NULL; - + acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); acl_e = acl->a_entries; - + for (end = entry + count; entry != end; acl_e++, 
entry++) { acl_e->e_tag = le16_to_cpu(entry->e_tag); acl_e->e_perm = le16_to_cpu(entry->e_perm); diff --git a/fs/proc/array.c b/fs/proc/array.c index ff08a8957552..34a47fb0c57f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -477,13 +477,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, int permitted; struct mm_struct *mm; unsigned long long start_time; - unsigned long cmin_flt = 0, cmaj_flt = 0; - unsigned long min_flt = 0, maj_flt = 0; - u64 cutime, cstime, utime, stime; - u64 cgtime, gtime; + unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt; + u64 cutime, cstime, cgtime, utime, stime, gtime; unsigned long rsslim = 0; unsigned long flags; int exit_code = task->exit_code; + struct signal_struct *sig = task->signal; + unsigned int seq = 1; state = *get_task_state(task); vsize = eip = esp = 0; @@ -511,12 +511,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = 0; - cgtime = gtime = 0; if (lock_task_sighand(task, &flags)) { - struct signal_struct *sig = task->signal; - if (sig->tty) { struct pid *pgrp = tty_get_pgrp(sig->tty); tty_pgrp = pid_nr_ns(pgrp, ns); @@ -527,28 +523,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, num_threads = get_nr_threads(task); collect_sigign_sigcatch(task, &sigign, &sigcatch); - cmin_flt = sig->cmin_flt; - cmaj_flt = sig->cmaj_flt; - cutime = sig->cutime; - cstime = sig->cstime; - cgtime = sig->cgtime; rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); - /* add up live thread stats at the group level */ if (whole) { - struct task_struct *t; - - __for_each_thread(sig, t) { - min_flt += t->min_flt; - maj_flt += t->maj_flt; - gtime += task_gtime(t); - } - - min_flt += sig->min_flt; - maj_flt += sig->maj_flt; - thread_group_cputime_adjusted(task, &utime, &stime); - gtime += sig->gtime; - if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED)) exit_code = 
sig->group_exit_code; } @@ -562,10 +539,41 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (permitted && (!whole || num_threads < 2)) wchan = !task_is_running(task); - if (!whole) { + + do { + seq++; /* 2 on the 1st/lockless path, otherwise odd */ + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + cgtime = sig->cgtime; + + if (whole) { + struct task_struct *t; + + min_flt = sig->min_flt; + maj_flt = sig->maj_flt; + gtime = sig->gtime; + + rcu_read_lock(); + __for_each_thread(sig, t) { + min_flt += t->min_flt; + maj_flt += t->maj_flt; + gtime += task_gtime(t); + } + rcu_read_unlock(); + } + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); + + if (whole) { + thread_group_cputime_adjusted(task, &utime, &stime); + } else { + task_cputime_adjusted(task, &utime, &stime); min_flt = task->min_flt; maj_flt = task->maj_flt; - task_cputime_adjusted(task, &utime, &stime); gtime = task_gtime(task); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 98a031ac2648..18550c071d71 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1878,8 +1878,6 @@ void proc_pid_evict_inode(struct proc_inode *ei) hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(&pid->lock); } - - put_pid(pid); } struct inode *proc_pid_make_inode(struct super_block *sb, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index b33e490e3fd9..dcd513dccf55 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -30,7 +30,6 @@ static void proc_evict_inode(struct inode *inode) { - struct proc_dir_entry *de; struct ctl_table_header *head; struct proc_inode *ei = PROC_I(inode); @@ -38,17 +37,8 @@ static void proc_evict_inode(struct inode *inode) clear_inode(inode); /* Stop tracking associated processes */ - if (ei->pid) { + if (ei->pid) proc_pid_evict_inode(ei); - ei->pid = NULL; - } - - /* Let go of any associated proc 
directory entry */ - de = ei->pde; - if (de) { - pde_put(de); - ei->pde = NULL; - } head = ei->sysctl; if (head) { @@ -80,6 +70,13 @@ static struct inode *proc_alloc_inode(struct super_block *sb) static void proc_free_inode(struct inode *inode) { + struct proc_inode *ei = PROC_I(inode); + + if (ei->pid) + put_pid(ei->pid); + /* Let go of any associated proc directory entry */ + if (ei->pde) + pde_put(ei->pde); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } @@ -95,7 +92,7 @@ void __init proc_init_kmemcache(void) proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT| + SLAB_ACCOUNT| SLAB_PANIC), init_once); pde_opener_cache = diff --git a/fs/proc/root.c b/fs/proc/root.c index b55dbc70287b..06a297a27ba3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -271,7 +271,7 @@ static void proc_kill_sb(struct super_block *sb) kill_anon_super(sb); put_pid_ns(fs_info->pid_ns); - kfree(fs_info); + kfree_rcu(fs_info, rcu); } static struct file_system_type proc_fs_type = { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 62b16f42d5d2..3f78ebbb795f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2432,7 +2432,6 @@ static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) { - struct mmu_notifier_range range; struct pagemap_scan_private p = {0}; unsigned long walk_start; size_t n_ranges_out = 0; @@ -2448,15 +2447,9 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) if (ret) return ret; - /* Protection change for the range is going to happen. 
*/ - if (p.arg.flags & PM_SCAN_WP_MATCHING) { - mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, - mm, p.arg.start, p.arg.end); - mmu_notifier_invalidate_range_start(&range); - } - for (walk_start = p.arg.start; walk_start < p.arg.end; walk_start = p.arg.walk_end) { + struct mmu_notifier_range range; long n_out; if (fatal_signal_pending(current)) { @@ -2467,8 +2460,20 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) ret = mmap_read_lock_killable(mm); if (ret) break; + + /* Protection change for the range is going to happen. */ + if (p.arg.flags & PM_SCAN_WP_MATCHING) { + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, + mm, walk_start, p.arg.end); + mmu_notifier_invalidate_range_start(&range); + } + ret = walk_page_range(mm, walk_start, p.arg.end, &pagemap_scan_ops, &p); + + if (p.arg.flags & PM_SCAN_WP_MATCHING) + mmu_notifier_invalidate_range_end(&range); + mmap_read_unlock(mm); n_out = pagemap_scan_flush_buffer(&p); @@ -2494,9 +2499,6 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) if (pagemap_scan_writeback_args(&p.arg, uarg)) ret = -EFAULT; - if (p.arg.flags & PM_SCAN_WP_MATCHING) - mmu_notifier_invalidate_range_end(&range); - kfree(p.vec_buf); return ret; } diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 6eb9bb369b57..7b5711f76709 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -21,6 +21,7 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/statfs.h> +#include <linux/fs_context.h> #include "qnx4.h" #define QNX4_VERSION 4 @@ -30,28 +31,33 @@ static const struct super_operations qnx4_sops; static struct inode *qnx4_alloc_inode(struct super_block *sb); static void qnx4_free_inode(struct inode *inode); -static int qnx4_remount(struct super_block *sb, int *flags, char *data); static int qnx4_statfs(struct dentry *, struct kstatfs *); +static int qnx4_get_tree(struct fs_context *fc); static const struct super_operations qnx4_sops = { .alloc_inode = 
qnx4_alloc_inode, .free_inode = qnx4_free_inode, .statfs = qnx4_statfs, - .remount_fs = qnx4_remount, }; -static int qnx4_remount(struct super_block *sb, int *flags, char *data) +static int qnx4_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; struct qnx4_sb_info *qs; sync_filesystem(sb); qs = qnx4_sb(sb); qs->Version = QNX4_VERSION; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; return 0; } +static const struct fs_context_operations qnx4_context_opts = { + .get_tree = qnx4_get_tree, + .reconfigure = qnx4_reconfigure, +}; + static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create ) { unsigned long phys; @@ -183,12 +189,13 @@ static const char *qnx4_checkroot(struct super_block *sb, return "bitmap file not found."; } -static int qnx4_fill_super(struct super_block *s, void *data, int silent) +static int qnx4_fill_super(struct super_block *s, struct fs_context *fc) { struct buffer_head *bh; struct inode *root; const char *errmsg; struct qnx4_sb_info *qs; + int silent = fc->sb_flags & SB_SILENT; qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); if (!qs) @@ -216,7 +223,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data); brelse(bh); if (errmsg != NULL) { - if (!silent) + if (!silent) printk(KERN_ERR "qnx4: %s\n", errmsg); return -EINVAL; } @@ -235,6 +242,18 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) return 0; } +static int qnx4_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, qnx4_fill_super); +} + +static int qnx4_init_fs_context(struct fs_context *fc) +{ + fc->ops = &qnx4_context_opts; + + return 0; +} + static void qnx4_kill_sb(struct super_block *sb) { struct qnx4_sb_info *qs = qnx4_sb(sb); @@ -376,18 +395,12 @@ static void destroy_inodecache(void) kmem_cache_destroy(qnx4_inode_cachep); } -static struct dentry *qnx4_mount(struct file_system_type 
*fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super); -} - static struct file_system_type qnx4_fs_type = { - .owner = THIS_MODULE, - .name = "qnx4", - .mount = qnx4_mount, - .kill_sb = qnx4_kill_sb, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "qnx4", + .kill_sb = qnx4_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = qnx4_init_fs_context, }; MODULE_ALIAS_FS("qnx4"); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index a286c545717f..405913f4faff 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -615,7 +615,7 @@ static int init_inodecache(void) qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", sizeof(struct qnx6_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (!qnx6_inode_cachep) return -ENOMEM; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 171c912af50f..6474529c4253 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2386,7 +2386,7 @@ static int journal_read(struct super_block *sb) cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); reiserfs_info(sb, "checking transaction log (%pg)\n", - journal->j_bdev_handle->bdev); + file_bdev(journal->j_bdev_file)); start = ktime_get_seconds(); /* @@ -2447,7 +2447,7 @@ static int journal_read(struct super_block *sb) * device and journal device to be the same */ d_bh = - reiserfs_breada(journal->j_bdev_handle->bdev, cur_dblock, + reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock, sb->s_blocksize, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb)); @@ -2588,9 +2588,9 @@ static void journal_list_init(struct super_block *sb) static void release_journal_dev(struct reiserfs_journal *journal) { - if (journal->j_bdev_handle) { - bdev_release(journal->j_bdev_handle); - journal->j_bdev_handle = NULL; + if (journal->j_bdev_file) { + fput(journal->j_bdev_file); + journal->j_bdev_file = NULL; } } @@ -2605,7 +2605,7 @@ 
static int journal_init_dev(struct super_block *super, result = 0; - journal->j_bdev_handle = NULL; + journal->j_bdev_file = NULL; jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; @@ -2616,37 +2616,37 @@ static int journal_init_dev(struct super_block *super, if ((!jdev_name || !jdev_name[0])) { if (jdev == super->s_dev) holder = NULL; - journal->j_bdev_handle = bdev_open_by_dev(jdev, blkdev_mode, + journal->j_bdev_file = bdev_file_open_by_dev(jdev, blkdev_mode, holder, NULL); - if (IS_ERR(journal->j_bdev_handle)) { - result = PTR_ERR(journal->j_bdev_handle); - journal->j_bdev_handle = NULL; + if (IS_ERR(journal->j_bdev_file)) { + result = PTR_ERR(journal->j_bdev_file); + journal->j_bdev_file = NULL; reiserfs_warning(super, "sh-458", "cannot init journal device unknown-block(%u,%u): %i", MAJOR(jdev), MINOR(jdev), result); return result; } else if (jdev != super->s_dev) - set_blocksize(journal->j_bdev_handle->bdev, + set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize); return 0; } - journal->j_bdev_handle = bdev_open_by_path(jdev_name, blkdev_mode, + journal->j_bdev_file = bdev_file_open_by_path(jdev_name, blkdev_mode, holder, NULL); - if (IS_ERR(journal->j_bdev_handle)) { - result = PTR_ERR(journal->j_bdev_handle); - journal->j_bdev_handle = NULL; + if (IS_ERR(journal->j_bdev_file)) { + result = PTR_ERR(journal->j_bdev_file); + journal->j_bdev_file = NULL; reiserfs_warning(super, "sh-457", "journal_init_dev: Cannot open '%s': %i", jdev_name, result); return result; } - set_blocksize(journal->j_bdev_handle->bdev, super->s_blocksize); + set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize); reiserfs_info(super, "journal_init_dev: journal device: %pg\n", - journal->j_bdev_handle->bdev); + file_bdev(journal->j_bdev_file)); return 0; } @@ -2804,7 +2804,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, "journal header magic %x (device %pg) does " "not match to magic 
found in super block %x", jh->jh_journal.jp_journal_magic, - journal->j_bdev_handle->bdev, + file_bdev(journal->j_bdev_file), sb_jp_journal_magic(rs)); brelse(bhjh); goto free_and_return; @@ -2828,7 +2828,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, reiserfs_info(sb, "journal params: device %pg, size %u, " "journal first block %u, max trans len %u, max batch %u, " "max commit age %u, max trans age %u\n", - journal->j_bdev_handle->bdev, + file_bdev(journal->j_bdev_file), SB_ONDISK_JOURNAL_SIZE(sb), SB_ONDISK_JOURNAL_1st_BLOCK(sb), journal->j_trans_max, diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 83cb9402e0f9..5c68a4a52d78 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -354,7 +354,7 @@ static int show_journal(struct seq_file *m, void *unused) "prepare: \t%12lu\n" "prepare_retry: \t%12lu\n", DJP(jp_journal_1st_block), - SB_JOURNAL(sb)->j_bdev_handle->bdev, + file_bdev(SB_JOURNAL(sb)->j_bdev_file), DJP(jp_journal_dev), DJP(jp_journal_size), DJP(jp_journal_trans_max), diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 725667880e62..0554903f42a9 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -299,7 +299,7 @@ struct reiserfs_journal { /* oldest journal block. 
start here for traverse */ struct reiserfs_journal_cnode *j_first; - struct bdev_handle *j_bdev_handle; + struct file *j_bdev_file; /* first block on s_dev of reserved area journal */ int j_1st_reserved_block; @@ -2810,10 +2810,10 @@ struct reiserfs_journal_header { /* We need these to make journal.c code more readable */ #define journal_find_get_block(s, block) __find_get_block(\ - SB_JOURNAL(s)->j_bdev_handle->bdev, block, s->s_blocksize) -#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_bdev_handle->bdev,\ + file_bdev(SB_JOURNAL(s)->j_bdev_file), block, s->s_blocksize) +#define journal_getblk(s, block) __getblk(file_bdev(SB_JOURNAL(s)->j_bdev_file),\ block, s->s_blocksize) -#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_bdev_handle->bdev,\ +#define journal_bread(s, block) __bread(file_bdev(SB_JOURNAL(s)->j_bdev_file),\ block, s->s_blocksize) enum reiserfs_bh_state_bits { diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 67b5510beded..2cc469d481a2 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -670,7 +670,6 @@ static int __init init_inodecache(void) sizeof(struct reiserfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD| SLAB_ACCOUNT), init_once); if (reiserfs_inode_cachep == NULL) diff --git a/fs/remap_range.c b/fs/remap_range.c index f8c1120b8311..de07f978ce3e 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -373,9 +373,9 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(generic_remap_file_range_prep); -loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) +loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { loff_t ret; @@ -391,23 +391,6 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, if (!file_in->f_op->remap_file_range) return -EOPNOTSUPP; - ret = 
file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, len, remap_flags); - if (ret < 0) - return ret; - - fsnotify_access(file_in); - fsnotify_modify(file_out); - return ret; -} -EXPORT_SYMBOL(do_clone_file_range); - -loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) -{ - loff_t ret; - ret = remap_verify_area(file_in, pos_in, len, false); if (ret) return ret; @@ -417,10 +400,14 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, return ret; file_start_write(file_out); - ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, - remap_flags); + ret = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, len, remap_flags); file_end_write(file_out); + if (ret < 0) + return ret; + fsnotify_access(file_in); + fsnotify_modify(file_out); return ret; } EXPORT_SYMBOL(vfs_clone_file_range); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 545ad44f96b8..2be227532f39 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -594,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb) #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { sync_blockdev(sb->s_bdev); - bdev_release(sb->s_bdev_handle); + fput(sb->s_bdev_file); } #endif } @@ -630,8 +630,8 @@ static int __init init_romfs_fs(void) romfs_inode_cachep = kmem_cache_create("romfs_i", sizeof(struct romfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | - SLAB_ACCOUNT, romfs_i_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, + romfs_i_init_once); if (!romfs_inode_cachep) { pr_err("Failed to initialise inode cache\n"); diff --git a/fs/select.c b/fs/select.c index 0ee55af1a55c..9515c3fa1a03 100644 --- a/fs/select.c +++ b/fs/select.c @@ -476,7 +476,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in, wait->_key |= POLLOUT_SET; } -static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) +static noinline_for_stack int 
do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; @@ -839,7 +839,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) struct poll_list { struct poll_list *next; - int len; + unsigned int len; struct pollfd entries[]; }; @@ -975,14 +975,15 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; - int err = -EFAULT, fdcount, len; + int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; - unsigned long todo = nfds; + unsigned int todo = nfds; + unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; @@ -998,9 +999,9 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, sizeof(struct pollfd) * walk->len)) goto out_fds; - todo -= walk->len; - if (!todo) + if (walk->len >= todo) break; + todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk, entries, len), @@ -1020,7 +1021,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; - int j; + unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index d64a306a414b..3de5047a7ff9 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -145,21 +145,27 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct cached_fid *cfid; struct cached_fids *cfids; const char *npath; + int retries = 0, cur_sleep = 1; if (tcon == NULL || tcon->cfids == NULL || 
tcon->nohandlecache || is_smb1_server(tcon->ses->server) || (dir_cache_timeout == 0)) return -EOPNOTSUPP; ses = tcon->ses; - server = ses->server; cfids = tcon->cfids; - if (!server->ops->new_lease_key) - return -EIO; - if (cifs_sb->root == NULL) return -ENOENT; +replay_again: + /* reinitialize for possible replay */ + flags = 0; + oplock = SMB2_OPLOCK_LEVEL_II; + server = cifs_pick_channel(ses); + + if (!server->ops->new_lease_key) + return -EIO; + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) return -ENOMEM; @@ -236,6 +242,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, .desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES, .disposition = FILE_OPEN, .fid = pfid, + .replay = !!(retries), }; rc = SMB2_open_init(tcon, server, @@ -268,6 +275,11 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, */ cfid->has_lease = true; + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + } + rc = compound_send_recv(xid, ses, server, flags, 2, rqst, resp_buftype, rsp_iov); @@ -367,6 +379,11 @@ out: atomic_inc(&tcon->num_remote_opens); } kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 60027f5aebe8..3e4209f41c18 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -659,6 +659,7 @@ static ssize_t cifs_stats_proc_write(struct file *file, spin_lock(&tcon->stat_lock); tcon->bytes_read = 0; tcon->bytes_written = 0; + tcon->stats_from_time = ktime_get_real_seconds(); spin_unlock(&tcon->stat_lock); if (server->ops->clear_stats) server->ops->clear_stats(tcon); @@ -737,8 +738,9 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) seq_printf(m, "\n%d) %s", i, tcon->tree_name); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); - seq_printf(m, "\nSMBs: %d", - 
atomic_read(&tcon->num_smbs_sent)); + seq_printf(m, "\nSMBs: %d since %ptTs UTC", + atomic_read(&tcon->num_smbs_sent), + &tcon->stats_from_time); if (server->ops->print_stats) server->ops->print_stats(m, tcon); } diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index ef4c2e3c9fa6..6322f0f68a17 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -572,7 +572,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp); UniStrupr(user); } else { - memset(user, '\0', 2); + *(u16 *)user = 0; } rc = crypto_shash_update(ses->server->secmech.hmacmd5, diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 99b0ade833aa..fb368b191eef 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -396,7 +396,7 @@ cifs_alloc_inode(struct super_block *sb) spin_lock_init(&cifs_inode->writers_lock); cifs_inode->writers = 0; cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ - cifs_inode->server_eof = 0; + cifs_inode->netfs.remote_i_size = 0; cifs_inode->uniqueid = 0; cifs_inode->createtime = 0; cifs_inode->epoch = 0; @@ -430,7 +430,7 @@ static void cifs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_FSCACHE_WB) + if (inode->i_state & I_PINNING_NETFS_WB) cifs_fscache_unuse_inode_cookie(inode, true); cifs_fscache_release_inode_cookie(inode); clear_inode(inode); @@ -681,6 +681,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rasize=%u", cifs_sb->ctx->rasize); if (tcon->ses->server->min_offload) seq_printf(s, ",esize=%u", tcon->ses->server->min_offload); + if (tcon->ses->server->retrans) + seq_printf(s, ",retrans=%u", tcon->ses->server->retrans); seq_printf(s, ",echo_interval=%lu", tcon->ses->server->echo_interval / HZ); @@ -793,8 +795,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) static int 
cifs_write_inode(struct inode *inode, struct writeback_control *wbc) { - fscache_unpin_writeback(wbc, cifs_inode_cookie(inode)); - return 0; + return netfs_unpin_writeback(inode, wbc); } static int cifs_drop_inode(struct inode *inode) @@ -1084,7 +1085,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) } static int -cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) +cifs_setlease(struct file *file, int arg, struct file_lease **lease, void **priv) { /* * Note that this is called by vfs setlease with i_lock held to @@ -1093,9 +1094,6 @@ cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; - if (!(S_ISREG(inode->i_mode))) - return -EINVAL; - /* Check if file is oplocked if this is request for new lease */ if (arg == F_UNLCK || ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || @@ -1171,6 +1169,9 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode, { char *target_path; + if (!dentry) + return ERR_PTR(-ECHILD); + target_path = kmalloc(PATH_MAX, GFP_KERNEL); if (!target_path) return ERR_PTR(-ENOMEM); @@ -1222,7 +1223,7 @@ static int cifs_precopy_set_eof(struct inode *src_inode, struct cifsInodeInfo *s if (rc < 0) goto set_failed; - netfs_resize_file(&src_cifsi->netfs, src_end); + netfs_resize_file(&src_cifsi->netfs, src_end, true); fscache_resize_cookie(cifs_inode_cookie(src_inode), src_end); return 0; @@ -1353,7 +1354,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, smb_file_src, smb_file_target, off, len, destoff); if (rc == 0 && new_size > i_size_read(target_inode)) { truncate_setsize(target_inode, new_size); - netfs_resize_file(&target_cifsi->netfs, new_size); + netfs_resize_file(&target_cifsi->netfs, new_size, true); fscache_resize_cookie(cifs_inode_cookie(target_inode), new_size); } @@ -1379,6 +1380,7 @@ ssize_t 
cifs_file_copychunk_range(unsigned int xid, struct inode *src_inode = file_inode(src_file); struct inode *target_inode = file_inode(dst_file); struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode); + struct cifsInodeInfo *target_cifsi = CIFS_I(target_inode); struct cifsFileInfo *smb_file_src; struct cifsFileInfo *smb_file_target; struct cifs_tcon *src_tcon; @@ -1427,7 +1429,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, * Advance the EOF marker after the flush above to the end of the range * if it's short of that. */ - if (src_cifsi->server_eof < off + len) { + if (src_cifsi->netfs.remote_i_size < off + len) { rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len); if (rc < 0) goto unlock; @@ -1451,12 +1453,22 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, /* Discard all the folios that overlap the destination region. */ truncate_inode_pages_range(&target_inode->i_data, fstart, fend); + fscache_invalidate(cifs_inode_cookie(target_inode), NULL, + i_size_read(target_inode), 0); + rc = file_modified(dst_file); if (!rc) { rc = target_tcon->ses->server->ops->copychunk_range(xid, smb_file_src, smb_file_target, off, len, destoff); - if (rc > 0 && destoff + rc > i_size_read(target_inode)) + if (rc > 0 && destoff + rc > i_size_read(target_inode)) { truncate_setsize(target_inode, destoff + rc); + netfs_resize_file(&target_cifsi->netfs, + i_size_read(target_inode), true); + fscache_resize_cookie(cifs_inode_cookie(target_inode), + i_size_read(target_inode)); + } + if (rc > 0 && destoff + rc > target_cifsi->netfs.zero_point) + target_cifsi->netfs.zero_point = destoff + rc; } file_accessed(src_file); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 879d5ef8a66e..53c75cfb33ab 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -50,6 +50,11 @@ #define CIFS_DEF_ACTIMEO (1 * HZ) /* + * max sleep time before retry to server + */ +#define CIFS_MAX_SLEEP 2000 + +/* * max attribute cache timeout (jiffies) - 
2^30 */ #define CIFS_MAX_ACTIMEO (1 << 30) @@ -82,7 +87,7 @@ #define SMB_INTERFACE_POLL_INTERVAL 600 /* maximum number of PDUs in one compound */ -#define MAX_COMPOUND 5 +#define MAX_COMPOUND 7 /* * Default number of credits to keep available for SMB3. @@ -204,6 +209,8 @@ struct cifs_open_info_data { }; } reparse; char *symlink_target; + struct cifs_sid posix_owner; + struct cifs_sid posix_group; union { struct smb2_file_all_info fi; struct smb311_posix_qinfo posix_fi; @@ -751,6 +758,7 @@ struct TCP_Server_Info { unsigned int max_read; unsigned int max_write; unsigned int min_offload; + unsigned int retrans; __le16 compress_algorithm; __u16 signing_algorithm; __le16 cipher_type; @@ -1024,6 +1032,8 @@ struct cifs_chan { __u8 signkey[SMB3_SIGN_KEY_SIZE]; }; +#define CIFS_SES_FLAG_SCALE_CHANNELS (0x1) + /* * Session structure. One of these for each uid session with a particular host */ @@ -1056,6 +1066,7 @@ struct cifs_ses { enum securityEnum sectype; /* what security flavor was specified? */ bool sign; /* is signing required? 
*/ bool domainAuto:1; + unsigned int flags; __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE]; @@ -1207,6 +1218,7 @@ struct cifs_tcon { __u64 bytes_read; __u64 bytes_written; spinlock_t stat_lock; /* protects the two fields above */ + time64_t stats_from_time; FILE_SYSTEM_DEVICE_INFO fsDevInfo; FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ FILE_SYSTEM_UNIX_INFO fsUnixInfo; @@ -1366,6 +1378,7 @@ struct cifs_open_parms { struct cifs_fid *fid; umode_t mode; bool reconnect:1; + bool replay:1; /* indicates that this open is for a replay */ }; struct cifs_fid { @@ -1497,6 +1510,7 @@ struct cifs_writedata { struct smbd_mr *mr; #endif struct cifs_credits credits; + bool replay; }; /* @@ -1557,7 +1571,6 @@ struct cifsInodeInfo { spinlock_t writers_lock; unsigned int writers; /* Number of writers on this inode */ unsigned long time; /* jiffies of last update of inode */ - u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ u64 createtime; /* creation time on server */ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */ @@ -1827,6 +1840,13 @@ static inline bool is_retryable_error(int error) return false; } +static inline bool is_replayable_error(int error) +{ + if (error == -EAGAIN || error == -ECONNABORTED) + return true; + return false; +} + /* cifs_get_writable_file() flags */ #define FIND_WR_ANY 0 diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 01e89070df5a..5eb83bafc7fd 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -2066,20 +2066,20 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, parm_data = (struct cifs_posix_lock *) ((char *)&pSMBr->hdr.Protocol + data_offset); if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) - pLockData->fl_type = F_UNLCK; + pLockData->c.flc_type = F_UNLCK; else { if (parm_data->lock_type == cpu_to_le16(CIFS_RDLCK)) - 
pLockData->fl_type = F_RDLCK; + pLockData->c.flc_type = F_RDLCK; else if (parm_data->lock_type == cpu_to_le16(CIFS_WRLCK)) - pLockData->fl_type = F_WRLCK; + pLockData->c.flc_type = F_WRLCK; pLockData->fl_start = le64_to_cpu(parm_data->start); pLockData->fl_end = pLockData->fl_start + (le64_to_cpu(parm_data->length) ? le64_to_cpu(parm_data->length) - 1 : 0); - pLockData->fl_pid = -le32_to_cpu(parm_data->pid); + pLockData->c.flc_pid = -le32_to_cpu(parm_data->pid); } } diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 3052a208c6ca..ac9595504f4b 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -233,6 +233,12 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) { /* check if iface is still active */ spin_lock(&ses->chan_lock); + if (cifs_ses_get_chan_index(ses, server) == + CIFS_INVAL_CHAN_INDEX) { + spin_unlock(&ses->chan_lock); + continue; + } + if (!cifs_chan_is_iface_active(ses, server)) { spin_unlock(&ses->chan_lock); cifs_chan_update_iface(ses, server); @@ -1574,6 +1580,9 @@ static int match_server(struct TCP_Server_Info *server, if (server->min_offload != ctx->min_offload) return 0; + if (server->retrans != ctx->retrans) + return 0; + return 1; } @@ -1798,6 +1807,7 @@ smbd_connected: goto out_err_crypto_release; } tcp_ses->min_offload = ctx->min_offload; + tcp_ses->retrans = ctx->retrans; /* * at this point we are the only ones with the pointer * to the struct since the kernel thread not created yet @@ -3434,8 +3444,18 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx) * the user on mount */ if ((cifs_sb->ctx->wsize == 0) || - (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx))) - cifs_sb->ctx->wsize = server->ops->negotiate_wsize(tcon, ctx); + (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx))) { + cifs_sb->ctx->wsize = + round_down(server->ops->negotiate_wsize(tcon, ctx), PAGE_SIZE); + /* + * in 
the very unlikely event that the server sent a max write size under PAGE_SIZE, + * (which would get rounded down to 0) then reset wsize to absolute minimum eg 4096 + */ + if (cifs_sb->ctx->wsize == 0) { + cifs_sb->ctx->wsize = PAGE_SIZE; + cifs_dbg(VFS, "wsize too small, reset to minimum ie PAGE_SIZE, usually 4096\n"); + } + } if ((cifs_sb->ctx->rsize == 0) || (cifs_sb->ctx->rsize > server->ops->negotiate_rsize(tcon, ctx))) cifs_sb->ctx->rsize = server->ops->negotiate_rsize(tcon, ctx); @@ -4224,6 +4244,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&tcon->tc_lock); + + /* if tcon is marked for needing reconnect, update state */ + if (tcon->need_reconnect) + tcon->status = TID_NEED_TCON; + if (tcon->status == TID_GOOD) { spin_unlock(&tcon->tc_lock); return 0; diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index a8a1d386da65..449c59830039 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -565,6 +565,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&tcon->tc_lock); + + /* if tcon is marked for needing reconnect, update state */ + if (tcon->need_reconnect) + tcon->status = TID_NEED_TCON; + if (tcon->status == TID_GOOD) { spin_unlock(&tcon->tc_lock); return 0; @@ -625,8 +630,8 @@ out: spin_lock(&tcon->tc_lock); if (tcon->status == TID_IN_TCON) tcon->status = TID_GOOD; - spin_unlock(&tcon->tc_lock); tcon->need_reconnect = false; + spin_unlock(&tcon->tc_lock); } return rc; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 1b4262aff8fa..c3b8e7091a4d 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -87,7 +87,7 @@ void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len continue; if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } 
@@ -120,7 +120,7 @@ void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len continue; if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -151,7 +151,7 @@ void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int le xas_for_each(&xas, folio, end) { if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -175,6 +175,9 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) /* only send once per connect */ spin_lock(&tcon->tc_lock); + if (tcon->need_reconnect) + tcon->status = TID_NEED_RECON; + if (tcon->status != TID_NEED_RECON) { spin_unlock(&tcon->tc_lock); return; @@ -1312,20 +1315,20 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, down_read(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, offset, length, type, - flock->fl_flags, &conf_lock, + flock->c.flc_flags, &conf_lock, CIFS_LOCK_OP); if (exist) { flock->fl_start = conf_lock->offset; flock->fl_end = conf_lock->offset + conf_lock->length - 1; - flock->fl_pid = conf_lock->pid; + flock->c.flc_pid = conf_lock->pid; if (conf_lock->type & server->vals->shared_lock_type) - flock->fl_type = F_RDLCK; + flock->c.flc_type = F_RDLCK; else - flock->fl_type = F_WRLCK; + flock->c.flc_type = F_WRLCK; } else if (!cinode->can_cache_brlcks) rc = 1; else - flock->fl_type = F_UNLCK; + flock->c.flc_type = F_UNLCK; up_read(&cinode->lock_sem); return rc; @@ -1401,16 +1404,16 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) { int rc = 0; struct cifsInodeInfo *cinode = CIFS_I(file_inode(file)); - unsigned char saved_type = flock->fl_type; + unsigned char saved_type = flock->c.flc_type; - if ((flock->fl_flags & FL_POSIX) == 0) + if ((flock->c.flc_flags & FL_POSIX) == 0) return 1; down_read(&cinode->lock_sem); 
posix_test_lock(file, flock); - if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) { - flock->fl_type = saved_type; + if (lock_is_unlock(flock) && !cinode->can_cache_brlcks) { + flock->c.flc_type = saved_type; rc = 1; } @@ -1431,7 +1434,7 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock) struct cifsInodeInfo *cinode = CIFS_I(file_inode(file)); int rc = FILE_LOCK_DEFERRED + 1; - if ((flock->fl_flags & FL_POSIX) == 0) + if ((flock->c.flc_flags & FL_POSIX) == 0) return rc; cifs_down_write(&cinode->lock_sem); @@ -1581,7 +1584,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) el = locks_to_send.next; spin_lock(&flctx->flc_lock); - list_for_each_entry(flock, &flctx->flc_posix, fl_list) { + for_each_file_lock(flock, &flctx->flc_posix) { + unsigned char ftype = flock->c.flc_type; + if (el == &locks_to_send) { /* * The list ended. We don't have enough allocated @@ -1591,12 +1596,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) break; } length = cifs_flock_len(flock); - if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK) + if (ftype == F_RDLCK || ftype == F_SHLCK) type = CIFS_RDLCK; else type = CIFS_WRLCK; lck = list_entry(el, struct lock_to_push, llist); - lck->pid = hash_lockowner(flock->fl_owner); + lck->pid = hash_lockowner(flock->c.flc_owner); lck->netfid = cfile->fid.netfid; lck->length = length; lck->type = type; @@ -1663,42 +1668,43 @@ static void cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock, bool *wait_flag, struct TCP_Server_Info *server) { - if (flock->fl_flags & FL_POSIX) + if (flock->c.flc_flags & FL_POSIX) cifs_dbg(FYI, "Posix\n"); - if (flock->fl_flags & FL_FLOCK) + if (flock->c.flc_flags & FL_FLOCK) cifs_dbg(FYI, "Flock\n"); - if (flock->fl_flags & FL_SLEEP) { + if (flock->c.flc_flags & FL_SLEEP) { cifs_dbg(FYI, "Blocking lock\n"); *wait_flag = true; } - if (flock->fl_flags & FL_ACCESS) + if (flock->c.flc_flags & FL_ACCESS) cifs_dbg(FYI, "Process suspended by mandatory locking - not 
implemented yet\n"); - if (flock->fl_flags & FL_LEASE) + if (flock->c.flc_flags & FL_LEASE) cifs_dbg(FYI, "Lease on file - not implemented yet\n"); - if (flock->fl_flags & + if (flock->c.flc_flags & (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE | FL_CLOSE | FL_OFDLCK))) - cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags); + cifs_dbg(FYI, "Unknown lock flags 0x%x\n", + flock->c.flc_flags); *type = server->vals->large_lock_type; - if (flock->fl_type == F_WRLCK) { + if (lock_is_write(flock)) { cifs_dbg(FYI, "F_WRLCK\n"); *type |= server->vals->exclusive_lock_type; *lock = 1; - } else if (flock->fl_type == F_UNLCK) { + } else if (lock_is_unlock(flock)) { cifs_dbg(FYI, "F_UNLCK\n"); *type |= server->vals->unlock_lock_type; *unlock = 1; /* Check if unlock includes more than one lock range */ - } else if (flock->fl_type == F_RDLCK) { + } else if (lock_is_read(flock)) { cifs_dbg(FYI, "F_RDLCK\n"); *type |= server->vals->shared_lock_type; *lock = 1; - } else if (flock->fl_type == F_EXLCK) { + } else if (flock->c.flc_type == F_EXLCK) { cifs_dbg(FYI, "F_EXLCK\n"); *type |= server->vals->exclusive_lock_type; *lock = 1; - } else if (flock->fl_type == F_SHLCK) { + } else if (flock->c.flc_type == F_SHLCK) { cifs_dbg(FYI, "F_SHLCK\n"); *type |= server->vals->shared_lock_type; *lock = 1; @@ -1730,7 +1736,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, else posix_lock_type = CIFS_WRLCK; rc = CIFSSMBPosixLock(xid, tcon, netfid, - hash_lockowner(flock->fl_owner), + hash_lockowner(flock->c.flc_owner), flock->fl_start, length, flock, posix_lock_type, wait_flag); return rc; @@ -1747,7 +1753,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, if (rc == 0) { rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type, 0, 1, false); - flock->fl_type = F_UNLCK; + flock->c.flc_type = F_UNLCK; if (rc != 0) cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n", rc); @@ -1755,7 +1761,7 @@ 
cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, } if (type & server->vals->shared_lock_type) { - flock->fl_type = F_WRLCK; + flock->c.flc_type = F_WRLCK; return 0; } @@ -1767,12 +1773,12 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, if (rc == 0) { rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type | server->vals->shared_lock_type, 0, 1, false); - flock->fl_type = F_RDLCK; + flock->c.flc_type = F_RDLCK; if (rc != 0) cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n", rc); } else - flock->fl_type = F_WRLCK; + flock->c.flc_type = F_WRLCK; return 0; } @@ -1940,7 +1946,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, posix_lock_type = CIFS_UNLCK; rc = CIFSSMBPosixLock(xid, tcon, cfile->fid.netfid, - hash_lockowner(flock->fl_owner), + hash_lockowner(flock->c.flc_owner), flock->fl_start, length, NULL, posix_lock_type, wait_flag); goto out; @@ -1950,7 +1956,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, struct cifsLockInfo *lock; lock = cifs_lock_init(flock->fl_start, length, type, - flock->fl_flags); + flock->c.flc_flags); if (!lock) return -ENOMEM; @@ -1989,7 +1995,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, rc = server->ops->mand_unlock_range(cfile, flock, xid); out: - if ((flock->fl_flags & FL_POSIX) || (flock->fl_flags & FL_FLOCK)) { + if ((flock->c.flc_flags & FL_POSIX) || (flock->c.flc_flags & FL_FLOCK)) { /* * If this is a request to remove all locks because we * are closing the file, it doesn't matter if the @@ -1998,7 +2004,7 @@ out: */ if (rc) { cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc); - if (!(flock->fl_flags & FL_CLOSE)) + if (!(flock->c.flc_flags & FL_CLOSE)) return rc; } rc = locks_lock_file_wait(file, flock); @@ -2019,7 +2025,7 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl) xid = get_xid(); - if (!(fl->fl_flags & FL_FLOCK)) { + if (!(fl->c.flc_flags & FL_FLOCK)) { 
rc = -ENOLCK; free_xid(xid); return rc; @@ -2070,7 +2076,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) xid = get_xid(); cifs_dbg(FYI, "%s: %pD2 cmd=0x%x type=0x%x flags=0x%x r=%lld:%lld\n", __func__, file, cmd, - flock->fl_flags, flock->fl_type, (long long)flock->fl_start, + flock->c.flc_flags, flock->c.flc_type, + (long long)flock->fl_start, (long long)flock->fl_end); cfile = (struct cifsFileInfo *)file->private_data; @@ -2120,8 +2127,8 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, { loff_t end_of_write = offset + bytes_written; - if (end_of_write > cifsi->server_eof) - cifsi->server_eof = end_of_write; + if (end_of_write > cifsi->netfs.remote_i_size) + netfs_resize_file(&cifsi->netfs, end_of_write, true); } static ssize_t @@ -2651,7 +2658,7 @@ static void cifs_extend_writeback(struct address_space *mapping, continue; if (xa_is_value(folio)) break; - if (folio_index(folio) != index) + if (folio->index != index) break; if (!folio_try_get_rcu(folio)) { xas_reset(&xas); @@ -2899,7 +2906,7 @@ redo_folio: goto skip_write; } - if (folio_mapping(folio) != mapping || + if (folio->mapping != mapping || !folio_test_dirty(folio)) { start += folio_size(folio); folio_unlock(folio); @@ -2951,7 +2958,7 @@ skip_write: continue; } - folio_batch_release(&fbatch); + folio_batch_release(&fbatch); cond_resched(); } while (wbc->nr_to_write > 0); @@ -3247,8 +3254,8 @@ cifs_uncached_writev_complete(struct work_struct *work) spin_lock(&inode->i_lock); cifs_update_eof(cifsi, wdata->offset, wdata->bytes); - if (cifsi->server_eof > inode->i_size) - i_size_write(inode, cifsi->server_eof); + if (cifsi->netfs.remote_i_size > inode->i_size) + i_size_write(inode, cifsi->netfs.remote_i_size); spin_unlock(&inode->i_lock); complete(&wdata->done); @@ -3300,6 +3307,7 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, if (wdata->cfile->invalidHandle) rc = -EAGAIN; else { + wdata->replay = true; #ifdef CONFIG_CIFS_SMB_DIRECT if 
(wdata->mr) { wdata->mr->need_invalidate = true; @@ -5043,27 +5051,13 @@ static void cifs_swap_deactivate(struct file *file) /* do we need to unpin (or unlock) the file */ } -/* - * Mark a page as having been made dirty and thus needing writeback. We also - * need to pin the cache object to write back to. - */ -#ifdef CONFIG_CIFS_FSCACHE -static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - return fscache_dirty_folio(mapping, folio, - cifs_inode_cookie(mapping->host)); -} -#else -#define cifs_dirty_folio filemap_dirty_folio -#endif - const struct address_space_operations cifs_addr_ops = { .read_folio = cifs_read_folio, .readahead = cifs_readahead, .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .dirty_folio = cifs_dirty_folio, + .dirty_folio = netfs_dirty_folio, .release_folio = cifs_release_folio, .direct_IO = cifs_direct_io, .invalidate_folio = cifs_invalidate_folio, @@ -5087,7 +5081,7 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .dirty_folio = cifs_dirty_folio, + .dirty_folio = netfs_dirty_folio, .release_folio = cifs_release_folio, .invalidate_folio = cifs_invalidate_folio, .launder_folio = cifs_launder_folio, diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index a3493da12ad1..4b2f5aa2ea0e 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -139,6 +139,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("dir_mode", Opt_dirmode), fsparam_u32("port", Opt_port), fsparam_u32("min_enc_offload", Opt_min_enc_offload), + fsparam_u32("retrans", Opt_retrans), fsparam_u32("esize", Opt_min_enc_offload), fsparam_u32("bsize", Opt_blocksize), fsparam_u32("rasize", Opt_rasize), @@ -210,7 +211,7 @@ cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_c switch (match_token(value, 
cifs_secflavor_tokens, args)) { case Opt_sec_krb5p: - cifs_errorf(fc, "sec=krb5p is not supported!\n"); + cifs_errorf(fc, "sec=krb5p is not supported. Use sec=krb5,seal instead\n"); return 1; case Opt_sec_krb5i: ctx->sign = true; @@ -1064,6 +1065,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_min_enc_offload: ctx->min_offload = result.uint_32; break; + case Opt_retrans: + ctx->retrans = result.uint_32; + break; case Opt_blocksize: /* * inode blocksize realistically should never need to be @@ -1107,6 +1111,17 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_wsize: ctx->wsize = result.uint_32; ctx->got_wsize = true; + if (ctx->wsize % PAGE_SIZE != 0) { + ctx->wsize = round_down(ctx->wsize, PAGE_SIZE); + if (ctx->wsize == 0) { + ctx->wsize = PAGE_SIZE; + cifs_dbg(VFS, "wsize too small, reset to minimum %ld\n", PAGE_SIZE); + } else { + cifs_dbg(VFS, + "wsize rounded down to %d to multiple of PAGE_SIZE %ld\n", + ctx->wsize, PAGE_SIZE); + } + } break; case Opt_acregmax: ctx->acregmax = HZ * result.uint_32; @@ -1619,6 +1634,8 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->backupuid_specified = false; /* no backup intent for a user */ ctx->backupgid_specified = false; /* no backup intent for a group */ + ctx->retrans = 1; + /* * short int override_uid = -1; * short int override_gid = -1; diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index cf46916286d0..182ce11cbe93 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -118,6 +118,7 @@ enum cifs_param { Opt_file_mode, Opt_dirmode, Opt_min_enc_offload, + Opt_retrans, Opt_blocksize, Opt_rasize, Opt_rsize, @@ -245,6 +246,7 @@ struct smb3_fs_context { unsigned int rsize; unsigned int wsize; unsigned int min_offload; + unsigned int retrans; bool sockopt_tcp_nodelay:1; /* attribute cache timemout for files and directories in jiffies */ unsigned long acregmax; diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c 
index e5cad149f5a2..c4a3cb736881 100644 --- a/fs/smb/client/fscache.c +++ b/fs/smb/client/fscache.c @@ -180,7 +180,7 @@ static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_ if (ret < 0) return ret; - ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), + ret = cres.ops->prepare_write(&cres, &start, &len, len, i_size_read(inode), no_space_allocated_yet); if (ret == 0) ret = fscache_write(&cres, start, &iter, NULL, NULL); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 9f37c1758f73..d02f8ba29cb5 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -104,7 +104,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); mtime = inode_get_mtime(inode); if (timespec64_equal(&mtime, &fattr->cf_mtime) && - cifs_i->server_eof == fattr->cf_eof) { + cifs_i->netfs.remote_i_size == fattr->cf_eof) { cifs_dbg(FYI, "%s: inode %llu is unchanged\n", __func__, cifs_i->uniqueid); return; @@ -194,7 +194,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) else clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags); - cifs_i->server_eof = fattr->cf_eof; + cifs_i->netfs.remote_i_size = fattr->cf_eof; /* * Can't safely change the file size here if the client is writing to * it due to potential races. 
@@ -665,8 +665,6 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, /* Fill a cifs_fattr struct with info from POSIX info struct */ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, - struct cifs_sid *owner, - struct cifs_sid *group, struct super_block *sb) { struct smb311_posix_qinfo *info = &data->posix_fi; @@ -722,8 +720,8 @@ out_reparse: fattr->cf_symlink_target = data->symlink_target; data->symlink_target = NULL; } - sid_to_id(cifs_sb, owner, fattr, SIDOWNER); - sid_to_id(cifs_sb, group, fattr, SIDGROUP); + sid_to_id(cifs_sb, &data->posix_owner, fattr, SIDOWNER); + sid_to_id(cifs_sb, &data->posix_group, fattr, SIDGROUP); cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n", fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); @@ -1070,9 +1068,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, - struct cifs_fattr *fattr, - struct cifs_sid *owner, - struct cifs_sid *group) + struct cifs_fattr *fattr) { struct TCP_Server_Info *server = tcon->ses->server; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -1117,7 +1113,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, } if (tcon->posix_extensions) - smb311_posix_info_to_fattr(fattr, data, owner, group, sb); + smb311_posix_info_to_fattr(fattr, data, sb); else cifs_open_info_to_fattr(fattr, data, sb); out: @@ -1171,8 +1167,7 @@ static int cifs_get_fattr(struct cifs_open_info_data *data, */ if (cifs_open_data_reparse(data)) { rc = reparse_info_to_fattr(data, sb, xid, tcon, - full_path, fattr, - NULL, NULL); + full_path, fattr); } else { cifs_open_info_to_fattr(fattr, data, sb); } @@ -1317,10 +1312,10 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data, const unsigned int xid) { struct cifs_open_info_data tmp_data = {}; + struct TCP_Server_Info *server; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 
struct cifs_tcon *tcon; struct tcon_link *tlink; - struct cifs_sid owner, group; int tmprc; int rc = 0; @@ -1328,14 +1323,14 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data, if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; /* * 1. Fetch file metadata if not provided (data) */ if (!data) { - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, - full_path, &tmp_data, - &owner, &group); + rc = server->ops->query_path_info(xid, tcon, cifs_sb, + full_path, &tmp_data); data = &tmp_data; } @@ -1347,11 +1342,9 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data, case 0: if (cifs_open_data_reparse(data)) { rc = reparse_info_to_fattr(data, sb, xid, tcon, - full_path, fattr, - &owner, &group); + full_path, fattr); } else { - smb311_posix_info_to_fattr(fattr, data, - &owner, &group, sb); + smb311_posix_info_to_fattr(fattr, data, sb); } break; case -EREMOTE: @@ -2865,7 +2858,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, set_size_out: if (rc == 0) { - cifsInode->server_eof = attrs->ia_size; + netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true); cifs_setsize(inode, attrs->ia_size); /* * i_blocks is not related to (i_size / i_blksize), but instead @@ -3018,6 +3011,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if ((attrs->ia_valid & ATTR_SIZE) && attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true); fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } @@ -3217,6 +3211,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if ((attrs->ia_valid & ATTR_SIZE) && attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true); fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 
c2137ea3c253..0748d7b757b9 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -140,6 +140,7 @@ tcon_info_alloc(bool dir_leases_enabled) spin_lock_init(&ret_buf->stat_lock); atomic_set(&ret_buf->num_local_opens, 0); atomic_set(&ret_buf->num_remote_opens, 0); + ret_buf->stats_from_time = ktime_get_real_seconds(); #ifdef CONFIG_CIFS_DFS_UPCALL INIT_LIST_HEAD(&ret_buf->dfs_ses_list); #endif diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c index a6968573b775..4a517b280f2b 100644 --- a/fs/smb/client/namespace.c +++ b/fs/smb/client/namespace.c @@ -168,6 +168,21 @@ static char *automount_fullpath(struct dentry *dentry, void *page) return s; } +static void fs_context_set_ids(struct smb3_fs_context *ctx) +{ + kuid_t uid = current_fsuid(); + kgid_t gid = current_fsgid(); + + if (ctx->multiuser) { + if (!ctx->uid_specified) + ctx->linux_uid = uid; + if (!ctx->gid_specified) + ctx->linux_gid = gid; + } + if (!ctx->cruid_specified) + ctx->cred_uid = uid; +} + /* * Create a vfsmount that we can automount */ @@ -205,6 +220,7 @@ static struct vfsmount *cifs_do_automount(struct path *path) tmp.leaf_fullpath = NULL; tmp.UNC = tmp.prepath = NULL; tmp.dfs_root_ses = NULL; + fs_context_set_ids(&tmp); rc = smb3_fs_context_dup(ctx, &tmp); if (rc) { diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 056cae1ddcce..b520eea7bfce 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -133,15 +133,15 @@ retry: * Query dir responses don't provide enough * information about reparse points other than * their reparse tags. Save an invalidation by - * not clobbering the existing mode, size and - * symlink target (if any) when reparse tag and - * ctime haven't changed. + * not clobbering some existing attributes when + * reparse tag and ctime haven't changed. 
*/ rc = 0; if (fattr->cf_cifsattrs & ATTR_REPARSE) { if (likely(reparse_inode_match(inode, fattr))) { fattr->cf_mode = inode->i_mode; - fattr->cf_eof = CIFS_I(inode)->server_eof; + fattr->cf_rdev = inode->i_rdev; + fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size; fattr->cf_symlink_target = NULL; } else { CIFS_I(inode)->time = 0; @@ -307,14 +307,16 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, } static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr, - SEARCH_ID_FULL_DIR_INFO *info, + const void *info, struct cifs_sb_info *cifs_sb) { + const FILE_FULL_DIRECTORY_INFO *di = info; + __dir_info_to_fattr(fattr, info); - /* See MS-FSCC 2.4.19 FileIdFullDirectoryInformation */ + /* See MS-FSCC 2.4.14, 2.4.19 */ if (fattr->cf_cifsattrs & ATTR_REPARSE) - fattr->cf_cifstag = le32_to_cpu(info->EaSize); + fattr->cf_cifstag = le32_to_cpu(di->EaSize); cifs_fill_common_info(fattr, cifs_sb); } @@ -396,7 +398,7 @@ ffirst_retry: } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; } else /* not srvinos - BB fixme add check for backlevel? */ { - cifsFile->srch_inf.info_level = SMB_FIND_FILE_DIRECTORY_INFO; + cifsFile->srch_inf.info_level = SMB_FIND_FILE_FULL_DIRECTORY_INFO; } search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME; @@ -645,10 +647,10 @@ static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode) static int is_dir_changed(struct file *file) { struct inode *inode = file_inode(file); - struct cifsInodeInfo *cifsInfo = CIFS_I(inode); + struct cifsInodeInfo *cifs_inode_info = CIFS_I(inode); - if (cifsInfo->time == 0) - return 1; /* directory was changed, perhaps due to unlink */ + if (cifs_inode_info->time == 0) + return 1; /* directory was changed, e.g. 
unlink or new file */ else return 0; @@ -987,10 +989,9 @@ static int cifs_filldir(char *find_entry, struct file *file, (FIND_FILE_STANDARD_INFO *)find_entry, cifs_sb); break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: case SMB_FIND_FILE_ID_FULL_DIR_INFO: - cifs_fulldir_info_to_fattr(&fattr, - (SEARCH_ID_FULL_DIR_INFO *)find_entry, - cifs_sb); + cifs_fulldir_info_to_fattr(&fattr, find_entry, cifs_sb); break; default: cifs_dir_info_to_fattr(&fattr, diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index cde81042bebd..8f37373fd333 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -75,6 +75,10 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, { unsigned int i; + /* if the channel is waiting for termination */ + if (server && server->terminate) + return CIFS_INVAL_CHAN_INDEX; + for (i = 0; i < ses->chan_count; i++) { if (ses->chans[i].server == server) return i; @@ -84,7 +88,6 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, if (server) cifs_dbg(VFS, "unable to get chan index for server: 0x%llx", server->conn_id); - WARN_ON(1); return CIFS_INVAL_CHAN_INDEX; } @@ -269,6 +272,8 @@ int cifs_try_adding_channels(struct cifs_ses *ses) &iface->sockaddr, rc); kref_put(&iface->refcount, release_iface); + /* failure to add chan should increase weight */ + iface->weight_fulfilled++; continue; } diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index e0ee96d69d49..c23478ab1cf8 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -228,7 +228,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, * flock and OFD lock are associated with an open * file description, not the process. 
*/ - if (!(flock->fl_flags & (FL_FLOCK | FL_OFDLCK))) + if (!(flock->c.flc_flags & (FL_FLOCK | FL_OFDLCK))) continue; if (cinode->can_cache_brlcks) { /* diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 5053a5550abe..05818cd6d932 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -56,6 +56,35 @@ static inline __u32 file_create_options(struct dentry *dentry) return 0; } +/* Parse owner and group from SMB3.1.1 POSIX query info */ +static int parse_posix_sids(struct cifs_open_info_data *data, + struct kvec *rsp_iov) +{ + struct smb2_query_info_rsp *qi = rsp_iov->iov_base; + unsigned int out_len = le32_to_cpu(qi->OutputBufferLength); + unsigned int qi_len = sizeof(data->posix_fi); + int owner_len, group_len; + u8 *sidsbuf, *sidsbuf_end; + + if (out_len <= qi_len) + return -EINVAL; + + sidsbuf = (u8 *)qi + le16_to_cpu(qi->OutputBufferOffset) + qi_len; + sidsbuf_end = sidsbuf + out_len - qi_len; + + owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end); + if (owner_len == -1) + return -EINVAL; + + memcpy(&data->posix_owner, sidsbuf, owner_len); + group_len = posix_info_sid_size(sidsbuf + owner_len, sidsbuf_end); + if (group_len == -1) + return -EINVAL; + + memcpy(&data->posix_group, sidsbuf + owner_len, group_len); + return 0; +} + /* * note: If cfile is passed, the reference to it is dropped here. * So make sure that you do not reuse cfile after return from this func. 
@@ -69,7 +98,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, __u32 desired_access, __u32 create_disposition, __u32 create_options, umode_t mode, struct kvec *in_iov, int *cmds, int num_cmds, struct cifsFileInfo *cfile, - __u8 **extbuf, size_t *extbuflen, struct kvec *out_iov, int *out_buftype) { @@ -92,6 +120,14 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, unsigned int size[2]; void *data[2]; int len; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + oplock = SMB2_OPLOCK_LEVEL_NONE; + num_rqst = 0; + server = cifs_pick_channel(ses); vars = kzalloc(sizeof(*vars), GFP_ATOMIC); if (vars == NULL) @@ -99,8 +135,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst = &vars->rqst[0]; rsp_iov = &vars->rsp_iov[0]; - server = cifs_pick_channel(ses); - if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -435,15 +469,24 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, num_rqst++; if (cfile) { + if (retries) + for (i = 1; i < num_rqst - 2; i++) + smb2_set_replay(server, &rqst[i]); + rc = compound_send_recv(xid, ses, server, flags, num_rqst - 2, &rqst[1], &resp_buftype[1], &rsp_iov[1]); - } else + } else { + if (retries) + for (i = 0; i < num_rqst; i++) + smb2_set_replay(server, &rqst[i]); + rc = compound_send_recv(xid, ses, server, flags, num_rqst, rqst, resp_buftype, rsp_iov); + } finished: num_rqst = 0; @@ -494,21 +537,9 @@ finished: &rsp_iov[i + 1], sizeof(idata->posix_fi) /* add SIDs */, (char *)&idata->posix_fi); } - if (rc == 0) { - unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength); - - if (length > sizeof(idata->posix_fi)) { - char *base = (char *)rsp_iov[i + 1].iov_base + - le16_to_cpu(qi_rsp->OutputBufferOffset) + - sizeof(idata->posix_fi); - *extbuflen = length - sizeof(idata->posix_fi); - *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL); - if (!*extbuf) - rc 
= -ENOMEM; - } else { - rc = -EINVAL; - } - } + if (rc == 0) + rc = parse_posix_sids(idata, &rsp_iov[i + 1]); + SMB2_query_info_free(&rqst[num_rqst++]); if (rc) trace_smb3_posix_query_info_compound_err(xid, ses->Suid, @@ -604,9 +635,6 @@ finished: } SMB2_close_free(&rqst[num_rqst]); - if (cfile) - cifsFileInfo_put(cfile); - num_cmds += 2; if (out_iov && out_buftype) { memcpy(out_iov, rsp_iov, num_cmds * sizeof(*out_iov)); @@ -616,7 +644,16 @@ finished: for (i = 0; i < num_cmds; i++) free_rsp_buf(resp_buftype[i], rsp_iov[i].iov_base); } + num_cmds -= 2; /* correct num_cmds as there could be a retry */ kfree(vars); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + + if (cfile) + cifsFileInfo_put(cfile); + return rc; } @@ -662,7 +699,7 @@ int smb2_query_path_info(const unsigned int xid, struct smb2_hdr *hdr; struct kvec in_iov[2], out_iov[3] = {}; int out_buftype[3] = {}; - int cmds[2] = { SMB2_OP_QUERY_INFO, }; + int cmds[2]; bool islink; int i, num_cmds; int rc, rc2; @@ -670,20 +707,36 @@ int smb2_query_path_info(const unsigned int xid, data->adjust_tz = false; data->reparse_point = false; - if (strcmp(full_path, "")) - rc = -ENOENT; - else - rc = open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid); - /* If it is a root and its handle is cached then use it */ - if (!rc) { - if (cfid->file_all_info_is_valid) { - memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi)); + /* + * BB TODO: Add support for using cached root handle in SMB3.1.1 POSIX. + * Create SMB2_query_posix_info worker function to do non-compounded + * query when we already have an open file handle for this. For now this + * is fast enough (always using the compounded version). 
+ */ + if (!tcon->posix_extensions) { + if (*full_path) { + rc = -ENOENT; } else { - rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid, - cfid->fid.volatile_fid, &data->fi); + rc = open_cached_dir(xid, tcon, full_path, + cifs_sb, false, &cfid); + } + /* If it is a root and its handle is cached then use it */ + if (!rc) { + if (cfid->file_all_info_is_valid) { + memcpy(&data->fi, &cfid->file_all_info, + sizeof(data->fi)); + } else { + rc = SMB2_query_info(xid, tcon, + cfid->fid.persistent_fid, + cfid->fid.volatile_fid, + &data->fi); + } + close_cached_dir(cfid); + return rc; } - close_cached_dir(cfid); - return rc; + cmds[0] = SMB2_OP_QUERY_INFO; + } else { + cmds[0] = SMB2_OP_POSIX_QUERY_INFO; } in_iov[0].iov_base = data; @@ -693,9 +746,8 @@ int smb2_query_path_info(const unsigned int xid, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, - in_iov, cmds, 1, cfile, - NULL, NULL, out_iov, out_buftype); + create_options, ACL_NO_MODE, in_iov, + cmds, 1, cfile, out_iov, out_buftype); hdr = out_iov[0].iov_base; /* * If first iov is unset, then SMB session was dropped or we've got a @@ -707,6 +759,10 @@ int smb2_query_path_info(const unsigned int xid, switch (rc) { case 0: case -EOPNOTSUPP: + /* + * BB TODO: When support for special files added to Samba + * re-verify this path. 
+ */ rc = parse_create_response(data, cifs_sb, &out_iov[0]); if (rc || !data->reparse_point) goto out; @@ -722,8 +778,8 @@ int smb2_query_path_info(const unsigned int xid, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, in_iov, cmds, - num_cmds, cfile, NULL, NULL, NULL, NULL); + create_options, ACL_NO_MODE, in_iov, + cmds, num_cmds, cfile, NULL, NULL); break; case -EREMOTE: break; @@ -746,101 +802,6 @@ out: return rc; } -int smb311_posix_query_path_info(const unsigned int xid, - struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - const char *full_path, - struct cifs_open_info_data *data, - struct cifs_sid *owner, - struct cifs_sid *group) -{ - int rc; - __u32 create_options = 0; - struct cifsFileInfo *cfile; - struct kvec in_iov[2], out_iov[3] = {}; - int out_buftype[3] = {}; - __u8 *sidsbuf = NULL; - __u8 *sidsbuf_end = NULL; - size_t sidsbuflen = 0; - size_t owner_len, group_len; - int cmds[2] = { SMB2_OP_POSIX_QUERY_INFO, }; - int i, num_cmds; - - data->adjust_tz = false; - data->reparse_point = false; - - /* - * BB TODO: Add support for using the cached root handle. - * Create SMB2_query_posix_info worker function to do non-compounded query - * when we already have an open file handle for this. For now this is fast enough - * (always using the compounded version). - */ - in_iov[0].iov_base = data; - in_iov[0].iov_len = sizeof(*data); - in_iov[1] = in_iov[0]; - - cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, in_iov, cmds, 1, - cfile, &sidsbuf, &sidsbuflen, out_iov, out_buftype); - /* - * If first iov is unset, then SMB session was dropped or we've got a - * cached open file (@cfile). 
- */ - if (!out_iov[0].iov_base || out_buftype[0] == CIFS_NO_BUFFER) - goto out; - - switch (rc) { - case 0: - case -EOPNOTSUPP: - /* BB TODO: When support for special files added to Samba re-verify this path */ - rc = parse_create_response(data, cifs_sb, &out_iov[0]); - if (rc || !data->reparse_point) - goto out; - - if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) { - /* symlink already parsed in create response */ - num_cmds = 1; - } else { - cmds[1] = SMB2_OP_GET_REPARSE; - num_cmds = 2; - } - create_options |= OPEN_REPARSE_POINT; - cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, in_iov, cmds, - num_cmds, cfile, &sidsbuf, &sidsbuflen, NULL, NULL); - break; - } - -out: - if (rc == 0) { - sidsbuf_end = sidsbuf + sidsbuflen; - - owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end); - if (owner_len == -1) { - rc = -EINVAL; - goto out; - } - memcpy(owner, sidsbuf, owner_len); - - group_len = posix_info_sid_size( - sidsbuf + owner_len, sidsbuf_end); - if (group_len == -1) { - rc = -EINVAL; - goto out; - } - memcpy(group, sidsbuf + owner_len, group_len); - } - - kfree(sidsbuf); - for (i = 0; i < ARRAY_SIZE(out_buftype); i++) - free_rsp_buf(out_buftype[i], out_iov[i].iov_base); - return rc; -} - int smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, struct cifs_tcon *tcon, const char *name, @@ -848,9 +809,9 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, { return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, mode, NULL, - &(int){SMB2_OP_MKDIR}, 1, - NULL, NULL, NULL, NULL, NULL); + CREATE_NOT_FILE, mode, + NULL, &(int){SMB2_OP_MKDIR}, 1, + NULL, NULL, NULL); } void @@ -875,7 +836,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE, &in_iov, &(int){SMB2_OP_SET_INFO}, 
1, - cfile, NULL, NULL, NULL, NULL); + cfile, NULL, NULL); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -887,8 +848,9 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, drop_cached_dir_by_name(xid, tcon, name, cifs_sb); return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_NOT_FILE, - ACL_NO_MODE, NULL, &(int){SMB2_OP_RMDIR}, 1, - NULL, NULL, NULL, NULL, NULL); + ACL_NO_MODE, NULL, + &(int){SMB2_OP_RMDIR}, 1, + NULL, NULL, NULL); } int @@ -897,8 +859,9 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE, NULL, &(int){SMB2_OP_DELETE}, 1, - NULL, NULL, NULL, NULL, NULL); + ACL_NO_MODE, NULL, + &(int){SMB2_OP_DELETE}, 1, + NULL, NULL, NULL); } static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, @@ -919,8 +882,8 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, in_iov.iov_base = smb2_to_name; in_iov.iov_len = 2 * UniStrnlen((wchar_t *)smb2_to_name, PATH_MAX); rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, create_options, ACL_NO_MODE, &in_iov, - &command, 1, cfile, NULL, NULL, NULL, NULL); + FILE_OPEN, create_options, ACL_NO_MODE, + &in_iov, &command, 1, cfile, NULL, NULL); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -971,7 +934,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, &in_iov, &(int){SMB2_OP_SET_EOF}, 1, - cfile, NULL, NULL, NULL, NULL); + cfile, NULL, NULL); } int @@ -999,8 +962,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path, rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, ACL_NO_MODE, &in_iov, - &(int){SMB2_OP_SET_INFO}, 1, cfile, - NULL, NULL, NULL, NULL); + &(int){SMB2_OP_SET_INFO}, 1, + cfile, NULL, NULL); 
cifs_put_tlink(tlink); return rc; } @@ -1035,7 +998,7 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, da, cd, co, ACL_NO_MODE, in_iov, - cmds, 2, cfile, NULL, NULL, NULL, NULL); + cmds, 2, cfile, NULL, NULL); if (!rc) { rc = smb311_posix_get_inode_info(&new, full_path, data, sb, xid); @@ -1045,7 +1008,7 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, da, cd, co, ACL_NO_MODE, in_iov, - cmds, 2, cfile, NULL, NULL, NULL, NULL); + cmds, 2, cfile, NULL, NULL); if (!rc) { rc = cifs_get_inode_info(&new, full_path, data, sb, xid, NULL); @@ -1072,8 +1035,8 @@ int smb2_query_reparse_point(const unsigned int xid, rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, OPEN_REPARSE_POINT, ACL_NO_MODE, &in_iov, - &(int){SMB2_OP_GET_REPARSE}, 1, cfile, - NULL, NULL, NULL, NULL); + &(int){SMB2_OP_GET_REPARSE}, 1, + cfile, NULL, NULL); if (rc) goto out; diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c index 1a90dd78b238..ac1895358908 100644 --- a/fs/smb/client/smb2maperror.c +++ b/fs/smb/client/smb2maperror.c @@ -1210,6 +1210,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_INVALID_TASK_INDEX, -EIO, "STATUS_INVALID_TASK_INDEX"}, {STATUS_THREAD_ALREADY_IN_TASK, -EIO, "STATUS_THREAD_ALREADY_IN_TASK"}, {STATUS_CALLBACK_BYPASS, -EIO, "STATUS_CALLBACK_BYPASS"}, + {STATUS_SERVER_UNAVAILABLE, -EAGAIN, "STATUS_SERVER_UNAVAILABLE"}, + {STATUS_FILE_NOT_AVAILABLE, -EAGAIN, "STATUS_FILE_NOT_AVAILABLE"}, {STATUS_PORT_CLOSED, -EIO, "STATUS_PORT_CLOSED"}, {STATUS_MESSAGE_LOST, -EIO, "STATUS_MESSAGE_LOST"}, {STATUS_INVALID_MESSAGE, -EIO, "STATUS_INVALID_MESSAGE"}, diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 
01a5bd7e6a30..4695433fcf39 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -614,11 +614,12 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, "multichannel not available\n" "Empty network interface list returned by server %s\n", ses->server->hostname); - rc = -EINVAL; + rc = -EOPNOTSUPP; + ses->iface_last_update = jiffies; goto out; } - while (bytes_left >= sizeof(*p)) { + while (bytes_left >= (ssize_t)sizeof(*p)) { memset(&tmp_iface, 0, sizeof(tmp_iface)); tmp_iface.speed = le64_to_cpu(p->LinkSpeed); tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; @@ -712,7 +713,6 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, ses->iface_count++; spin_unlock(&ses->iface_lock); - ses->iface_last_update = jiffies; next_iface: nb_iface++; next = le32_to_cpu(p->Next); @@ -734,11 +734,7 @@ next_iface: if ((bytes_left > 8) || p->Next) cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); - - if (!ses->iface_count) { - rc = -EINVAL; - goto out; - } + ses->iface_last_update = jiffies; out: /* @@ -1112,7 +1108,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, { struct smb2_compound_vars *vars; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct smb_rqst *rqst; struct kvec *rsp_iov; __le16 *utf16_path = NULL; @@ -1128,6 +1124,13 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, struct smb2_file_full_ea_info *ea = NULL; struct smb2_query_info_rsp *rsp; int rc, used_len = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_CP_CREATE_CLOSE_OP; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(ses); if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -1201,6 +1204,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, .disposition = FILE_OPEN, .create_options = 
cifs_create_options(cifs_sb, 0), .fid = &fid, + .replay = !!(retries), }; rc = SMB2_open_init(tcon, server, @@ -1248,6 +1252,12 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, goto sea_exit; smb2_set_related(&rqst[2]); + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + smb2_set_replay(server, &rqst[2]); + } + rc = compound_send_recv(xid, ses, server, flags, 3, rqst, resp_buftype, rsp_iov); @@ -1264,6 +1274,11 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, kfree(vars); out_free_path: kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } #endif @@ -1488,7 +1503,7 @@ smb2_ioctl_query_info(const unsigned int xid, struct smb_rqst *rqst; struct kvec *rsp_iov; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; char __user *arg = (char __user *)p; struct smb_query_info qi; struct smb_query_info __user *pqi; @@ -1505,6 +1520,13 @@ smb2_ioctl_query_info(const unsigned int xid, void *data[2]; int create_options = is_dir ? 
CREATE_NOT_FILE : CREATE_NOT_DIR; void (*free_req1_func)(struct smb_rqst *r); + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_CP_CREATE_CLOSE_OP; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(ses); vars = kzalloc(sizeof(*vars), GFP_ATOMIC); if (vars == NULL) @@ -1548,6 +1570,7 @@ smb2_ioctl_query_info(const unsigned int xid, .disposition = FILE_OPEN, .create_options = cifs_create_options(cifs_sb, create_options), .fid = &fid, + .replay = !!(retries), }; if (qi.flags & PASSTHRU_FSCTL) { @@ -1645,6 +1668,12 @@ smb2_ioctl_query_info(const unsigned int xid, goto free_req_1; smb2_set_related(&rqst[2]); + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + smb2_set_replay(server, &rqst[2]); + } + rc = compound_send_recv(xid, ses, server, flags, 3, rqst, resp_buftype, rsp_iov); @@ -1705,6 +1734,11 @@ free_output_buffer: kfree(buffer); free_vars: kfree(vars); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -2231,8 +2265,14 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct smb2_query_directory_rsp *qd_rsp = NULL; struct smb2_create_rsp *op_rsp = NULL; - struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); - int retry_count = 0; + struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(tcon->ses); utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) @@ -2257,6 +2297,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, .disposition = FILE_OPEN, .create_options = cifs_create_options(cifs_sb, 0), .fid = fid, + .replay = !!(retries), }; rc = SMB2_open_init(tcon, server, @@ -2282,14 +2323,15 @@ smb2_query_dir_first(const unsigned int xid, 
struct cifs_tcon *tcon, smb2_set_related(&rqst[1]); -again: + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + } + rc = compound_send_recv(xid, tcon->ses, server, flags, 2, rqst, resp_buftype, rsp_iov); - if (rc == -EAGAIN && retry_count++ < 10) - goto again; - /* If the open failed there is nothing to do */ op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) { @@ -2337,6 +2379,11 @@ again: SMB2_query_directory_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -2462,6 +2509,22 @@ smb2_oplock_response(struct cifs_tcon *tcon, __u64 persistent_fid, } void +smb2_set_replay(struct TCP_Server_Info *server, struct smb_rqst *rqst) +{ + struct smb2_hdr *shdr; + + if (server->dialect < SMB30_PROT_ID) + return; + + shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base); + if (shdr == NULL) { + cifs_dbg(FYI, "shdr NULL in smb2_set_related\n"); + return; + } + shdr->Flags |= SMB2_FLAGS_REPLAY_OPERATION; +} + +void smb2_set_related(struct smb_rqst *rqst) { struct smb2_hdr *shdr; @@ -2534,6 +2597,27 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) } /* + * helper function for exponential backoff and check if replayable + */ +bool smb2_should_replay(struct cifs_tcon *tcon, + int *pretries, + int *pcur_sleep) +{ + if (!pretries || !pcur_sleep) + return false; + + if (tcon->retry || (*pretries)++ < tcon->ses->server->retrans) { + msleep(*pcur_sleep); + (*pcur_sleep) = ((*pcur_sleep) << 1); + if ((*pcur_sleep) > CIFS_MAX_SLEEP) + (*pcur_sleep) = CIFS_MAX_SLEEP; + return true; + } + + return false; +} + +/* * Passes the query info response back to the caller on success. * Caller need to free this with free_rsp_buf(). 
*/ @@ -2546,7 +2630,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, { struct smb2_compound_vars *vars; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = CIFS_CP_CREATE_CLOSE_OP; struct smb_rqst *rqst; int resp_buftype[3]; @@ -2557,6 +2641,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, int rc; __le16 *utf16_path; struct cached_fid *cfid = NULL; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_CP_CREATE_CLOSE_OP; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(ses); if (!path) path = ""; @@ -2593,6 +2684,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, .disposition = FILE_OPEN, .create_options = cifs_create_options(cifs_sb, 0), .fid = &fid, + .replay = !!(retries), }; rc = SMB2_open_init(tcon, server, @@ -2637,6 +2729,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, goto qic_exit; smb2_set_related(&rqst[2]); + if (retries) { + if (!cfid) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[2]); + } + smb2_set_replay(server, &rqst[1]); + } + if (cfid) { rc = compound_send_recv(xid, ses, server, flags, 1, &rqst[1], @@ -2669,6 +2769,11 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, kfree(vars); out_free_path: kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3217,6 +3322,9 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, cfile->fid.volatile_fid, cfile->pid, new_size); if (rc >= 0) { truncate_setsize(inode, new_size); + netfs_resize_file(&cifsi->netfs, new_size, true); + if (offset < cifsi->netfs.zero_point) + cifsi->netfs.zero_point = offset; fscache_resize_cookie(cifs_inode_cookie(inode), new_size); } } @@ -3440,7 +3548,7 @@ 
static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, cfile->pid, new_eof); if (rc == 0) { - cifsi->server_eof = new_eof; + netfs_resize_file(&cifsi->netfs, new_eof, true); cifs_setsize(inode, new_eof); cifs_truncate_page(inode->i_mapping, inode->i_size); truncate_setsize(inode, new_eof); @@ -3532,8 +3640,9 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, int rc; unsigned int xid; struct inode *inode = file_inode(file); - struct cifsFileInfo *cfile = file->private_data; struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifsFileInfo *cfile = file->private_data; + struct netfs_inode *ictx = &cifsi->netfs; loff_t old_eof, new_eof; xid = get_xid(); @@ -3553,6 +3662,7 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, goto out_2; truncate_pagecache_range(inode, off, old_eof); + ictx->zero_point = old_eof; rc = smb2_copychunk_range(xid, cfile, cfile, off + len, old_eof - off - len, off); @@ -3567,9 +3677,10 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, rc = 0; - cifsi->server_eof = i_size_read(inode) - len; - truncate_setsize(inode, cifsi->server_eof); - fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof); + truncate_setsize(inode, new_eof); + netfs_resize_file(&cifsi->netfs, new_eof, true); + ictx->zero_point = new_eof; + fscache_resize_cookie(cifs_inode_cookie(inode), new_eof); out_2: filemap_invalidate_unlock(inode->i_mapping); out: @@ -3585,6 +3696,7 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, unsigned int xid; struct cifsFileInfo *cfile = file->private_data; struct inode *inode = file_inode(file); + struct cifsInodeInfo *cifsi = CIFS_I(inode); __u64 count, old_eof, new_eof; xid = get_xid(); @@ -3612,6 +3724,7 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, goto out_2; truncate_setsize(inode, new_eof); + 
netfs_resize_file(&cifsi->netfs, i_size_read(inode), true); fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode)); rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); @@ -5104,7 +5217,7 @@ static int smb2_create_reparse_symlink(const unsigned int xid, struct inode *new; struct kvec iov; __le16 *path; - char *sym; + char *sym, sep = CIFS_DIR_SEP(cifs_sb); u16 len, plen; int rc = 0; @@ -5118,7 +5231,8 @@ static int smb2_create_reparse_symlink(const unsigned int xid, .symlink_target = sym, }; - path = cifs_convert_path_to_utf16(symname, cifs_sb); + convert_delimiter(sym, sep); + path = cifs_convert_path_to_utf16(sym, cifs_sb); if (!path) { rc = -ENOMEM; goto out; @@ -5141,7 +5255,10 @@ static int smb2_create_reparse_symlink(const unsigned int xid, buf->PrintNameLength = cpu_to_le16(plen); memcpy(buf->PathBuffer, path, plen); buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0); + if (*sym != sep) + buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE); + convert_delimiter(sym, '/'); iov.iov_base = buf; iov.iov_len = len; new = smb2_get_reparse_inode(&data, inode->i_sb, xid, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index bd25c34dc398..608ee05491e2 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -156,6 +156,56 @@ out: return; } +/* helper function for code reuse */ +static int +cifs_chan_skip_or_disable(struct cifs_ses *ses, + struct TCP_Server_Info *server, + bool from_reconnect) +{ + struct TCP_Server_Info *pserver; + unsigned int chan_index; + + if (SERVER_IS_CHAN(server)) { + cifs_dbg(VFS, + "server %s does not support multichannel anymore. 
Skip secondary channel\n", + ses->server->hostname); + + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) { + spin_unlock(&ses->chan_lock); + goto skip_terminate; + } + + ses->chans[chan_index].server = NULL; + server->terminate = true; + spin_unlock(&ses->chan_lock); + + /* + * the above reference of server by channel + * needs to be dropped without holding chan_lock + * as cifs_put_tcp_session takes a higher lock + * i.e. cifs_tcp_ses_lock + */ + cifs_put_tcp_session(server, from_reconnect); + + cifs_signal_cifsd_for_reconnect(server, false); + + /* mark primary server as needing reconnect */ + pserver = server->primary_server; + cifs_signal_cifsd_for_reconnect(pserver, false); +skip_terminate: + return -EHOSTDOWN; + } + + cifs_server_dbg(VFS, + "server does not support multichannel anymore. Disable all other channels\n"); + cifs_disable_secondary_channels(ses); + + + return 0; +} + static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct TCP_Server_Info *server, bool from_reconnect) @@ -164,8 +214,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct nls_table *nls_codepage = NULL; struct cifs_ses *ses; int xid; - struct TCP_Server_Info *pserver; - unsigned int chan_index; /* * SMB2s NegProt, SessSetup, Logoff do not have tcon yet so @@ -310,44 +358,11 @@ again: */ if (ses->chan_count > 1 && !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { - if (SERVER_IS_CHAN(server)) { - cifs_dbg(VFS, "server %s does not support " \ - "multichannel anymore. 
skipping secondary channel\n", - ses->server->hostname); - - spin_lock(&ses->chan_lock); - chan_index = cifs_ses_get_chan_index(ses, server); - if (chan_index == CIFS_INVAL_CHAN_INDEX) { - spin_unlock(&ses->chan_lock); - goto skip_terminate; - } - - ses->chans[chan_index].server = NULL; - spin_unlock(&ses->chan_lock); - - /* - * the above reference of server by channel - * needs to be dropped without holding chan_lock - * as cifs_put_tcp_session takes a higher lock - * i.e. cifs_tcp_ses_lock - */ - cifs_put_tcp_session(server, from_reconnect); - - server->terminate = true; - cifs_signal_cifsd_for_reconnect(server, false); - - /* mark primary server as needing reconnect */ - pserver = server->primary_server; - cifs_signal_cifsd_for_reconnect(pserver, false); - -skip_terminate: + rc = cifs_chan_skip_or_disable(ses, server, + from_reconnect); + if (rc) { mutex_unlock(&ses->session_mutex); - rc = -EHOSTDOWN; goto out; - } else { - cifs_server_dbg(VFS, "does not support " \ - "multichannel anymore. disabling all other channels\n"); - cifs_disable_secondary_channels(ses); } } @@ -384,6 +399,15 @@ skip_sess_setup: goto out; } + spin_lock(&ses->ses_lock); + if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) { + spin_unlock(&ses->ses_lock); + mutex_unlock(&ses->session_mutex); + goto skip_add_channels; + } + ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS; + spin_unlock(&ses->ses_lock); + if (!rc && (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { mutex_unlock(&ses->session_mutex); @@ -395,14 +419,29 @@ skip_sess_setup: rc = SMB3_request_interfaces(xid, tcon, false); free_xid(xid); - if (rc) + if (rc == -EOPNOTSUPP && ses->chan_count > 1) { + /* + * some servers like Azure SMB server do not advertise + * that multichannel has been disabled with server + * capabilities, rather return STATUS_NOT_IMPLEMENTED. 
+ * treat this as server not supporting multichannel + */ + + rc = cifs_chan_skip_or_disable(ses, server, + from_reconnect); + goto skip_add_channels; + } else if (rc) cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", __func__, rc); if (ses->chan_max > ses->chan_count && + ses->iface_count && !SERVER_IS_CHAN(server)) { - if (ses->chan_count == 1) + if (ses->chan_count == 1) { cifs_server_dbg(VFS, "supports multichannel now\n"); + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); + } cifs_try_adding_channels(ses); } @@ -410,6 +449,11 @@ skip_sess_setup: mutex_unlock(&ses->session_mutex); } +skip_add_channels: + spin_lock(&ses->ses_lock); + ses->flags &= ~CIFS_SES_FLAG_SCALE_CHANNELS; + spin_unlock(&ses->ses_lock); + if (smb2_command != SMB2_INTERNAL_CMD) mod_delayed_work(cifsiod_wq, &server->reconnect, 0); @@ -1958,10 +2002,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, __le16 *unc_path = NULL; int flags = 0; unsigned int total_len; - struct TCP_Server_Info *server; - - /* always use master channel */ - server = ses->server; + struct TCP_Server_Info *server = cifs_pick_channel(ses); cifs_dbg(FYI, "TCON\n"); @@ -2094,6 +2135,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) struct smb2_tree_disconnect_req *req; /* response is trivial */ int rc = 0; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); int flags = 0; unsigned int total_len; struct kvec iov[1]; @@ -2116,7 +2158,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) invalidate_all_cached_dirs(tcon); - rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server, + rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, server, (void **) &req, &total_len); if (rc) @@ -2134,7 +2176,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) rqst.rq_iov = iov; rqst.rq_nvec = 1; - rc = cifs_send_recv(xid, ses, ses->server, + rc = cifs_send_recv(xid, ses, 
server, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { @@ -2279,7 +2321,7 @@ int smb2_parse_contexts(struct TCP_Server_Info *server, noff = le16_to_cpu(cc->NameOffset); nlen = le16_to_cpu(cc->NameLength); - if (noff + nlen >= doff) + if (noff + nlen > doff) return -EINVAL; name = (char *)cc + noff; @@ -2362,8 +2404,13 @@ create_durable_v2_buf(struct cifs_open_parms *oparms) */ buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout); buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); - generate_random_uuid(buf->dcontext.CreateGuid); - memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); + + /* for replay, we should not overwrite the existing create guid */ + if (!oparms->replay) { + generate_random_uuid(buf->dcontext.CreateGuid); + memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); + } else + memcpy(buf->dcontext.CreateGuid, pfid->create_guid, 16); /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */ buf->Name[0] = 'D'; @@ -2736,7 +2783,14 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, int flags = 0; unsigned int total_len; __le16 *utf16_path = NULL; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + n_iov = 2; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "mkdir\n"); @@ -2840,6 +2894,10 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, /* no need to inc num_remote_opens because we close it just below */ trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, full_path, CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); + + if (retries) + smb2_set_replay(server, &rqst); + /* resource #4: response buffer */ rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -2877,6 +2935,11 @@ err_free_req: cifs_small_buf_release(req); err_free_path: kfree(utf16_path); + + if (is_replayable_error(rc) && + 
smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3072,12 +3135,19 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, struct smb2_create_rsp *rsp = NULL; struct cifs_tcon *tcon = oparms->tcon; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct kvec iov[SMB2_CREATE_IOV_SIZE]; struct kvec rsp_iov = {NULL, 0}; int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); + oparms->replay = !!(retries); cifs_dbg(FYI, "create/open\n"); if (!ses || !server) @@ -3099,6 +3169,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid, oparms->path, oparms->create_options, oparms->desired_access); + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -3152,6 +3225,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, creat_exit: SMB2_open_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3276,15 +3354,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; - - cifs_dbg(FYI, "SMB2 IOCTL\n"); - - if (out_data != NULL) - *out_data = NULL; - - /* zero out returned data len, in case of error */ - if (plen) - *plen = 0; + int retries = 0, cur_sleep = 1; if (!tcon) return -EIO; @@ -3293,10 +3363,23 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (!ses) return -EIO; +replay_again: + /* reinitialize for possible replay */ + flags = 0; server = 
cifs_pick_channel(ses); + if (!server) return -EIO; + cifs_dbg(FYI, "SMB2 IOCTL\n"); + + if (out_data != NULL) + *out_data = NULL; + + /* zero out returned data len, in case of error */ + if (plen) + *plen = 0; + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -3311,6 +3394,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (rc) goto ioctl_exit; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -3380,6 +3466,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, ioctl_exit: SMB2_ioctl_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3451,13 +3542,20 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, struct smb_rqst rqst; struct smb2_close_rsp *rsp = NULL; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct kvec iov[1]; struct kvec rsp_iov; int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; bool query_attrs = false; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + query_attrs = false; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "Close\n"); @@ -3483,6 +3581,9 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto close_exit; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_close_rsp *)rsp_iov.iov_base; @@ -3516,6 +3617,11 @@ close_exit: cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n", persistent_fid, tmp_rc); } + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3646,12 +3752,19 @@ query_info(const 
unsigned int xid, struct cifs_tcon *tcon, struct TCP_Server_Info *server; int flags = 0; bool allocated = false; + int retries = 0, cur_sleep = 1; cifs_dbg(FYI, "Query Info\n"); if (!ses) return -EIO; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + allocated = false; server = cifs_pick_channel(ses); + if (!server) return -EIO; @@ -3673,6 +3786,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid, ses->Suid, info_class, (__u32)info_type); + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; @@ -3715,6 +3831,11 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, qinf_exit: SMB2_query_info_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3815,7 +3936,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u32 *plen /* returned data len */) { struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct smb_rqst rqst; struct smb2_change_notify_rsp *smb_rsp; struct kvec iov[1]; @@ -3823,6 +3944,12 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, int resp_buftype = CIFS_NO_BUFFER; int flags = 0; int rc = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "change notify\n"); if (!ses || !server) @@ -3847,6 +3974,10 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_notify_enter(xid, persistent_fid, tcon->tid, ses->Suid, (u8)watch_tree, completion_filter); + + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, 
&rsp_iov); @@ -3881,6 +4012,11 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, if (rqst.rq_iov) cifs_small_buf_release(rqst.rq_iov[0].iov_base); /* request */ free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3918,7 +4054,7 @@ void smb2_reconnect_server(struct work_struct *work) struct cifs_ses *ses, *ses2; struct cifs_tcon *tcon, *tcon2; struct list_head tmp_list, tmp_ses_list; - bool tcon_exist = false, ses_exist = false; + bool ses_exist = false; bool tcon_selected = false; int rc; bool resched = false; @@ -3964,7 +4100,7 @@ void smb2_reconnect_server(struct work_struct *work) if (tcon->need_reconnect || tcon->need_reopen_files) { tcon->tc_count++; list_add_tail(&tcon->rlist, &tmp_list); - tcon_selected = tcon_exist = true; + tcon_selected = true; } } /* @@ -3973,7 +4109,7 @@ void smb2_reconnect_server(struct work_struct *work) */ if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) { list_add_tail(&ses->tcon_ipc->rlist, &tmp_list); - tcon_selected = tcon_exist = true; + tcon_selected = true; cifs_smb_ses_inc_refcount(ses); } /* @@ -4123,10 +4259,16 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, struct smb_rqst rqst; struct kvec iov[1]; struct kvec rsp_iov = {NULL, 0}; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int resp_buftype = CIFS_NO_BUFFER; int flags = 0; int rc = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "flush\n"); if (!ses || !(ses->server)) @@ -4146,6 +4288,10 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto flush_exit; trace_smb3_flush_enter(xid, persistent_fid, tcon->tid, ses->Suid); + + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, 
&rqst, &resp_buftype, flags, &rsp_iov); @@ -4160,6 +4306,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, flush_exit: SMB2_flush_free(&rqst); free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -4639,7 +4790,7 @@ smb2_async_writev(struct cifs_writedata *wdata, struct cifs_io_parms *io_parms = NULL; int credit_request; - if (!wdata->server) + if (!wdata->server || wdata->replay) server = wdata->server = cifs_pick_channel(tcon->ses); /* @@ -4724,6 +4875,8 @@ smb2_async_writev(struct cifs_writedata *wdata, rqst.rq_nvec = 1; rqst.rq_iter = wdata->iter; rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter); + if (wdata->replay) + smb2_set_replay(server, &rqst); #ifdef CONFIG_CIFS_SMB_DIRECT if (wdata->mr) iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); @@ -4797,18 +4950,21 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, int flags = 0; unsigned int total_len; struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; +replay_again: + /* reinitialize for possible replay */ + flags = 0; *nbytes = 0; - - if (n_vec < 1) - return rc; - if (!io_parms->server) io_parms->server = cifs_pick_channel(io_parms->tcon->ses); server = io_parms->server; if (server == NULL) return -ECONNABORTED; + if (n_vec < 1) + return rc; + rc = smb2_plain_req_init(SMB2_WRITE, io_parms->tcon, server, (void **) &req, &total_len); if (rc) @@ -4842,6 +4998,9 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rqst.rq_iov = iov; rqst.rq_nvec = n_vec + 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, io_parms->tcon->ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -4866,6 +5025,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_small_buf_release(req); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + 
smb2_should_replay(io_parms->tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5048,6 +5212,9 @@ int SMB2_query_directory_init(const unsigned int xid, case SMB_FIND_FILE_POSIX_INFO: req->FileInformationClass = SMB_FIND_FILE_POSIX_INFO; break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: + req->FileInformationClass = FILE_FULL_DIRECTORY_INFORMATION; + break; default: cifs_tcon_dbg(VFS, "info level %u isn't supported\n", info_level); @@ -5117,6 +5284,9 @@ smb2_parse_query_directory(struct cifs_tcon *tcon, /* note that posix payload are variable size */ info_buf_size = sizeof(struct smb2_posix_info); break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: + info_buf_size = sizeof(FILE_FULL_DIRECTORY_INFO); + break; default: cifs_tcon_dbg(VFS, "info level %u isn't supported\n", srch_inf->info_level); @@ -5177,8 +5347,14 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, struct kvec rsp_iov; int rc = 0; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); if (!ses || !(ses->server)) return -EIO; @@ -5198,6 +5374,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto qdir_exit; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; @@ -5232,6 +5411,11 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, qdir_exit: SMB2_query_directory_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5298,8 +5482,14 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = 
tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); if (!ses || !server) return -EIO; @@ -5327,6 +5517,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } + if (retries) + smb2_set_replay(server, &rqst); rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, @@ -5342,6 +5534,11 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, free_rsp_buf(resp_buftype, rsp); kfree(iov); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5394,12 +5591,18 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_oplock_break *req = NULL; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = CIFS_OBREAK_OP; unsigned int total_len; struct kvec iov[1]; struct kvec rsp_iov; int resp_buf_type; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_OBREAK_OP; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "SMB2_oplock_break\n"); rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, server, @@ -5424,15 +5627,21 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); - if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc); } + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5518,9 +5727,15 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, 
int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; FILE_SYSTEM_POSIX_INFO *info = NULL; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); rc = build_qfs_info_req(&iov, tcon, server, FS_POSIX_INFORMATION, @@ -5536,6 +5751,9 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); free_qfs_info_req(&iov); @@ -5555,6 +5773,11 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, posix_qfsinf_exit: free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5569,9 +5792,15 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct smb2_fs_full_size_info *info = NULL; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); rc = build_qfs_info_req(&iov, tcon, server, FS_FULL_SIZE_INFORMATION, @@ -5587,6 +5816,9 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); free_qfs_info_req(&iov); @@ -5606,6 +5838,11 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, qfsinf_exit: free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + 
goto replay_again; + return rc; } @@ -5620,9 +5857,15 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype, max_len, min_len; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; unsigned int rsp_len, offset; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); if (level == FS_DEVICE_INFORMATION) { max_len = sizeof(FILE_SYSTEM_DEVICE_INFO); @@ -5654,6 +5897,9 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); free_qfs_info_req(&iov); @@ -5691,6 +5937,11 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, qfsattr_exit: free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5708,7 +5959,13 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, unsigned int count; int flags = CIFS_NO_RSP_BUF; unsigned int total_len; - struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); + struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_NO_RSP_BUF; + server = cifs_pick_channel(tcon->ses); cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock); @@ -5739,6 +5996,9 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 2; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, tcon->ses, server, &rqst, &resp_buf_type, flags, &rsp_iov); @@ -5750,6 +6010,10 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, tcon->ses->Suid, rc); } + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, 
&cur_sleep)) + goto replay_again; + return rc; } diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 343ada691e76..b3069911e9dd 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -122,6 +122,11 @@ extern unsigned long smb_rqst_len(struct TCP_Server_Info *server, extern void smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst); extern void smb2_set_related(struct smb_rqst *rqst); +extern void smb2_set_replay(struct TCP_Server_Info *server, + struct smb_rqst *rqst); +extern bool smb2_should_replay(struct cifs_tcon *tcon, + int *pretries, + int *pcur_sleep); /* * SMB2 Worker functions - most of protocol specific implementation details @@ -299,9 +304,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, - struct cifs_sid *owner, - struct cifs_sid *group); + struct cifs_open_info_data *data); int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); diff --git a/fs/smb/client/smb2status.h b/fs/smb/client/smb2status.h index a9e958166fc5..9c6d79b0bd49 100644 --- a/fs/smb/client/smb2status.h +++ b/fs/smb/client/smb2status.h @@ -982,6 +982,8 @@ struct ntstatus { #define STATUS_INVALID_TASK_INDEX cpu_to_le32(0xC0000501) #define STATUS_THREAD_ALREADY_IN_TASK cpu_to_le32(0xC0000502) #define STATUS_CALLBACK_BYPASS cpu_to_le32(0xC0000503) +#define STATUS_SERVER_UNAVAILABLE cpu_to_le32(0xC0000466) +#define STATUS_FILE_NOT_AVAILABLE cpu_to_le32(0xC0000467) #define STATUS_PORT_CLOSED cpu_to_le32(0xC0000700) #define STATUS_MESSAGE_LOST cpu_to_le32(0xC0000701) #define STATUS_INVALID_MESSAGE cpu_to_le32(0xC0000702) diff --git a/fs/smb/client/smbencrypt.c b/fs/smb/client/smbencrypt.c index f0ce26414f17..1d1ee9f18f37 100644 --- a/fs/smb/client/smbencrypt.c +++ b/fs/smb/client/smbencrypt.c @@ -26,13 +26,6 @@ #include 
"cifsproto.h" #include "../common/md4.h" -#ifndef false -#define false 0 -#endif -#ifndef true -#define true 1 -#endif - /* following came from the other byteorder.h to avoid include conflicts */ #define CVAL(buf,pos) (((unsigned char *)(buf))[pos]) #define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 4f717ad7c21b..994d70193432 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -400,10 +400,17 @@ unmask: server->conn_id, server->hostname); } smbd_done: - if (rc < 0 && rc != -EINTR) + /* + * there's hardly any use for the layers above to know the + * actual error code here. All they should do at this point is + * to retry the connection and hope it goes away. + */ + if (rc < 0 && rc != -EINTR && rc != -EAGAIN) { cifs_server_dbg(VFS, "Error %d sending data on socket to server\n", rc); - else if (rc > 0) + rc = -ECONNABORTED; + cifs_signal_cifsd_for_reconnect(server, false); + } else if (rc > 0) rc = 0; out: cifs_in_send_dec(server); @@ -428,8 +435,8 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, if (!(flags & CIFS_TRANSFORM_REQ)) return __smb_send_rqst(server, num_rqst, rqst); - if (num_rqst > MAX_COMPOUND - 1) - return -ENOMEM; + if (WARN_ON_ONCE(num_rqst > MAX_COMPOUND - 1)) + return -EIO; if (!server->ops->init_transform_rq) { cifs_server_dbg(VFS, "Encryption requested but transform callback is missing\n"); @@ -1026,6 +1033,9 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) if (!server || server->terminate) continue; + if (CIFS_CHAN_NEEDS_RECONNECT(ses, i)) + continue; + /* * strictly speaking, we should pick up req_lock to read * server->in_flight. 
But it shouldn't matter much here if we diff --git a/fs/smb/server/asn1.c b/fs/smb/server/asn1.c index 4a4b2b03ff33..b931a99ab9c8 100644 --- a/fs/smb/server/asn1.c +++ b/fs/smb/server/asn1.c @@ -214,10 +214,15 @@ static int ksmbd_neg_token_alloc(void *context, size_t hdrlen, { struct ksmbd_conn *conn = context; + if (!vlen) + return -EINVAL; + conn->mechToken = kmemdup_nul(value, vlen, GFP_KERNEL); if (!conn->mechToken) return -ENOMEM; + conn->mechTokenLen = (unsigned int)vlen; + return 0; } diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index d311c2ee10bd..09e1e7771592 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -416,13 +416,7 @@ static void stop_sessions(void) again: down_read(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) { - struct task_struct *task; - t = conn->transport; - task = t->handler; - if (task) - ksmbd_debug(CONN, "Stop session handler %s/%d\n", - task->comm, task_pid_nr(task)); ksmbd_conn_set_exiting(conn); if (t->ops->shutdown) { up_read(&conn_list_lock); diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 3c005246a32e..0e04cf8b1d89 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -88,6 +88,7 @@ struct ksmbd_conn { __u16 dialect; char *mechToken; + unsigned int mechTokenLen; struct ksmbd_conn_ops *conn_ops; @@ -134,7 +135,6 @@ struct ksmbd_transport_ops { struct ksmbd_transport { struct ksmbd_conn *conn; struct ksmbd_transport_ops *ops; - struct task_struct *handler; }; #define KSMBD_TCP_RECV_TIMEOUT (7 * HZ) diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index b7521e41402e..0ebf91ffa236 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -304,7 +304,8 @@ enum ksmbd_event { KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST, KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15, - KSMBD_EVENT_MAX + __KSMBD_EVENT_MAX, + KSMBD_EVENT_MAX = __KSMBD_EVENT_MAX - 1 }; /* diff --git a/fs/smb/server/misc.c 
b/fs/smb/server/misc.c index 9e8afaa686e3..1a5faa6f6e7b 100644 --- a/fs/smb/server/misc.c +++ b/fs/smb/server/misc.c @@ -261,6 +261,7 @@ out_ascii: /** * ksmbd_extract_sharename() - get share name from tree connect request + * @um: pointer to a unicode_map structure for character encoding handling * @treename: buffer containing tree name and share name * * Return: share name on success, otherwise error diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 001926d3b348..53dfaac425c6 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -1197,6 +1197,12 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid, bool prev_op_has_lease; __le32 prev_op_state = 0; + /* Only v2 leases handle the directory */ + if (S_ISDIR(file_inode(fp->filp)->i_mode)) { + if (!lctx || lctx->version != 2) + return 0; + } + opinfo = alloc_opinfo(work, pid, tid); if (!opinfo) return -ENOMEM; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 3143819935dc..089527a8b4ff 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1414,7 +1414,10 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn, char *name; unsigned int name_off, name_len, secbuf_len; - secbuf_len = le16_to_cpu(req->SecurityBufferLength); + if (conn->use_spnego && conn->mechToken) + secbuf_len = conn->mechTokenLen; + else + secbuf_len = le16_to_cpu(req->SecurityBufferLength); if (secbuf_len < sizeof(struct authenticate_message)) { ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len); return NULL; @@ -1505,7 +1508,10 @@ static int ntlm_authenticate(struct ksmbd_work *work, struct authenticate_message *authblob; authblob = user_authblob(conn, req); - sz = le16_to_cpu(req->SecurityBufferLength); + if (conn->use_spnego && conn->mechToken) + sz = conn->mechTokenLen; + else + sz = le16_to_cpu(req->SecurityBufferLength); rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, conn, sess); if (rc) { set_user_flag(sess->user, 
KSMBD_USER_FLAG_BAD_PASSWORD); @@ -1778,8 +1784,7 @@ int smb2_sess_setup(struct ksmbd_work *work) negblob_off = le16_to_cpu(req->SecurityBufferOffset); negblob_len = le16_to_cpu(req->SecurityBufferLength); - if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer) || - negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) { + if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer)) { rc = -EINVAL; goto out_err; } @@ -1788,8 +1793,15 @@ int smb2_sess_setup(struct ksmbd_work *work) negblob_off); if (decode_negotiation_token(conn, negblob, negblob_len) == 0) { - if (conn->mechToken) + if (conn->mechToken) { negblob = (struct negotiate_message *)conn->mechToken; + negblob_len = conn->mechTokenLen; + } + } + + if (negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) { + rc = -EINVAL; + goto out_err; } if (server_conf.auth_mechs & conn->auth_mechs) { @@ -6161,8 +6173,10 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, offsetof(struct smb2_read_rsp, Buffer), aux_payload_buf, nbytes); - if (err) + if (err) { + kvfree(aux_payload_buf); goto out; + } kvfree(rpc_resp); } else { err = ksmbd_iov_pin_rsp(work, (void *)rsp, @@ -6372,8 +6386,10 @@ int smb2_read(struct ksmbd_work *work) err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, offsetof(struct smb2_read_rsp, Buffer), aux_payload_buf, nbytes); - if (err) + if (err) { + kvfree(aux_payload_buf); goto out; + } ksmbd_fd_put(work, fp); return 0; @@ -6748,10 +6764,10 @@ struct file_lock *smb_flock_init(struct file *f) locks_init_lock(fl); - fl->fl_owner = f; - fl->fl_pid = current->tgid; - fl->fl_file = f; - fl->fl_flags = FL_POSIX; + fl->c.flc_owner = f; + fl->c.flc_pid = current->tgid; + fl->c.flc_file = f; + fl->c.flc_flags = FL_POSIX; fl->fl_ops = NULL; fl->fl_lmops = NULL; @@ -6768,30 +6784,30 @@ static int smb2_set_flock_flags(struct file_lock *flock, int flags) case SMB2_LOCKFLAG_SHARED: ksmbd_debug(SMB, "received shared 
request\n"); cmd = F_SETLKW; - flock->fl_type = F_RDLCK; - flock->fl_flags |= FL_SLEEP; + flock->c.flc_type = F_RDLCK; + flock->c.flc_flags |= FL_SLEEP; break; case SMB2_LOCKFLAG_EXCLUSIVE: ksmbd_debug(SMB, "received exclusive request\n"); cmd = F_SETLKW; - flock->fl_type = F_WRLCK; - flock->fl_flags |= FL_SLEEP; + flock->c.flc_type = F_WRLCK; + flock->c.flc_flags |= FL_SLEEP; break; case SMB2_LOCKFLAG_SHARED | SMB2_LOCKFLAG_FAIL_IMMEDIATELY: ksmbd_debug(SMB, "received shared & fail immediately request\n"); cmd = F_SETLK; - flock->fl_type = F_RDLCK; + flock->c.flc_type = F_RDLCK; break; case SMB2_LOCKFLAG_EXCLUSIVE | SMB2_LOCKFLAG_FAIL_IMMEDIATELY: ksmbd_debug(SMB, "received exclusive & fail immediately request\n"); cmd = F_SETLK; - flock->fl_type = F_WRLCK; + flock->c.flc_type = F_WRLCK; break; case SMB2_LOCKFLAG_UNLOCK: ksmbd_debug(SMB, "received unlock request\n"); - flock->fl_type = F_UNLCK; + flock->c.flc_type = F_UNLCK; cmd = F_SETLK; break; } @@ -6829,13 +6845,13 @@ static void smb2_remove_blocked_lock(void **argv) struct file_lock *flock = (struct file_lock *)argv[0]; ksmbd_vfs_posix_lock_unblock(flock); - wake_up(&flock->fl_wait); + locks_wake_up(flock); } static inline bool lock_defer_pending(struct file_lock *fl) { /* check pending lock waiters */ - return waitqueue_active(&fl->fl_wait); + return waitqueue_active(&fl->c.flc_wait); } /** @@ -6926,8 +6942,8 @@ int smb2_lock(struct ksmbd_work *work) list_for_each_entry(cmp_lock, &lock_list, llist) { if (cmp_lock->fl->fl_start <= flock->fl_start && cmp_lock->fl->fl_end >= flock->fl_end) { - if (cmp_lock->fl->fl_type != F_UNLCK && - flock->fl_type != F_UNLCK) { + if (cmp_lock->fl->c.flc_type != F_UNLCK && + flock->c.flc_type != F_UNLCK) { pr_err("conflict two locks in one request\n"); err = -EINVAL; locks_free_lock(flock); @@ -6975,12 +6991,12 @@ int smb2_lock(struct ksmbd_work *work) list_for_each_entry(conn, &conn_list, conns_list) { spin_lock(&conn->llist_lock); list_for_each_entry_safe(cmp_lock, tmp2, 
&conn->lock_list, clist) { - if (file_inode(cmp_lock->fl->fl_file) != - file_inode(smb_lock->fl->fl_file)) + if (file_inode(cmp_lock->fl->c.flc_file) != + file_inode(smb_lock->fl->c.flc_file)) continue; - if (smb_lock->fl->fl_type == F_UNLCK) { - if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file && + if (lock_is_unlock(smb_lock->fl)) { + if (cmp_lock->fl->c.flc_file == smb_lock->fl->c.flc_file && cmp_lock->start == smb_lock->start && cmp_lock->end == smb_lock->end && !lock_defer_pending(cmp_lock->fl)) { @@ -6997,7 +7013,7 @@ int smb2_lock(struct ksmbd_work *work) continue; } - if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file) { + if (cmp_lock->fl->c.flc_file == smb_lock->fl->c.flc_file) { if (smb_lock->flags & SMB2_LOCKFLAG_SHARED) continue; } else { @@ -7039,7 +7055,7 @@ int smb2_lock(struct ksmbd_work *work) } up_read(&conn_list_lock); out_check_cl: - if (smb_lock->fl->fl_type == F_UNLCK && nolock) { + if (lock_is_unlock(smb_lock->fl) && nolock) { pr_err("Try to unlock nolocked range\n"); rsp->hdr.Status = STATUS_RANGE_NOT_LOCKED; goto out; @@ -7163,7 +7179,7 @@ out: struct file_lock *rlock = NULL; rlock = smb_flock_init(filp); - rlock->fl_type = F_UNLCK; + rlock->c.flc_type = F_UNLCK; rlock->fl_start = smb_lock->start; rlock->fl_end = smb_lock->end; diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index b49d47bdafc9..f29bb03f0dc4 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -74,7 +74,7 @@ static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info) static int handle_generic_event(struct sk_buff *skb, struct genl_info *info); static int ksmbd_ipc_heartbeat_request(void); -static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX] = { +static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = { [KSMBD_EVENT_UNSPEC] = { .len = 0, }, @@ -403,7 +403,7 @@ static int handle_generic_event(struct sk_buff *skb, struct genl_info *info) return -EPERM; #endif - if (type >= 
KSMBD_EVENT_MAX) { + if (type > KSMBD_EVENT_MAX) { WARN_ON(1); return -EINVAL; } diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index c5629a68c8b7..8faa25c6e129 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -2039,6 +2039,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) { struct smb_direct_transport *t; + struct task_struct *handler; int ret; if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { @@ -2056,11 +2057,11 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) if (ret) goto out_err; - KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, - KSMBD_TRANS(t)->conn, "ksmbd:r%u", - smb_direct_port); - if (IS_ERR(KSMBD_TRANS(t)->handler)) { - ret = PTR_ERR(KSMBD_TRANS(t)->handler); + handler = kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, "ksmbd:r%u", + smb_direct_port); + if (IS_ERR(handler)) { + ret = PTR_ERR(handler); pr_err("Can't start thread\n"); goto out_err; } diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index eff7a1d793f0..002a3f0dc7c5 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -185,6 +185,7 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) struct sockaddr *csin; int rc = 0; struct tcp_transport *t; + struct task_struct *handler; t = alloc_transport(client_sk); if (!t) { @@ -199,13 +200,13 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) goto out_error; } - KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, - KSMBD_TRANS(t)->conn, - "ksmbd:%u", - ksmbd_tcp_get_port(csin)); - if (IS_ERR(KSMBD_TRANS(t)->handler)) { + handler = kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, + "ksmbd:%u", + ksmbd_tcp_get_port(csin)); + if (IS_ERR(handler)) { pr_err("cannot start conn thread\n"); - rc = PTR_ERR(KSMBD_TRANS(t)->handler); + rc 
= PTR_ERR(handler); free_transport(t); } return rc; @@ -364,6 +365,7 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig, * @t: TCP transport instance * @buf: buffer to store read data from socket * @to_read: number of bytes to read from socket + * @max_retries: number of retries if reading from socket fails * * Return: on success return number of bytes read from socket, * otherwise return error number @@ -415,6 +417,7 @@ static void tcp_destroy_socket(struct socket *ksmbd_socket) /** * create_socket - create socket for ksmbd/0 + * @iface: interface to bind the created socket to * * Return: 0 on success, error number otherwise */ diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index a6961bfe3e13..c487e834331a 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -337,18 +337,18 @@ static int check_lock_range(struct file *filp, loff_t start, loff_t end, return 0; spin_lock(&ctx->flc_lock); - list_for_each_entry(flock, &ctx->flc_posix, fl_list) { + for_each_file_lock(flock, &ctx->flc_posix) { /* check conflict locks */ if (flock->fl_end >= start && end >= flock->fl_start) { - if (flock->fl_type == F_RDLCK) { + if (lock_is_read(flock)) { if (type == WRITE) { pr_err("not allow write by shared lock\n"); error = 1; goto out; } - } else if (flock->fl_type == F_WRLCK) { + } else if (lock_is_write(flock)) { /* check owner in lock */ - if (flock->fl_file != filp) { + if (flock->c.flc_file != filp) { error = 1; pr_err("not allow rw access by exclusive lock from other opens\n"); goto out; @@ -1837,13 +1837,13 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, void ksmbd_vfs_posix_lock_wait(struct file_lock *flock) { - wait_event(flock->fl_wait, !flock->fl_blocker); + wait_event(flock->c.flc_wait, !flock->c.flc_blocker); } int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout) { - return wait_event_interruptible_timeout(flock->fl_wait, - !flock->fl_blocker, + return 
wait_event_interruptible_timeout(flock->c.flc_wait, + !flock->c.flc_blocker, timeout); } diff --git a/fs/super.c b/fs/super.c index d35e85295489..ee05ab6b37e7 100644 --- a/fs/super.c +++ b/fs/super.c @@ -274,9 +274,10 @@ static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); - int i; - - for (i = 0; i < SB_FREEZE_LEVELS; i++) + security_sb_free(s); + put_user_ns(s->s_user_ns); + kfree(s->s_subtype); + for (int i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } @@ -296,9 +297,6 @@ static void destroy_unused_super(struct super_block *s) super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); - security_sb_free(s); - put_user_ns(s->s_user_ns); - kfree(s->s_subtype); shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); @@ -409,9 +407,6 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); - security_sb_free(s); - put_user_ns(s->s_user_ns); - kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); } } @@ -1532,16 +1527,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; - bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); - if (IS_ERR(bdev_handle)) { + bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); + if (IS_ERR(bdev_file)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); - return PTR_ERR(bdev_handle); + return PTR_ERR(bdev_file); } - bdev = bdev_handle->bdev; + bdev = file_bdev(bdev_file); /* * This really should be in blkdev_get_by_dev, but right now can't due @@ -1549,7 +1544,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, * writable from userspace 
even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - bdev_release(bdev_handle); + fput(bdev_file); return -EACCES; } @@ -1560,11 +1555,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - bdev_release(bdev_handle); + fput(bdev_file); return -EBUSY; } spin_lock(&sb_lock); - sb->s_bdev_handle = bdev_handle; + sb->s_bdev_file = bdev_file; sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) @@ -1680,7 +1675,7 @@ void kill_block_super(struct super_block *sb) generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); - bdev_release(sb->s_bdev_handle); + fput(sb->s_bdev_file); } } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index b6b6796e1616..4df2afa551dc 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -81,7 +81,7 @@ void sysfs_remove_dir(struct kobject *kobj) struct kernfs_node *kn = kobj->sd; /* - * In general, kboject owner is responsible for ensuring removal + * In general, kobject owner is responsible for ensuring removal * doesn't race with other operations and sysfs doesn't provide any * protection; however, when @kobj is used as a symlink target, the * symlinking entity usually doesn't own @kobj and thus has no diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 5a915b2e68f5..76bc2d5e75a9 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -336,7 +336,7 @@ int __init sysv_init_icache(void) { sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", sizeof(struct sysv_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, init_once); if (!sysv_inode_cachep) return -ENOMEM; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 410ab2a44d2f..19bcb51a2203 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -83,9 +83,6 @@ static inline sysv_zone_t *block_end(struct buffer_head *bh) return 
(sysv_zone_t*)((char*)bh->b_data + bh->b_size); } -/* - * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock) - */ static Indirect *get_branch(struct inode *inode, int depth, int offsets[], @@ -105,15 +102,18 @@ static Indirect *get_branch(struct inode *inode, bh = sb_bread(sb, block); if (!bh) goto failure; + read_lock(&pointers_lock); if (!verify_chain(chain, p)) goto changed; add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets); + read_unlock(&pointers_lock); if (!p->key) goto no_block; } return NULL; changed: + read_unlock(&pointers_lock); brelse(bh); *err = -EAGAIN; goto no_block; @@ -219,9 +219,7 @@ static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *b goto out; reread: - read_lock(&pointers_lock); partial = get_branch(inode, depth, offsets, chain, &err); - read_unlock(&pointers_lock); /* Simplest case - block found, no allocation needed */ if (!partial) { @@ -291,9 +289,9 @@ static Indirect *find_shared(struct inode *inode, *top = 0; for (k = depth; k > 1 && !offsets[k-1]; k--) ; + partial = get_branch(inode, k, offsets, chain, &err); write_lock(&pointers_lock); - partial = get_branch(inode, k, offsets, chain, &err); if (!partial) partial = chain + k-1; /* diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index f0677ea0ec24..110e8a272189 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -32,6 +32,18 @@ */ static DEFINE_MUTEX(eventfs_mutex); +/* Choose something "unique" ;-) */ +#define EVENTFS_FILE_INODE_INO 0x12c4e37 + +/* Just try to make something consistent and unique */ +static int eventfs_dir_ino(struct eventfs_inode *ei) +{ + if (!ei->ino) + ei->ino = get_next_ino(); + + return ei->ino; +} + /* * The eventfs_inode (ei) itself is protected by SRCU. It is released from * its parent's list and will have is_freed set (under eventfs_mutex). 
@@ -45,16 +57,55 @@ enum { EVENTFS_SAVE_MODE = BIT(16), EVENTFS_SAVE_UID = BIT(17), EVENTFS_SAVE_GID = BIT(18), + EVENTFS_TOPLEVEL = BIT(19), }; #define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1) +/* + * eventfs_inode reference count management. + * + * NOTE! We count only references from dentries, in the + * form 'dentry->d_fsdata'. There are also references from + * directory inodes ('ti->private'), but the dentry reference + * count is always a superset of the inode reference count. + */ +static void release_ei(struct kref *ref) +{ + struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref); + + WARN_ON_ONCE(!ei->is_freed); + + kfree(ei->entry_attrs); + kfree_const(ei->name); + kfree_rcu(ei, rcu); +} + +static inline void put_ei(struct eventfs_inode *ei) +{ + if (ei) + kref_put(&ei->kref, release_ei); +} + +static inline void free_ei(struct eventfs_inode *ei) +{ + if (ei) { + ei->is_freed = 1; + put_ei(ei); + } +} + +static inline struct eventfs_inode *get_ei(struct eventfs_inode *ei) +{ + if (ei) + kref_get(&ei->kref); + return ei; +} + static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); -static int dcache_dir_open_wrapper(struct inode *inode, struct file *file); -static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx); -static int eventfs_release(struct inode *inode, struct file *file); +static int eventfs_iterate(struct file *file, struct dir_context *ctx); static void update_attr(struct eventfs_attr *attr, struct iattr *iattr) { @@ -94,7 +145,7 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, /* Preallocate the children mode array if necessary */ if (!(dentry->d_inode->i_mode & S_IFDIR)) { if (!ei->entry_attrs) { - ei->entry_attrs = kzalloc(sizeof(*ei->entry_attrs) * ei->nr_entries, + ei->entry_attrs = kcalloc(ei->nr_entries, sizeof(*ei->entry_attrs), GFP_NOFS); if (!ei->entry_attrs) { ret = -ENOMEM; @@ -117,10 +168,17 @@ static int 
eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, * The events directory dentry is never freed, unless its * part of an instance that is deleted. It's attr is the * default for its child files and directories. - * Do not update it. It's not used for its own mode or ownership + * Do not update it. It's not used for its own mode or ownership. */ - if (!ei->is_events) + if (ei->is_events) { + /* But it still needs to know if it was modified */ + if (iattr->ia_valid & ATTR_UID) + ei->attr.mode |= EVENTFS_SAVE_UID; + if (iattr->ia_valid & ATTR_GID) + ei->attr.mode |= EVENTFS_SAVE_GID; + } else { update_attr(&ei->attr, iattr); + } } else { name = dentry->d_name.name; @@ -138,9 +196,63 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, return ret; } +static void update_top_events_attr(struct eventfs_inode *ei, struct super_block *sb) +{ + struct inode *root; + + /* Only update if the "events" was on the top level */ + if (!ei || !(ei->attr.mode & EVENTFS_TOPLEVEL)) + return; + + /* Get the tracefs root inode. 
*/ + root = d_inode(sb->s_root); + ei->attr.uid = root->i_uid; + ei->attr.gid = root->i_gid; +} + +static void set_top_events_ownership(struct inode *inode) +{ + struct tracefs_inode *ti = get_tracefs(inode); + struct eventfs_inode *ei = ti->private; + + /* The top events directory doesn't get automatically updated */ + if (!ei || !ei->is_events || !(ei->attr.mode & EVENTFS_TOPLEVEL)) + return; + + update_top_events_attr(ei, inode->i_sb); + + if (!(ei->attr.mode & EVENTFS_SAVE_UID)) + inode->i_uid = ei->attr.uid; + + if (!(ei->attr.mode & EVENTFS_SAVE_GID)) + inode->i_gid = ei->attr.gid; +} + +static int eventfs_get_attr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = d_backing_inode(dentry); + + set_top_events_ownership(inode); + + generic_fillattr(idmap, request_mask, inode, stat); + return 0; +} + +static int eventfs_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + set_top_events_ownership(inode); + return generic_permission(idmap, inode, mask); +} + static const struct inode_operations eventfs_root_dir_inode_operations = { .lookup = eventfs_root_lookup, .setattr = eventfs_set_attr, + .getattr = eventfs_get_attr, + .permission = eventfs_permission, }; static const struct inode_operations eventfs_file_inode_operations = { @@ -148,11 +260,9 @@ static const struct inode_operations eventfs_file_inode_operations = { }; static const struct file_operations eventfs_file_operations = { - .open = dcache_dir_open_wrapper, .read = generic_read_dir, - .iterate_shared = dcache_readdir_wrapper, + .iterate_shared = eventfs_iterate, .llseek = generic_file_llseek, - .release = eventfs_release, }; /* Return the evenfs_inode of the "events" directory */ @@ -160,10 +270,11 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry) { struct eventfs_inode *ei; - mutex_lock(&eventfs_mutex); do { - /* The parent 
always has an ei, except for events itself */ - ei = dentry->d_parent->d_fsdata; + // The parent is stable because we do not do renames + dentry = dentry->d_parent; + // ... and directories always have d_fsdata + ei = dentry->d_fsdata; /* * If the ei is being freed, the ownership of the children @@ -173,10 +284,10 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry) ei = NULL; break; } - - dentry = ei->dentry; + // Walk upwards until you find the events inode } while (!ei->is_events); - mutex_unlock(&eventfs_mutex); + + update_top_events_attr(ei, dentry->d_sb); return ei; } @@ -206,50 +317,11 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode, inode->i_gid = attr->gid; } -static void update_gid(struct eventfs_inode *ei, kgid_t gid, int level) -{ - struct eventfs_inode *ei_child; - - /* at most we have events/system/event */ - if (WARN_ON_ONCE(level > 3)) - return; - - ei->attr.gid = gid; - - if (ei->entry_attrs) { - for (int i = 0; i < ei->nr_entries; i++) { - ei->entry_attrs[i].gid = gid; - } - } - - /* - * Only eventfs_inode with dentries are updated, make sure - * all eventfs_inodes are updated. If one of the children - * do not have a dentry, this function must traverse it. - */ - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { - if (!ei_child->dentry) - update_gid(ei_child, gid, level + 1); - } -} - -void eventfs_update_gid(struct dentry *dentry, kgid_t gid) -{ - struct eventfs_inode *ei = dentry->d_fsdata; - int idx; - - idx = srcu_read_lock(&eventfs_srcu); - update_gid(ei, gid, 0); - srcu_read_unlock(&eventfs_srcu, idx); -} - /** - * create_file - create a file in the tracefs filesystem - * @name: the name of the file to create. + * lookup_file - look up a file in the tracefs filesystem + * @dentry: the dentry to look up * @mode: the permission that the file should have. * @attr: saved attributes changed by user - * @parent: parent dentry for this file. 
* @data: something that the caller will want to get to later on. * @fop: struct file_operations that should be used for this file. * @@ -257,30 +329,25 @@ void eventfs_update_gid(struct dentry *dentry, kgid_t gid) * directory. The inode.i_private pointer will point to @data in the open() * call. */ -static struct dentry *create_file(const char *name, umode_t mode, +static struct dentry *lookup_file(struct eventfs_inode *parent_ei, + struct dentry *dentry, + umode_t mode, struct eventfs_attr *attr, - struct dentry *parent, void *data, + void *data, const struct file_operations *fop) { struct tracefs_inode *ti; - struct dentry *dentry; struct inode *inode; if (!(mode & S_IFMT)) mode |= S_IFREG; if (WARN_ON_ONCE(!S_ISREG(mode))) - return NULL; - - WARN_ON_ONCE(!parent); - dentry = eventfs_start_creating(name, parent); - - if (IS_ERR(dentry)) - return dentry; + return ERR_PTR(-EIO); inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return eventfs_failed_creating(dentry); + return ERR_PTR(-ENOMEM); /* If the user updated the directory's attributes, use them */ update_inode_attr(dentry, inode, attr, mode); @@ -289,34 +356,36 @@ static struct dentry *create_file(const char *name, umode_t mode, inode->i_fop = fop; inode->i_private = data; + /* All files will have the same inode number */ + inode->i_ino = EVENTFS_FILE_INODE_INO; + ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; - d_instantiate(dentry, inode); - fsnotify_create(dentry->d_parent->d_inode, dentry); - return eventfs_end_creating(dentry); + + // Files have their parent's ei as their fsdata + dentry->d_fsdata = get_ei(parent_ei); + + d_add(dentry, inode); + return NULL; }; /** - * create_dir - create a dir in the tracefs filesystem + * lookup_dir_entry - look up a dir in the tracefs filesystem + * @dentry: the directory to look up * @ei: the eventfs_inode that represents the directory to create - * @parent: parent dentry for this file. 
* - * This function will create a dentry for a directory represented by + * This function will look up a dentry for a directory represented by * a eventfs_inode. */ -static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent) +static struct dentry *lookup_dir_entry(struct dentry *dentry, + struct eventfs_inode *pei, struct eventfs_inode *ei) { struct tracefs_inode *ti; - struct dentry *dentry; struct inode *inode; - dentry = eventfs_start_creating(ei->name, parent); - if (IS_ERR(dentry)) - return dentry; - inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return eventfs_failed_creating(dentry); + return ERR_PTR(-ENOMEM); /* If the user updated the directory's attributes, use them */ update_inode_attr(dentry, inode, &ei->attr, @@ -325,247 +394,72 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent inode->i_op = &eventfs_root_dir_inode_operations; inode->i_fop = &eventfs_file_operations; + /* All directories will have the same inode number */ + inode->i_ino = eventfs_dir_ino(ei); + ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; + /* Only directories have ti->private set to an ei, not files */ + ti->private = ei; - inc_nlink(inode); - d_instantiate(dentry, inode); - inc_nlink(dentry->d_parent->d_inode); - fsnotify_mkdir(dentry->d_parent->d_inode, dentry); - return eventfs_end_creating(dentry); + dentry->d_fsdata = get_ei(ei); + + d_add(dentry, inode); + return NULL; } -static void free_ei(struct eventfs_inode *ei) +static inline struct eventfs_inode *alloc_ei(const char *name) { - kfree_const(ei->name); - kfree(ei->d_children); - kfree(ei->entry_attrs); - kfree(ei); + struct eventfs_inode *ei = kzalloc(sizeof(*ei), GFP_KERNEL); + + if (!ei) + return NULL; + + ei->name = kstrdup_const(name, GFP_KERNEL); + if (!ei->name) { + kfree(ei); + return NULL; + } + kref_init(&ei->kref); + return ei; } /** - * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode - * @ti: the 
tracefs_inode of the dentry + * eventfs_d_release - dentry is going away * @dentry: dentry which has the reference to remove. * * Remove the association between a dentry from an eventfs_inode. */ -void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) +void eventfs_d_release(struct dentry *dentry) { - struct eventfs_inode *ei; - int i; - - mutex_lock(&eventfs_mutex); - - ei = dentry->d_fsdata; - if (!ei) - goto out; - - /* This could belong to one of the files of the ei */ - if (ei->dentry != dentry) { - for (i = 0; i < ei->nr_entries; i++) { - if (ei->d_children[i] == dentry) - break; - } - if (WARN_ON_ONCE(i == ei->nr_entries)) - goto out; - ei->d_children[i] = NULL; - } else if (ei->is_freed) { - free_ei(ei); - } else { - ei->dentry = NULL; - } - - dentry->d_fsdata = NULL; - out: - mutex_unlock(&eventfs_mutex); + put_ei(dentry->d_fsdata); } /** - * create_file_dentry - create a dentry for a file of an eventfs_inode + * lookup_file_dentry - create a dentry for a file of an eventfs_inode * @ei: the eventfs_inode that the file will be created under - * @idx: the index into the d_children[] of the @ei + * @idx: the index into the entry_attrs[] of the @ei * @parent: The parent dentry of the created file. * @name: The name of the file to create * @mode: The mode of the file. * @data: The data to use to set the inode of the file with on open() * @fops: The fops of the file to be created. - * @lookup: If called by the lookup routine, in which case, dput() the created dentry. * * Create a dentry for a file of an eventfs_inode @ei and place it into the - * address located at @e_dentry. If the @e_dentry already has a dentry, then - * just do a dget() on it and return. Otherwise create the dentry and attach it. + * address located at @e_dentry. 
*/ static struct dentry * -create_file_dentry(struct eventfs_inode *ei, int idx, - struct dentry *parent, const char *name, umode_t mode, void *data, - const struct file_operations *fops, bool lookup) +lookup_file_dentry(struct dentry *dentry, + struct eventfs_inode *ei, int idx, + umode_t mode, void *data, + const struct file_operations *fops) { struct eventfs_attr *attr = NULL; - struct dentry **e_dentry = &ei->d_children[idx]; - struct dentry *dentry; - - WARN_ON_ONCE(!inode_is_locked(parent->d_inode)); - mutex_lock(&eventfs_mutex); - if (ei->is_freed) { - mutex_unlock(&eventfs_mutex); - return NULL; - } - /* If the e_dentry already has a dentry, use it */ - if (*e_dentry) { - /* lookup does not need to up the ref count */ - if (!lookup) - dget(*e_dentry); - mutex_unlock(&eventfs_mutex); - return *e_dentry; - } - - /* ei->entry_attrs are protected by SRCU */ if (ei->entry_attrs) attr = &ei->entry_attrs[idx]; - mutex_unlock(&eventfs_mutex); - - dentry = create_file(name, mode, attr, parent, data, fops); - - mutex_lock(&eventfs_mutex); - - if (IS_ERR_OR_NULL(dentry)) { - /* - * When the mutex was released, something else could have - * created the dentry for this e_dentry. In which case - * use that one. - * - * If ei->is_freed is set, the e_dentry is currently on its - * way to being freed, don't return it. If e_dentry is NULL - * it means it was already freed. - */ - if (ei->is_freed) - dentry = NULL; - else - dentry = *e_dentry; - /* The lookup does not need to up the dentry refcount */ - if (dentry && !lookup) - dget(dentry); - mutex_unlock(&eventfs_mutex); - return dentry; - } - - if (!*e_dentry && !ei->is_freed) { - *e_dentry = dentry; - dentry->d_fsdata = ei; - } else { - /* - * Should never happen unless we get here due to being freed. - * Otherwise it means two dentries exist with the same name. 
- */ - WARN_ON_ONCE(!ei->is_freed); - dentry = NULL; - } - mutex_unlock(&eventfs_mutex); - - if (lookup) - dput(dentry); - - return dentry; -} - -/** - * eventfs_post_create_dir - post create dir routine - * @ei: eventfs_inode of recently created dir - * - * Map the meta-data of files within an eventfs dir to their parent dentry - */ -static void eventfs_post_create_dir(struct eventfs_inode *ei) -{ - struct eventfs_inode *ei_child; - struct tracefs_inode *ti; - - lockdep_assert_held(&eventfs_mutex); - - /* srcu lock already held */ - /* fill parent-child relation */ - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { - ei_child->d_parent = ei->dentry; - } - - ti = get_tracefs(ei->dentry->d_inode); - ti->private = ei; -} - -/** - * create_dir_dentry - Create a directory dentry for the eventfs_inode - * @pei: The eventfs_inode parent of ei. - * @ei: The eventfs_inode to create the directory for - * @parent: The dentry of the parent of this directory - * @lookup: True if this is called by the lookup code - * - * This creates and attaches a directory dentry to the eventfs_inode @ei. - */ -static struct dentry * -create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, - struct dentry *parent, bool lookup) -{ - struct dentry *dentry = NULL; - - WARN_ON_ONCE(!inode_is_locked(parent->d_inode)); - - mutex_lock(&eventfs_mutex); - if (pei->is_freed || ei->is_freed) { - mutex_unlock(&eventfs_mutex); - return NULL; - } - if (ei->dentry) { - /* If the dentry already has a dentry, use it */ - dentry = ei->dentry; - /* lookup does not need to up the ref count */ - if (!lookup) - dget(dentry); - mutex_unlock(&eventfs_mutex); - return dentry; - } - mutex_unlock(&eventfs_mutex); - - dentry = create_dir(ei, parent); - - mutex_lock(&eventfs_mutex); - - if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) { - /* - * When the mutex was released, something else could have - * created the dentry for this e_dentry. 
In which case - * use that one. - * - * If ei->is_freed is set, the e_dentry is currently on its - * way to being freed. - */ - dentry = ei->dentry; - if (dentry && !lookup) - dget(dentry); - mutex_unlock(&eventfs_mutex); - return dentry; - } - - if (!ei->dentry && !ei->is_freed) { - ei->dentry = dentry; - eventfs_post_create_dir(ei); - dentry->d_fsdata = ei; - } else { - /* - * Should never happen unless we get here due to being freed. - * Otherwise it means two dentries exist with the same name. - */ - WARN_ON_ONCE(!ei->is_freed); - dentry = NULL; - } - mutex_unlock(&eventfs_mutex); - - if (lookup) - dput(dentry); - - return dentry; + return lookup_file(ei, dentry, mode, attr, data, fops); } /** @@ -582,250 +476,153 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - const struct file_operations *fops; - const struct eventfs_entry *entry; struct eventfs_inode *ei_child; struct tracefs_inode *ti; struct eventfs_inode *ei; - struct dentry *ei_dentry = NULL; - struct dentry *ret = NULL; const char *name = dentry->d_name.name; - bool created = false; - umode_t mode; - void *data; - int idx; - int i; - int r; + struct dentry *result = NULL; ti = get_tracefs(dir); if (!(ti->flags & TRACEFS_EVENT_INODE)) - return NULL; - - /* Grab srcu to prevent the ei from going away */ - idx = srcu_read_lock(&eventfs_srcu); + return ERR_PTR(-EIO); - /* - * Grab the eventfs_mutex to consistent value from ti->private. 
- * This s - */ mutex_lock(&eventfs_mutex); - ei = READ_ONCE(ti->private); - if (ei && !ei->is_freed) - ei_dentry = READ_ONCE(ei->dentry); - mutex_unlock(&eventfs_mutex); - if (!ei || !ei_dentry) + ei = ti->private; + if (!ei || ei->is_freed) goto out; - data = ei->data; - - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { + list_for_each_entry(ei_child, &ei->children, list) { if (strcmp(ei_child->name, name) != 0) continue; - ret = simple_lookup(dir, dentry, flags); - if (IS_ERR(ret)) + if (ei_child->is_freed) goto out; - create_dir_dentry(ei, ei_child, ei_dentry, true); - created = true; - break; - } - - if (created) + result = lookup_dir_entry(dentry, ei, ei_child); goto out; - - for (i = 0; i < ei->nr_entries; i++) { - entry = &ei->entries[i]; - if (strcmp(name, entry->name) == 0) { - void *cdata = data; - mutex_lock(&eventfs_mutex); - /* If ei->is_freed, then the event itself may be too */ - if (!ei->is_freed) - r = entry->callback(name, &mode, &cdata, &fops); - else - r = -1; - mutex_unlock(&eventfs_mutex); - if (r <= 0) - continue; - ret = simple_lookup(dir, dentry, flags); - if (IS_ERR(ret)) - goto out; - create_file_dentry(ei, i, ei_dentry, name, mode, cdata, - fops, true); - break; - } } - out: - srcu_read_unlock(&eventfs_srcu, idx); - return ret; -} - -struct dentry_list { - void *cursor; - struct dentry **dentries; -}; -/** - * eventfs_release - called to release eventfs file/dir - * @inode: inode to be released - * @file: file to be released (not used) - */ -static int eventfs_release(struct inode *inode, struct file *file) -{ - struct tracefs_inode *ti; - struct dentry_list *dlist = file->private_data; - void *cursor; - int i; + for (int i = 0; i < ei->nr_entries; i++) { + void *data; + umode_t mode; + const struct file_operations *fops; + const struct eventfs_entry *entry = &ei->entries[i]; - ti = get_tracefs(inode); - if (!(ti->flags & TRACEFS_EVENT_INODE)) - return -EINVAL; + if (strcmp(name, 
entry->name) != 0) + continue; - if (WARN_ON_ONCE(!dlist)) - return -EINVAL; + data = ei->data; + if (entry->callback(name, &mode, &data, &fops) <= 0) + goto out; - for (i = 0; dlist->dentries && dlist->dentries[i]; i++) { - dput(dlist->dentries[i]); + result = lookup_file_dentry(dentry, ei, i, mode, data, fops); + goto out; } - - cursor = dlist->cursor; - kfree(dlist->dentries); - kfree(dlist); - file->private_data = cursor; - return dcache_dir_close(inode, file); -} - -static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt) -{ - struct dentry **tmp; - - tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_NOFS); - if (!tmp) - return -1; - tmp[cnt] = d; - tmp[cnt + 1] = NULL; - *dentries = tmp; - return 0; + out: + mutex_unlock(&eventfs_mutex); + return result; } -/** - * dcache_dir_open_wrapper - eventfs open wrapper - * @inode: not used - * @file: dir to be opened (to create it's children) - * - * Used to dynamic create file/dir with-in @file, all the - * file/dir will be created. If already created then references - * will be increased +/* + * Walk the children of a eventfs_inode to fill in getdents(). 
*/ -static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) +static int eventfs_iterate(struct file *file, struct dir_context *ctx) { const struct file_operations *fops; + struct inode *f_inode = file_inode(file); const struct eventfs_entry *entry; struct eventfs_inode *ei_child; struct tracefs_inode *ti; struct eventfs_inode *ei; - struct dentry_list *dlist; - struct dentry **dentries = NULL; - struct dentry *parent = file_dentry(file); - struct dentry *d; - struct inode *f_inode = file_inode(file); - const char *name = parent->d_name.name; + const char *name; umode_t mode; - void *data; - int cnt = 0; int idx; - int ret; - int i; - int r; + int ret = -EINVAL; + int ino; + int i, r, c; + + if (!dir_emit_dots(file, ctx)) + return 0; ti = get_tracefs(f_inode); if (!(ti->flags & TRACEFS_EVENT_INODE)) return -EINVAL; - if (WARN_ON_ONCE(file->private_data)) - return -EINVAL; + c = ctx->pos - 2; idx = srcu_read_lock(&eventfs_srcu); mutex_lock(&eventfs_mutex); ei = READ_ONCE(ti->private); + if (ei && ei->is_freed) + ei = NULL; mutex_unlock(&eventfs_mutex); - if (!ei) { - srcu_read_unlock(&eventfs_srcu, idx); - return -EINVAL; - } - + if (!ei) + goto out; - data = ei->data; + /* + * Need to create the dentries and inodes to have a consistent + * inode number. 
+ */ + ret = 0; - dlist = kmalloc(sizeof(*dlist), GFP_KERNEL); - if (!dlist) { - srcu_read_unlock(&eventfs_srcu, idx); - return -ENOMEM; - } + /* Start at 'c' to jump over already read entries */ + for (i = c; i < ei->nr_entries; i++, ctx->pos++) { + void *cdata = ei->data; - inode_lock(parent->d_inode); - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { - d = create_dir_dentry(ei, ei_child, parent, false); - if (d) { - ret = add_dentries(&dentries, d, cnt); - if (ret < 0) - break; - cnt++; - } - } - - for (i = 0; i < ei->nr_entries; i++) { - void *cdata = data; entry = &ei->entries[i]; name = entry->name; + mutex_lock(&eventfs_mutex); - /* If ei->is_freed, then the event itself may be too */ - if (!ei->is_freed) - r = entry->callback(name, &mode, &cdata, &fops); - else - r = -1; + /* If ei->is_freed then just bail here, nothing more to do */ + if (ei->is_freed) { + mutex_unlock(&eventfs_mutex); + goto out; + } + r = entry->callback(name, &mode, &cdata, &fops); mutex_unlock(&eventfs_mutex); if (r <= 0) continue; - d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false); - if (d) { - ret = add_dentries(&dentries, d, cnt); - if (ret < 0) - break; - cnt++; + + ino = EVENTFS_FILE_INODE_INO; + + if (!dir_emit(ctx, name, strlen(name), ino, DT_REG)) + goto out; + } + + /* Subtract the skipped entries above */ + c -= min((unsigned int)c, (unsigned int)ei->nr_entries); + + list_for_each_entry_srcu(ei_child, &ei->children, list, + srcu_read_lock_held(&eventfs_srcu)) { + + if (c > 0) { + c--; + continue; } + + ctx->pos++; + + if (ei_child->is_freed) + continue; + + name = ei_child->name; + + ino = eventfs_dir_ino(ei_child); + + if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR)) + goto out_dec; } - inode_unlock(parent->d_inode); + ret = 1; + out: srcu_read_unlock(&eventfs_srcu, idx); - ret = dcache_dir_open(inode, file); - /* - * dcache_dir_open() sets file->private_data to a dentry cursor. 
- * Need to save that but also save all the dentries that were - * opened by this function. - */ - dlist->cursor = file->private_data; - dlist->dentries = dentries; - file->private_data = dlist; return ret; -} - -/* - * This just sets the file->private_data back to the cursor and back. - */ -static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx) -{ - struct dentry_list *dlist = file->private_data; - int ret; - file->private_data = dlist->cursor; - ret = dcache_readdir(file, ctx); - dlist->cursor = file->private_data; - file->private_data = dlist; - return ret; + out_dec: + /* Incremented ctx->pos without adding something, reset it */ + ctx->pos--; + goto out; } /** @@ -872,25 +669,10 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode if (!parent) return ERR_PTR(-EINVAL); - ei = kzalloc(sizeof(*ei), GFP_KERNEL); + ei = alloc_ei(name); if (!ei) return ERR_PTR(-ENOMEM); - ei->name = kstrdup_const(name, GFP_KERNEL); - if (!ei->name) { - kfree(ei); - return ERR_PTR(-ENOMEM); - } - - if (size) { - ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL); - if (!ei->d_children) { - kfree_const(ei->name); - kfree(ei); - return ERR_PTR(-ENOMEM); - } - } - ei->entries = entries; ei->nr_entries = size; ei->data = data; @@ -898,10 +680,8 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode INIT_LIST_HEAD(&ei->list); mutex_lock(&eventfs_mutex); - if (!parent->is_freed) { + if (!parent->is_freed) list_add_tail(&ei->list, &parent->children); - ei->d_parent = parent->dentry; - } mutex_unlock(&eventfs_mutex); /* Was the parent freed? 
*/ @@ -941,33 +721,33 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry if (IS_ERR(dentry)) return ERR_CAST(dentry); - ei = kzalloc(sizeof(*ei), GFP_KERNEL); + ei = alloc_ei(name); if (!ei) - goto fail_ei; + goto fail; inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) goto fail; - if (size) { - ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL); - if (!ei->d_children) - goto fail; - } - - ei->dentry = dentry; + // Note: we have a ref to the dentry from tracefs_start_creating() + ei->events_dir = dentry; ei->entries = entries; ei->nr_entries = size; ei->is_events = 1; ei->data = data; - ei->name = kstrdup_const(name, GFP_KERNEL); - if (!ei->name) - goto fail; /* Save the ownership of this directory */ uid = d_inode(dentry->d_parent)->i_uid; gid = d_inode(dentry->d_parent)->i_gid; + /* + * If the events directory is of the top instance, then parent + * is NULL. Set the attr.mode to reflect this and its permissions will + * default to the tracefs root dentry. + */ + if (!parent) + ei->attr.mode = EVENTFS_TOPLEVEL; + /* This is used as the default ownership of the files and directories */ ei->attr.uid = uid; ei->attr.gid = gid; @@ -985,11 +765,19 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry inode->i_op = &eventfs_root_dir_inode_operations; inode->i_fop = &eventfs_file_operations; - dentry->d_fsdata = ei; + dentry->d_fsdata = get_ei(ei); - /* directory inodes start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); + /* + * Keep all eventfs directories with i_nlink == 1. + * Due to the dynamic nature of the dentry creations and not + * wanting to add a pointer to the parent eventfs_inode in the + * eventfs_inode structure, keeping the i_nlink in sync with the + * number of directories would cause too much complexity for + * something not worth much. Keeping directory links at 1 + * tells userspace not to trust the link number. 
+ */ d_instantiate(dentry, inode); + /* The dentry of the "events" parent does keep track though */ inc_nlink(dentry->d_parent->d_inode); fsnotify_mkdir(dentry->d_parent->d_inode, dentry); tracefs_end_creating(dentry); @@ -997,72 +785,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry return ei; fail: - kfree(ei->d_children); - kfree(ei); - fail_ei: + free_ei(ei); tracefs_failed_creating(dentry); return ERR_PTR(-ENOMEM); } -static LLIST_HEAD(free_list); - -static void eventfs_workfn(struct work_struct *work) -{ - struct eventfs_inode *ei, *tmp; - struct llist_node *llnode; - - llnode = llist_del_all(&free_list); - llist_for_each_entry_safe(ei, tmp, llnode, llist) { - /* This dput() matches the dget() from unhook_dentry() */ - for (int i = 0; i < ei->nr_entries; i++) { - if (ei->d_children[i]) - dput(ei->d_children[i]); - } - /* This should only get here if it had a dentry */ - if (!WARN_ON_ONCE(!ei->dentry)) - dput(ei->dentry); - } -} - -static DECLARE_WORK(eventfs_work, eventfs_workfn); - -static void free_rcu_ei(struct rcu_head *head) -{ - struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu); - - if (ei->dentry) { - /* Do not free the ei until all references of dentry are gone */ - if (llist_add(&ei->llist, &free_list)) - queue_work(system_unbound_wq, &eventfs_work); - return; - } - - /* If the ei doesn't have a dentry, neither should its children */ - for (int i = 0; i < ei->nr_entries; i++) { - WARN_ON_ONCE(ei->d_children[i]); - } - - free_ei(ei); -} - -static void unhook_dentry(struct dentry *dentry) -{ - if (!dentry) - return; - /* - * Need to add a reference to the dentry that is expected by - * simple_recursive_removal(), which will include a dput(). - */ - dget(dentry); - - /* - * Also add a reference for the dput() in eventfs_workfn(). - * That is required as that dput() will free the ei after - * the SRCU grace period is over. 
- */ - dget(dentry); -} - /** * eventfs_remove_rec - remove eventfs dir or file from list * @ei: eventfs_inode to be removed. @@ -1075,8 +802,6 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level) { struct eventfs_inode *ei_child; - if (!ei) - return; /* * Check recursion depth. It should never be greater than 3: * 0 - events/ @@ -1088,28 +813,11 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level) return; /* search for nested folders or files */ - list_for_each_entry_srcu(ei_child, &ei->children, list, - lockdep_is_held(&eventfs_mutex)) { - /* Children only have dentry if parent does */ - WARN_ON_ONCE(ei_child->dentry && !ei->dentry); + list_for_each_entry(ei_child, &ei->children, list) eventfs_remove_rec(ei_child, level + 1); - } - - - ei->is_freed = 1; - for (int i = 0; i < ei->nr_entries; i++) { - if (ei->d_children[i]) { - /* Children only have dentry if parent does */ - WARN_ON_ONCE(!ei->dentry); - unhook_dentry(ei->d_children[i]); - } - } - - unhook_dentry(ei->dentry); - - list_del_rcu(&ei->list); - call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei); + list_del(&ei->list); + free_ei(ei); } /** @@ -1120,22 +828,12 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level) */ void eventfs_remove_dir(struct eventfs_inode *ei) { - struct dentry *dentry; - if (!ei) return; mutex_lock(&eventfs_mutex); - dentry = ei->dentry; eventfs_remove_rec(ei, 0); mutex_unlock(&eventfs_mutex); - - /* - * If any of the ei children has a dentry, then the ei itself - * must have a dentry. 
- */ - if (dentry) - simple_recursive_removal(dentry, NULL); } /** @@ -1148,7 +846,11 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei) { struct dentry *dentry; - dentry = ei->dentry; + dentry = ei->events_dir; + if (!dentry) + return; + + ei->events_dir = NULL; eventfs_remove_dir(ei); /* @@ -1158,5 +860,6 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei) * sticks around while the other ei->dentry are created * and destroyed dynamically. */ + d_invalidate(dentry); dput(dentry); } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index ad20e6af938d..d65ffad4c327 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -38,8 +38,6 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb) if (!ti) return NULL; - ti->flags = 0; - return &ti->vfs_inode; } @@ -91,6 +89,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, struct inode *inode, struct dentry *dentry, umode_t mode) { + struct tracefs_inode *ti; char *name; int ret; @@ -99,6 +98,15 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, return -ENOMEM; /* + * This is a new directory that does not take the default of + * the rootfs. It becomes the default permissions for all the + * files and directories underneath it. + */ + ti = get_tracefs(inode); + ti->flags |= TRACEFS_INSTANCE_INODE; + ti->private = inode; + + /* * The mkdir call can call the generic functions that create * the files within the tracefs system. It is up to the individual * mkdir routine to handle races. @@ -141,10 +149,76 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) return ret; } -static const struct inode_operations tracefs_dir_inode_operations = { +static void set_tracefs_inode_owner(struct inode *inode) +{ + struct tracefs_inode *ti = get_tracefs(inode); + struct inode *root_inode = ti->private; + + /* + * If this inode has never been referenced, then update + * the permissions to the superblock. 
+ */ + if (!(ti->flags & TRACEFS_UID_PERM_SET)) + inode->i_uid = root_inode->i_uid; + + if (!(ti->flags & TRACEFS_GID_PERM_SET)) + inode->i_gid = root_inode->i_gid; +} + +static int tracefs_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + set_tracefs_inode_owner(inode); + return generic_permission(idmap, inode, mask); +} + +static int tracefs_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct inode *inode = d_backing_inode(path->dentry); + + set_tracefs_inode_owner(inode); + generic_fillattr(idmap, request_mask, inode, stat); + return 0; +} + +static int tracefs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + struct inode *inode = d_inode(dentry); + struct tracefs_inode *ti = get_tracefs(inode); + + if (ia_valid & ATTR_UID) + ti->flags |= TRACEFS_UID_PERM_SET; + + if (ia_valid & ATTR_GID) + ti->flags |= TRACEFS_GID_PERM_SET; + + return simple_setattr(idmap, dentry, attr); +} + +static const struct inode_operations tracefs_instance_dir_inode_operations = { .lookup = simple_lookup, .mkdir = tracefs_syscall_mkdir, .rmdir = tracefs_syscall_rmdir, + .permission = tracefs_permission, + .getattr = tracefs_getattr, + .setattr = tracefs_setattr, +}; + +static const struct inode_operations tracefs_dir_inode_operations = { + .lookup = simple_lookup, + .permission = tracefs_permission, + .getattr = tracefs_getattr, + .setattr = tracefs_setattr, +}; + +static const struct inode_operations tracefs_file_inode_operations = { + .permission = tracefs_permission, + .getattr = tracefs_getattr, + .setattr = tracefs_setattr, }; struct inode *tracefs_get_inode(struct super_block *sb) @@ -183,82 +257,6 @@ struct tracefs_fs_info { struct tracefs_mount_opts mount_opts; }; -static void change_gid(struct dentry *dentry, kgid_t gid) -{ - if (!dentry->d_inode) - return; - dentry->d_inode->i_gid = gid; -} - -/* - * 
Taken from d_walk, but without he need for handling renames. - * Nothing can be renamed while walking the list, as tracefs - * does not support renames. This is only called when mounting - * or remounting the file system, to set all the files to - * the given gid. - */ -static void set_gid(struct dentry *parent, kgid_t gid) -{ - struct dentry *this_parent, *dentry; - - this_parent = parent; - spin_lock(&this_parent->d_lock); - - change_gid(this_parent, gid); -repeat: - dentry = d_first_child(this_parent); -resume: - hlist_for_each_entry_from(dentry, d_sib) { - struct tracefs_inode *ti; - - /* Note, getdents() can add a cursor dentry with no inode */ - if (!dentry->d_inode) - continue; - - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - - change_gid(dentry, gid); - - /* If this is the events directory, update that too */ - ti = get_tracefs(dentry->d_inode); - if (ti && (ti->flags & TRACEFS_EVENT_INODE)) - eventfs_update_gid(dentry, gid); - - if (!hlist_empty(&dentry->d_children)) { - spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, _RET_IP_); - this_parent = dentry; - spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); - goto repeat; - } - spin_unlock(&dentry->d_lock); - } - /* - * All done at this level ... ascend and resume the search. 
- */ - rcu_read_lock(); -ascend: - if (this_parent != parent) { - dentry = this_parent; - this_parent = dentry->d_parent; - - spin_unlock(&dentry->d_lock); - spin_lock(&this_parent->d_lock); - - /* go into the first sibling still alive */ - hlist_for_each_entry_continue(dentry, d_sib) { - if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { - rcu_read_unlock(); - goto resume; - } - } - goto ascend; - } - rcu_read_unlock(); - spin_unlock(&this_parent->d_lock); - return; -} - static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) { substring_t args[MAX_OPT_ARGS]; @@ -331,10 +329,8 @@ static int tracefs_apply_options(struct super_block *sb, bool remount) if (!remount || opts->opts & BIT(Opt_uid)) inode->i_uid = opts->uid; - if (!remount || opts->opts & BIT(Opt_gid)) { - /* Set all the group ids to the mount option */ - set_gid(sb->s_root, opts->gid); - } + if (!remount || opts->opts & BIT(Opt_gid)) + inode->i_gid = opts->gid; return 0; } @@ -381,21 +377,30 @@ static const struct super_operations tracefs_super_operations = { .show_options = tracefs_show_options, }; -static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode) +/* + * It would be cleaner if eventfs had its own dentry ops. + * + * Note that d_revalidate is called potentially under RCU, + * so it can't take the eventfs mutex etc. It's fine - if + * we open a file just as it's marked dead, things will + * still work just fine, and just see the old stale case. 
+ */ +static void tracefs_d_release(struct dentry *dentry) { - struct tracefs_inode *ti; + if (dentry->d_fsdata) + eventfs_d_release(dentry); +} - if (!dentry || !inode) - return; +static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct eventfs_inode *ei = dentry->d_fsdata; - ti = get_tracefs(inode); - if (ti && ti->flags & TRACEFS_EVENT_INODE) - eventfs_set_ei_status_free(ti, dentry); - iput(inode); + return !(ei && ei->is_freed); } static const struct dentry_operations tracefs_dentry_operations = { - .d_iput = tracefs_dentry_iput, + .d_revalidate = tracefs_d_revalidate, + .d_release = tracefs_d_release, }; static int trace_fill_super(struct super_block *sb, void *data, int silent) @@ -499,73 +504,24 @@ struct dentry *tracefs_end_creating(struct dentry *dentry) return dentry; } -/** - * eventfs_start_creating - start the process of creating a dentry - * @name: Name of the file created for the dentry - * @parent: The parent dentry where this dentry will be created - * - * This is a simple helper function for the dynamically created eventfs - * files. When the directory of the eventfs files are accessed, their - * dentries are created on the fly. This function is used to start that - * process. - */ -struct dentry *eventfs_start_creating(const char *name, struct dentry *parent) +/* Find the inode that this will use for default */ +static struct inode *instance_inode(struct dentry *parent, struct inode *inode) { - struct dentry *dentry; - int error; - - /* Must always have a parent. 
*/ - if (WARN_ON_ONCE(!parent)) - return ERR_PTR(-EINVAL); - - error = simple_pin_fs(&trace_fs_type, &tracefs_mount, - &tracefs_mount_count); - if (error) - return ERR_PTR(error); + struct tracefs_inode *ti; - if (unlikely(IS_DEADDIR(parent->d_inode))) - dentry = ERR_PTR(-ENOENT); - else - dentry = lookup_one_len(name, parent, strlen(name)); + /* If parent is NULL then use root inode */ + if (!parent) + return d_inode(inode->i_sb->s_root); - if (!IS_ERR(dentry) && dentry->d_inode) { - dput(dentry); - dentry = ERR_PTR(-EEXIST); + /* Find the inode that is flagged as an instance or the root inode */ + while (!IS_ROOT(parent)) { + ti = get_tracefs(d_inode(parent)); + if (ti->flags & TRACEFS_INSTANCE_INODE) + break; + parent = parent->d_parent; } - if (IS_ERR(dentry)) - simple_release_fs(&tracefs_mount, &tracefs_mount_count); - - return dentry; -} - -/** - * eventfs_failed_creating - clean up a failed eventfs dentry creation - * @dentry: The dentry to clean up - * - * If after calling eventfs_start_creating(), a failure is detected, the - * resources created by eventfs_start_creating() needs to be cleaned up. In - * that case, this function should be called to perform that clean up. - */ -struct dentry *eventfs_failed_creating(struct dentry *dentry) -{ - dput(dentry); - simple_release_fs(&tracefs_mount, &tracefs_mount_count); - return NULL; -} - -/** - * eventfs_end_creating - Finish the process of creating a eventfs dentry - * @dentry: The dentry that has successfully been created. - * - * This function is currently just a place holder to match - * eventfs_start_creating(). In case any synchronization needs to be added, - * this function will be used to implement that without having to modify - * the callers of eventfs_start_creating(). 
- */ -struct dentry *eventfs_end_creating(struct dentry *dentry) -{ - return dentry; + return d_inode(parent); } /** @@ -598,6 +554,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { + struct tracefs_inode *ti; struct dentry *dentry; struct inode *inode; @@ -616,7 +573,11 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, if (unlikely(!inode)) return tracefs_failed_creating(dentry); + ti = get_tracefs(inode); + ti->private = instance_inode(parent, inode); + inode->i_mode = mode; + inode->i_op = &tracefs_file_inode_operations; inode->i_fop = fops ? fops : &tracefs_file_operations; inode->i_private = data; inode->i_uid = d_inode(dentry->d_parent)->i_uid; @@ -629,6 +590,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, static struct dentry *__create_dir(const char *name, struct dentry *parent, const struct inode_operations *ops) { + struct tracefs_inode *ti; struct dentry *dentry = tracefs_start_creating(name, parent); struct inode *inode; @@ -646,6 +608,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, inode->i_uid = d_inode(dentry->d_parent)->i_uid; inode->i_gid = d_inode(dentry->d_parent)->i_gid; + ti = get_tracefs(inode); + ti->private = instance_inode(parent, inode); + /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ inc_nlink(inode); d_instantiate(dentry, inode); @@ -676,7 +641,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) if (security_locked_down(LOCKDOWN_TRACEFS)) return NULL; - return __create_dir(name, parent, &simple_dir_inode_operations); + return __create_dir(name, parent, &tracefs_dir_inode_operations); } /** @@ -707,7 +672,7 @@ __init struct dentry *tracefs_create_instance_dir(const char *name, if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir)) return NULL; - dentry = __create_dir(name, parent, &tracefs_dir_inode_operations); + dentry = __create_dir(name, parent, &tracefs_instance_dir_inode_operations); if (!dentry) return NULL; @@ -752,7 +717,11 @@ static void init_once(void *foo) { struct tracefs_inode *ti = (struct tracefs_inode *) foo; + /* inode_init_once() calls memset() on the vfs_inode portion */ inode_init_once(&ti->vfs_inode); + + /* Zero out the rest */ + memset_after(ti, 0, vfs_inode); } static int __init tracefs_init(void) diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 42bdeb471a07..beb3dcd0e434 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -5,12 +5,16 @@ enum { TRACEFS_EVENT_INODE = BIT(1), TRACEFS_EVENT_TOP_INODE = BIT(2), + TRACEFS_GID_PERM_SET = BIT(3), + TRACEFS_UID_PERM_SET = BIT(4), + TRACEFS_INSTANCE_INODE = BIT(5), }; struct tracefs_inode { + struct inode vfs_inode; + /* The below gets initialized with memset_after(ti, 0, vfs_inode) */ unsigned long flags; void *private; - struct inode vfs_inode; }; /* @@ -28,42 +32,37 @@ struct eventfs_attr { /* * struct eventfs_inode - hold the properties of the eventfs directories. 
* @list: link list into the parent directory + * @rcu: Union with @list for freeing + * @children: link list into the child eventfs_inode * @entries: the array of entries representing the files in the directory * @name: the name of the directory to create - * @children: link list into the child eventfs_inode - * @dentry: the dentry of the directory - * @d_parent: pointer to the parent's dentry - * @d_children: The array of dentries to represent the files when created + * @events_dir: the dentry of the events directory * @entry_attrs: Saved mode and ownership of the @d_children - * @attr: Saved mode and ownership of eventfs_inode itself * @data: The private data to pass to the callbacks + * @attr: Saved mode and ownership of eventfs_inode itself * @is_freed: Flag set if the eventfs is on its way to be freed * Note if is_freed is set, then dentry is corrupted. + * @is_events: Flag set for only the top level "events" directory * @nr_entries: The number of items in @entries + * @ino: The saved inode number */ struct eventfs_inode { - struct list_head list; + union { + struct list_head list; + struct rcu_head rcu; + }; + struct list_head children; const struct eventfs_entry *entries; const char *name; - struct list_head children; - struct dentry *dentry; /* Check is_freed to access */ - struct dentry *d_parent; - struct dentry **d_children; + struct dentry *events_dir; struct eventfs_attr *entry_attrs; - struct eventfs_attr attr; void *data; - /* - * Union - used for deletion - * @llist: for calling dput() if needed after RCU - * @rcu: eventfs_inode to delete in RCU - */ - union { - struct llist_node llist; - struct rcu_head rcu; - }; + struct eventfs_attr attr; + struct kref kref; unsigned int is_freed:1; unsigned int is_events:1; unsigned int nr_entries:30; + unsigned int ino; }; static inline struct tracefs_inode *get_tracefs(const struct inode *inode) @@ -75,10 +74,7 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent); struct dentry 
*tracefs_end_creating(struct dentry *dentry); struct dentry *tracefs_failed_creating(struct dentry *dentry); struct inode *tracefs_get_inode(struct super_block *sb); -struct dentry *eventfs_start_creating(const char *name, struct dentry *parent); -struct dentry *eventfs_failed_creating(struct dentry *dentry); -struct dentry *eventfs_end_creating(struct dentry *dentry); -void eventfs_update_gid(struct dentry *dentry, kgid_t gid); -void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry); + +void eventfs_d_release(struct dentry *dentry); #endif /* _TRACEFS_INTERNAL_H */ diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index 0d561ecb6869..a4a0158f712d 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -18,7 +18,7 @@ #include "ubifs.h" /** - * ubifs_node_calc_hash - calculate the hash of a UBIFS node + * __ubifs_node_calc_hash - calculate the hash of a UBIFS node * @c: UBIFS file-system description object * @node: the node to calculate a hash for * @hash: the returned hash @@ -507,28 +507,13 @@ out: */ int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac) { - SHASH_DESC_ON_STACK(shash, c->hmac_tfm); - int err; const char well_known_message[] = "UBIFS"; if (!ubifs_authenticated(c)) return 0; - shash->tfm = c->hmac_tfm; - - err = crypto_shash_init(shash); - if (err) - return err; - - err = crypto_shash_update(shash, well_known_message, - sizeof(well_known_message) - 1); - if (err < 0) - return err; - - err = crypto_shash_final(shash, hmac); - if (err) - return err; - return 0; + return crypto_shash_tfm_digest(c->hmac_tfm, well_known_message, + sizeof(well_known_message) - 1, hmac); } /* diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index c4fc1047fc07..5b3a840098b0 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -70,18 +70,29 @@ static int nothing_to_commit(struct ubifs_info *c) return 0; /* + * Increasing @c->dirty_pn_cnt/@c->dirty_nn_cnt and marking + * nnodes/pnodes as dirty in run_gc() could race with following + * checking, which 
leads inconsistent states between @c->nroot + * and @c->dirty_pn_cnt/@c->dirty_nn_cnt, holding @c->lp_mutex + * to avoid that. + */ + mutex_lock(&c->lp_mutex); + /* * Even though the TNC is clean, the LPT tree may have dirty nodes. For * example, this may happen if the budgeting subsystem invoked GC to * make some free space, and the GC found an LEB with only dirty and * free space. In this case GC would just change the lprops of this * LEB (by turning all space into free space) and unmap it. */ - if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) + if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) { + mutex_unlock(&c->lp_mutex); return 0; + } ubifs_assert(c, atomic_long_read(&c->dirty_zn_cnt) == 0); ubifs_assert(c, c->dirty_pn_cnt == 0); ubifs_assert(c, c->dirty_nn_cnt == 0); + mutex_unlock(&c->lp_mutex); return 1; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 3b13c648d490..551148de66cd 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -205,7 +205,6 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); err = fscrypt_prepare_lookup(dir, dentry, &nm); - generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return d_splice_alias(NULL, dentry); if (err) @@ -1234,6 +1233,8 @@ out_cancel: dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); out_inode: + /* Free inode->i_link before inode is marked as bad. */ + fscrypt_free_inode(inode); make_bad_inode(inode); iput(inode); out_fname: diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 2d2b39f843ce..5029eb3390a5 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -318,8 +318,9 @@ static int write_begin_slow(struct address_space *mapping, * This is a helper function for 'ubifs_write_begin()' which allocates budget * for the operation. The budget is allocated differently depending on whether * this is appending, whether the page is dirty or not, and so on. 
This - * function leaves the @ui->ui_mutex locked in case of appending. Returns zero - * in case of success and %-ENOSPC in case of failure. + * function leaves the @ui->ui_mutex locked in case of appending. + * + * Returns: %0 in case of success and %-ENOSPC in case of failure. */ static int allocate_budget(struct ubifs_info *c, struct page *page, struct ubifs_inode *ui, int appending) @@ -600,7 +601,7 @@ out: * @bu: bulk-read information * @n: next zbranch slot * - * This function returns %0 on success and a negative error code on failure. + * Returns: %0 on success and a negative error code on failure. */ static int populate_page(struct ubifs_info *c, struct page *page, struct bu_info *bu, int *n) @@ -711,7 +712,7 @@ out_err: * @bu: bulk-read information * @page1: first page to read * - * This function returns %1 if the bulk-read is done, otherwise %0 is returned. + * Returns: %1 if the bulk-read is done, otherwise %0 is returned. */ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, struct page *page1) @@ -821,7 +822,9 @@ out_bu_off: * Some flash media are capable of reading sequentially at faster rates. UBIFS * bulk-read facility is designed to take advantage of that, by reading in one * go consecutive data nodes that are also located consecutively in the same - * LEB. This function returns %1 if a bulk-read is done and %0 otherwise. + * LEB. + * + * Returns: %1 if a bulk-read is done and %0 otherwise. */ static int ubifs_bulk_read(struct page *page) { @@ -1109,7 +1112,9 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr) * @attr: inode attribute changes description * * This function implements VFS '->setattr()' call when the inode is truncated - * to a smaller size. Returns zero in case of success and a negative error code + * to a smaller size. + * + * Returns: %0 in case of success and a negative error code * in case of failure. 
*/ static int do_truncation(struct ubifs_info *c, struct inode *inode, @@ -1215,7 +1220,9 @@ out_budg: * @attr: inode attribute changes description * * This function implements VFS '->setattr()' call for all cases except - * truncations to smaller size. Returns zero in case of success and a negative + * truncations to smaller size. + * + * Returns: %0 in case of success and a negative * error code in case of failure. */ static int do_setattr(struct ubifs_info *c, struct inode *inode, @@ -1360,6 +1367,8 @@ out: * This helper function checks if the inode mtime/ctime should be updated or * not. If current values of the time-stamps are within the UBIFS inode time * granularity, they are not updated. This is an optimization. + * + * Returns: %1 if time update is needed, %0 if not */ static inline int mctime_update_needed(const struct inode *inode, const struct timespec64 *now) @@ -1375,11 +1384,12 @@ static inline int mctime_update_needed(const struct inode *inode, /** * ubifs_update_time - update time of inode. * @inode: inode to update - * @time: timespec structure to hold the current time value * @flags: time updating control flag determines updating * which time fields of @inode * * This function updates time of the inode. + * + * Returns: %0 for success or a negative error code otherwise. */ int ubifs_update_time(struct inode *inode, int flags) { @@ -1413,7 +1423,9 @@ int ubifs_update_time(struct inode *inode, int flags) * @inode: inode to update * * This function updates mtime and ctime of the inode if it is not equivalent to - * current time. Returns zero in case of success and a negative error code in + * current time. + * + * Returns: %0 in case of success and a negative error code in * case of failure. 
*/ static int update_mctime(struct inode *inode) diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index c59d47fe7939..17da28d6247a 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -365,6 +365,7 @@ static void destroy_replay_list(struct ubifs_info *c) * @lnum: node logical eraseblock number * @offs: node offset * @len: node length + * @hash: node hash * @key: node key * @sqnum: sequence number * @deletion: non-zero if this is a deletion @@ -417,6 +418,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, * @lnum: node logical eraseblock number * @offs: node offset * @len: node length + * @hash: node hash * @key: node key * @name: directory entry name * @nlen: directory entry name length diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 09e270d6ed02..d2881041b393 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2239,13 +2239,14 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) goto out_umount; } + generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { err = -ENOMEM; goto out_umount; } - import_uuid(&sb->s_uuid, c->uuid); + super_set_uuid(sb, c->uuid, sizeof(c->uuid)); mutex_unlock(&c->umount_mutex); return 0; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4fcefe5ef7cb..959551ff9a95 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1032,7 +1032,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new, { int fd; - fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new, + fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new, O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); if (fd < 0) return fd; @@ -2260,7 +2260,8 @@ static int new_userfaultfd(int flags) /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx, + /* Create a new inode so that the LSM can block the creation. 
*/ + fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx, O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); if (fd < 0) { mmdrop(ctx->mm); diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 9976a00a73f9..e965a48e7db9 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -421,10 +421,10 @@ xfs_attr_complete_op( bool do_replace = args->op_flags & XFS_DA_OP_REPLACE; args->op_flags &= ~XFS_DA_OP_REPLACE; - if (do_replace) { - args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + if (do_replace) return replace_state; - } + return XFS_DAS_DONE; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 98aaca933bdd..f362345467fa 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3277,7 +3277,7 @@ xfs_bmap_alloc_account( struct xfs_bmalloca *ap) { bool isrt = XFS_IS_REALTIME_INODE(ap->ip) && - (ap->flags & XFS_BMAPI_ATTRFORK); + !(ap->flags & XFS_BMAPI_ATTRFORK); uint fld; if (ap->flags & XFS_BMAPI_COWFORK) { diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 31100120b2c5..e31663cb7b43 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1119,20 +1119,6 @@ xfs_rtbitmap_blockcount( } /* - * Compute the maximum level number of the realtime summary file, as defined by - * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct - * use of rt volumes with more than 2^32 extents. - */ -uint8_t -xfs_compute_rextslog( - xfs_rtbxlen_t rtextents) -{ - if (!rtextents) - return 0; - return xfs_highbit64(rtextents); -} - -/* * Compute the number of rtbitmap words needed to populate every block of a * bitmap that is large enough to track the given number of rt extents. 
*/ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 274dc7dae1fa..152a66750af5 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -351,20 +351,6 @@ xfs_rtfree_extent( int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, xfs_filblks_t rtlen); -uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); - -/* Do we support an rt volume having this number of rtextents? */ -static inline bool -xfs_validate_rtextents( - xfs_rtbxlen_t rtextents) -{ - /* No runt rt volumes */ - if (rtextents == 0) - return false; - - return true; -} - xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, @@ -383,8 +369,6 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, # define xfs_rtsummary_read_buf(a,b) (-ENOSYS) # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) -# define xfs_compute_rextslog(rtx) (0) -# define xfs_validate_rtextents(rtx) (false) static inline xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 4a9e8588f4c9..5bb6e2bd6dee 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1377,3 +1377,17 @@ xfs_validate_stripe_geometry( } return true; } + +/* + * Compute the maximum level number of the realtime summary file, as defined by + * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct + * use of rt volumes with more than 2^32 extents. 
+ */ +uint8_t +xfs_compute_rextslog( + xfs_rtbxlen_t rtextents) +{ + if (!rtextents) + return 0; + return xfs_highbit64(rtextents); +} diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 19134b23c10b..2e8e8d63d4eb 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -38,4 +38,6 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp, extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp, __s64 sunit, __s64 swidth, int sectorsize, bool silent); +uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); + #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 20b5375f2d9c..62e02d5380ad 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -251,4 +251,16 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off); bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off, xfs_fileoff_t len); +/* Do we support an rt volume having this number of rtextents? */ +static inline bool +xfs_validate_rtextents( + xfs_rtbxlen_t rtextents) +{ + /* No runt rt volumes */ + if (rtextents == 0) + return false; + + return true; +} + #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 441ca9977652..46583517377f 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -15,6 +15,7 @@ #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bit.h" +#include "xfs_sb.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index fabd0ed9dfa6..b1ff4f33324a 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -16,6 +16,7 @@ #include "xfs_rtbitmap.h" #include "xfs_bit.h" #include "xfs_bmap.h" +#include "xfs_sb.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 813f85156b0c..1698507d1ac7 100644 --- a/fs/xfs/xfs_aops.c +++ 
b/fs/xfs/xfs_aops.c @@ -112,7 +112,7 @@ xfs_end_ioend( * longer dirty. If we don't remove delalloc blocks here, they become * stale and can corrupt free space accounting on unmount. */ - error = blk_status_to_errno(ioend->io_bio->bi_status); + error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); @@ -179,7 +179,7 @@ STATIC void xfs_end_bio( struct bio *bio) { - struct iomap_ioend *ioend = bio->bi_private; + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); struct xfs_inode *ip = XFS_I(ioend->io_inode); unsigned long flags; @@ -276,7 +276,8 @@ static int xfs_map_blocks( struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset) + loff_t offset, + unsigned int len) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -444,7 +445,7 @@ xfs_prepare_ioend( /* send ioends that might require a transaction to the completion wq */ if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || (ioend->io_flags & IOMAP_F_SHARED)) - ioend->io_bio->bi_end_io = xfs_end_bio; + ioend->io_bio.bi_end_io = xfs_end_bio; return status; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8e5bd50d29fe..01b41fabbe3c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1951,7 +1951,7 @@ xfs_free_buftarg( fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) - bdev_release(btp->bt_bdev_handle); + fput(btp->bt_bdev_file); kmem_free(btp); } @@ -1994,7 +1994,7 @@ xfs_setsize_buftarg_early( struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, - struct bdev_handle *bdev_handle) + struct file *bdev_file) { xfs_buftarg_t *btp; const struct dax_holder_operations *ops = NULL; @@ -2005,9 +2005,9 @@ xfs_alloc_buftarg( btp = kmem_zalloc(sizeof(*btp), KM_NOFS); btp->bt_mount = mp; - btp->bt_bdev_handle = bdev_handle; - 
btp->bt_dev = bdev_handle->bdev->bd_dev; - btp->bt_bdev = bdev_handle->bdev; + btp->bt_bdev_file = bdev_file; + btp->bt_bdev = file_bdev(bdev_file); + btp->bt_dev = btp->bt_bdev->bd_dev; btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b470de08a46c..304e858d04fb 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -98,7 +98,7 @@ typedef unsigned int xfs_buf_flags_t; */ typedef struct xfs_buftarg { dev_t bt_dev; - struct bdev_handle *bt_bdev_handle; + struct file *bt_bdev_file; struct block_device *bt_bdev; struct dax_device *bt_daxdev; u64 bt_dax_part_off; @@ -366,7 +366,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) * Handling of buftargs. */ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, - struct bdev_handle *bdev_handle); + struct file *bdev_file); extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index aabb25dc3efa..57fa21ad7912 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -62,7 +62,7 @@ xfs_uuid_mount( int hole, i; /* Publish UUID in struct super_block */ - uuid_copy(&mp->m_super->s_uuid, uuid); + super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid)); if (xfs_has_nouuid(mp)) return 0; @@ -706,6 +706,8 @@ xfs_mountfs( /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; + super_set_sysfs_name_id(mp->m_super); + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_super->s_id); if (error) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index aff20ddd4a9f..00fbd5b6e582 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -350,7 +350,6 @@ xfs_setup_dax_always( return -EINVAL; } - xfs_warn(mp, "DAX enabled. 
Warning: EXPERIMENTAL, use at your own risk"); return 0; disable_dax: @@ -362,16 +361,16 @@ STATIC int xfs_blkdev_get( xfs_mount_t *mp, const char *name, - struct bdev_handle **handlep) + struct file **bdev_filep) { int error = 0; - *handlep = bdev_open_by_path(name, + *bdev_filep = bdev_file_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, mp->m_super, &fs_holder_ops); - if (IS_ERR(*handlep)) { - error = PTR_ERR(*handlep); - *handlep = NULL; + if (IS_ERR(*bdev_filep)) { + error = PTR_ERR(*bdev_filep); + *bdev_filep = NULL; xfs_warn(mp, "Invalid device [%s], error=%d", name, error); } @@ -436,26 +435,26 @@ xfs_open_devices( { struct super_block *sb = mp->m_super; struct block_device *ddev = sb->s_bdev; - struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL; + struct file *logdev_file = NULL, *rtdev_file = NULL; int error; /* * Open real time and log devices - order is important. */ if (mp->m_logname) { - error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle); + error = xfs_blkdev_get(mp, mp->m_logname, &logdev_file); if (error) return error; } if (mp->m_rtname) { - error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle); + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_file); if (error) goto out_close_logdev; - if (rtdev_handle->bdev == ddev || - (logdev_handle && - rtdev_handle->bdev == logdev_handle->bdev)) { + if (file_bdev(rtdev_file) == ddev || + (logdev_file && + file_bdev(rtdev_file) == file_bdev(logdev_file))) { xfs_warn(mp, "Cannot mount filesystem with identical rtdev and ddev/logdev."); error = -EINVAL; @@ -467,25 +466,25 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file); if (!mp->m_ddev_targp) goto out_close_rtdev; - if (rtdev_handle) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle); + if (rtdev_file) { + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, 
rtdev_file); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } - if (logdev_handle && logdev_handle->bdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle); + if (logdev_file && file_bdev(logdev_file) != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { mp->m_logdev_targp = mp->m_ddev_targp; /* Handle won't be used, drop it */ - if (logdev_handle) - bdev_release(logdev_handle); + if (logdev_file) + fput(logdev_file); } return 0; @@ -496,11 +495,11 @@ xfs_open_devices( out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - if (rtdev_handle) - bdev_release(rtdev_handle); + if (rtdev_file) + fput(rtdev_file); out_close_logdev: - if (logdev_handle) - bdev_release(logdev_handle); + if (logdev_file) + fput(logdev_file); return error; } @@ -1496,6 +1495,18 @@ xfs_fs_fill_super( mp->m_super = sb; + /* + * Copy VFS mount flags from the context now that all parameter parsing + * is guaranteed to have been completed by either the old mount API or + * the newer fsopen/fsconfig API. + */ + if (fc->sb_flags & SB_RDONLY) + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); + if (fc->sb_flags & SB_DIRSYNC) + mp->m_features |= XFS_FEAT_DIRSYNC; + if (fc->sb_flags & SB_SYNCHRONOUS) + mp->m_features |= XFS_FEAT_WSYNC; + error = xfs_fs_validate_params(mp); if (error) return error; @@ -1965,6 +1976,11 @@ static const struct fs_context_operations xfs_context_ops = { .free = xfs_fs_free, }; +/* + * WARNING: do not initialise any parameters in this function that depend on + * mount option parsing having already been performed as this can be called from + * fsopen() before any parameters have been set. + */ static int xfs_init_fs_context( struct fs_context *fc) { @@ -1996,16 +2012,6 @@ static int xfs_init_fs_context( mp->m_logbsize = -1; mp->m_allocsize_log = 16; /* 64k */ - /* - * Copy binary VFS mount flags we are interested in. 
- */ - if (fc->sb_flags & SB_RDONLY) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); - if (fc->sb_flags & SB_DIRSYNC) - mp->m_features |= XFS_FEAT_DIRSYNC; - if (fc->sb_flags & SB_SYNCHRONOUS) - mp->m_features |= XFS_FEAT_WSYNC; - fc->s_fs_info = mp; fc->ops = &xfs_context_ops; diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 6ab2318a9c8e..3b103715acc9 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -125,7 +125,8 @@ static void zonefs_readahead(struct readahead_control *rac) * which implies that the page range can only be within the fixed inode size. */ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) + struct inode *inode, loff_t offset, + unsigned int len) { struct zonefs_zone *z = zonefs_inode_zone(inode); @@ -348,7 +349,12 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, struct zonefs_inode_info *zi = ZONEFS_I(inode); if (error) { - zonefs_io_error(inode, true); + /* + * For Sync IOs, error recovery is called from + * zonefs_file_dio_write(). + */ + if (!is_sync_kiocb(iocb)) + zonefs_io_error(inode, true); return error; } @@ -491,6 +497,14 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) ret = -EINVAL; goto inode_unlock; } + /* + * Advance the zone write pointer offset. This assumes that the + * IO will succeed, which is OK to do because we do not allow + * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO + * fails, the error path will correct the write pointer offset. + */ + z->z_wpoffset += count; + zonefs_inode_account_active(inode); mutex_unlock(&zi->i_truncate_mutex); } @@ -504,20 +518,19 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) if (ret == -ENOTBLK) ret = -EBUSY; - if (zonefs_zone_is_seq(z) && - (ret > 0 || ret == -EIOCBQUEUED)) { - if (ret > 0) - count = ret; - - /* - * Update the zone write pointer offset assuming the write - * operation succeeded. 
If it did not, the error recovery path - * will correct it. Also do active seq file accounting. - */ - mutex_lock(&zi->i_truncate_mutex); - z->z_wpoffset += count; - zonefs_inode_account_active(inode); - mutex_unlock(&zi->i_truncate_mutex); + /* + * For a failed IO or partial completion, trigger error recovery + * to update the zone write pointer offset to a correct value. + * For asynchronous IOs, zonefs_file_write_dio_end_io() may already + * have executed error recovery if the IO already completed when we + * reach here. However, we cannot know that and execute error recovery + * again (that will not change anything). + */ + if (zonefs_zone_is_seq(z)) { + if (ret > 0 && ret != count) + ret = -EIO; + if (ret < 0 && ret != -EIOCBQUEUED) + zonefs_io_error(inode, true); } inode_unlock: diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 93971742613a..aadad16738df 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -113,7 +113,7 @@ static int zonefs_zone_mgmt(struct super_block *sb, trace_zonefs_zone_mgmt(sb, z, op); ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector, - z->z_size >> SECTOR_SHIFT, GFP_NOFS); + z->z_size >> SECTOR_SHIFT); if (ret) { zonefs_err(sb, "Zone management operation %s at %llu failed %d\n", @@ -246,16 +246,18 @@ static void zonefs_inode_update_mode(struct inode *inode) z->z_mode = inode->i_mode; } -struct zonefs_ioerr_data { - struct inode *inode; - bool write; -}; - static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, void *data) { - struct zonefs_ioerr_data *err = data; - struct inode *inode = err->inode; + struct blk_zone *z = data; + + *z = *zone; + return 0; +} + +static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone, + bool write) +{ struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); @@ -270,8 +272,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, data_size = 
zonefs_check_zone_condition(sb, z, zone); isize = i_size_read(inode); if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && - !err->write && isize == data_size) - return 0; + !write && isize == data_size) + return; /* * At this point, we detected either a bad zone or an inconsistency @@ -292,7 +294,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * In all cases, warn about inode size inconsistency and handle the * IO error according to the zone condition and to the mount options. */ - if (zonefs_zone_is_seq(z) && isize != data_size) + if (isize != data_size) zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n", inode->i_ino, isize, data_size); @@ -352,8 +354,6 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, zonefs_i_size_write(inode, data_size); z->z_wpoffset = data_size; zonefs_inode_account_active(inode); - - return 0; } /* @@ -367,23 +367,25 @@ void __zonefs_io_error(struct inode *inode, bool write) { struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; - struct zonefs_sb_info *sbi = ZONEFS_SB(sb); unsigned int noio_flag; - unsigned int nr_zones = 1; - struct zonefs_ioerr_data err = { - .inode = inode, - .write = write, - }; + struct blk_zone zone; int ret; /* - * The only files that have more than one zone are conventional zone - * files with aggregated conventional zones, for which the inode zone - * size is always larger than the device zone size. + * Conventional zone have no write pointer and cannot become read-only + * or offline. So simply fake a report for a single or aggregated zone + * and let zonefs_handle_io_error() correct the zone inode information + * according to the mount options. 
*/ - if (z->z_size > bdev_zone_sectors(sb->s_bdev)) - nr_zones = z->z_size >> - (sbi->s_zone_sectors_shift + SECTOR_SHIFT); + if (!zonefs_zone_is_seq(z)) { + zone.start = z->z_sector; + zone.len = z->z_size >> SECTOR_SHIFT; + zone.wp = zone.start + zone.len; + zone.type = BLK_ZONE_TYPE_CONVENTIONAL; + zone.cond = BLK_ZONE_COND_NOT_WP; + zone.capacity = zone.len; + goto handle_io_error; + } /* * Memory allocations in blkdev_report_zones() can trigger a memory @@ -394,12 +396,20 @@ void __zonefs_io_error(struct inode *inode, bool write) * the GFP_NOIO context avoids both problems. */ noio_flag = memalloc_noio_save(); - ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones, - zonefs_io_error_cb, &err); - if (ret != nr_zones) + ret = blkdev_report_zones(sb->s_bdev, z->z_sector, 1, + zonefs_io_error_cb, &zone); + memalloc_noio_restore(noio_flag); + + if (ret != 1) { zonefs_err(sb, "Get inode %lu zone information failed %d\n", inode->i_ino, ret); - memalloc_noio_restore(noio_flag); + zonefs_warn(sb, "remounting filesystem read-only\n"); + sb->s_flags |= SB_RDONLY; + return; + } + +handle_io_error: + zonefs_handle_io_error(inode, &zone, write); } static struct kmem_cache *zonefs_inode_cachep; |