From 723012cab779eee8228376754e22c6594229bf8f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:44 +0000 Subject: ubifs: Set page uptodate in the correct place Page cache reads are lockless, so setting the freshly allocated page uptodate before we've overwritten it with the data it's supposed to have in it will allow a simultaneous reader to see old data. Move the call to SetPageUptodate into ubifs_write_end(), which is after we copied the new data into the page. Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Cc: stable@vger.kernel.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5029eb3390a5..d0694b83dd02 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -261,9 +261,6 @@ static int write_begin_slow(struct address_space *mapping, return err; } } - - SetPageUptodate(page); - ClearPageError(page); } if (PagePrivate(page)) @@ -463,9 +460,6 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, return err; } } - - SetPageUptodate(page); - ClearPageError(page); } err = allocate_budget(c, page, ui, appending); @@ -475,10 +469,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * If we skipped reading the page because we were going to * write all of it, then it is not up to date. */ - if (skipped_read) { + if (skipped_read) ClearPageChecked(page); - ClearPageUptodate(page); - } /* * Budgeting failed which means it would have to force * write-back but didn't, because we set the @fast flag in the @@ -569,6 +561,9 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, goto out; } + if (len == PAGE_SIZE) + SetPageUptodate(page); + if (!PagePrivate(page)) { attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); -- cgit v1.2.3 From 0df030d082d5b226984733b2e7386fa9760a7ca1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:45 +0000 Subject: ubifs: Convert from writepage to writepages This is a simplistic conversion to separate out any effects of no longer having a writepage method. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index d0694b83dd02..2022a31006df 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1001,8 +1001,10 @@ static int do_writepage(struct page *page, int len) * on the page lock and it would not write the truncated inode node to the * journal before we have finished. */ -static int ubifs_writepage(struct page *page, struct writeback_control *wbc) +static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, + void *data) { + struct page *page = &folio->page; struct inode *inode = page->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); @@ -1074,6 +1076,12 @@ out_unlock: return err; } +static int ubifs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return write_cache_pages(mapping, wbc, ubifs_writepage, NULL); +} + /** * do_attr_changes - change inode attributes. * @inode: inode to change attributes for @@ -1643,7 +1651,7 @@ static int ubifs_symlink_getattr(struct mnt_idmap *idmap, const struct address_space_operations ubifs_file_address_operations = { .read_folio = ubifs_read_folio, - .writepage = ubifs_writepage, + .writepages = ubifs_writepages, .write_begin = ubifs_write_begin, .write_end = ubifs_write_end, .invalidate_folio = ubifs_invalidate_folio, -- cgit v1.2.3 From c35acef383f4a2f2cfc30a5d8d3b0d49a86a1f7f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:46 +0000 Subject: ubifs: Convert ubifs_writepage to use a folio We still pass the page down to do_writepage(), but ubifs_writepage() itself is now large folio safe. It also contains far fewer hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 2022a31006df..a4e8bec6c03c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1004,21 +1004,18 @@ static int do_writepage(struct page *page, int len) static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); loff_t i_size = i_size_read(inode), synced_i_size; - pgoff_t end_index = i_size >> PAGE_SHIFT; - int err, len = i_size & (PAGE_SIZE - 1); - void *kaddr; + int err, len = folio_size(folio); dbg_gen("ino %lu, pg %lu, pg flags %#lx", - inode->i_ino, page->index, page->flags); - ubifs_assert(c, PagePrivate(page)); + inode->i_ino, folio->index, folio->flags); + ubifs_assert(c, folio->private != NULL); - /* Is the page fully outside @i_size? (truncate in progress) */ - if (page->index > end_index || (page->index == end_index && !len)) { + /* Is the folio fully outside @i_size? (truncate in progress) */ + if (folio_pos(folio) >= i_size) { err = 0; goto out_unlock; } @@ -1027,9 +1024,9 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, synced_i_size = ui->synced_i_size; spin_unlock(&ui->ui_lock); - /* Is the page fully inside @i_size? */ - if (page->index < end_index) { - if (page->index >= synced_i_size >> PAGE_SHIFT) { + /* Is the folio fully inside i_size? */ + if (folio_pos(folio) + len <= i_size) { + if (folio_pos(folio) >= synced_i_size) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_redirty; @@ -1042,20 +1039,18 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, * with this. */ } - return do_writepage(page, PAGE_SIZE); + return do_writepage(&folio->page, len); } /* - * The page straddles @i_size. It must be zeroed out on each and every + * The folio straddles @i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - kaddr = kmap_atomic(page); - memset(kaddr + len, 0, PAGE_SIZE - len); - flush_dcache_page(page); - kunmap_atomic(kaddr); + len = i_size - folio_pos(folio); + folio_zero_segment(folio, len, folio_size(folio)); if (i_size > synced_i_size) { err = inode->i_sb->s_op->write_inode(inode, NULL); @@ -1063,16 +1058,16 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, goto out_redirty; } - return do_writepage(page, len); + return do_writepage(&folio->page, len); out_redirty: /* - * redirty_page_for_writepage() won't call ubifs_dirty_inode() because + * folio_redirty_for_writepage() won't call ubifs_dirty_inode() because * it passes I_DIRTY_PAGES flag while calling __mark_inode_dirty(), so * there is no need to do space budget for dirty inode. */ - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); out_unlock: - unlock_page(page); + folio_unlock(folio); return err; } -- cgit v1.2.3 From 783d074167711d8109131b66dd0b70743bfc2918 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:47 +0000 Subject: ubifs: Use a folio in do_truncation() Convert from the old page APIs to the new folio APIs which saves a few hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index a4e8bec6c03c..7039c9006f24 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1153,11 +1153,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, if (offset) { pgoff_t index = new_size >> PAGE_SHIFT; - struct page *page; + struct folio *folio; - page = find_lock_page(inode->i_mapping, index); - if (page) { - if (PageDirty(page)) { + folio = filemap_lock_folio(inode->i_mapping, index); + if (!IS_ERR(folio)) { + if (folio_test_dirty(folio)) { /* * 'ubifs_jnl_truncate()' will try to truncate * the last data node, but it contains @@ -1166,14 +1166,14 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, * 'ubifs_jnl_truncate()' will see an already * truncated (and up to date) data node. */ - ubifs_assert(c, PagePrivate(page)); + ubifs_assert(c, folio->private != NULL); - clear_page_dirty_for_io(page); + folio_clear_dirty_for_io(folio); if (UBIFS_BLOCKS_PER_PAGE_SHIFT) - offset = new_size & - (PAGE_SIZE - 1); - err = do_writepage(page, offset); - put_page(page); + offset = offset_in_folio(folio, + new_size); + err = do_writepage(&folio->page, offset); + folio_put(folio); if (err) goto out_budg; /* @@ -1186,8 +1186,8 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, * to 'ubifs_jnl_truncate()' to save it from * having to read it. */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } } } -- cgit v1.2.3 From 0c2d140c1f73f610c7e3f71888e1a50d770117e0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:48 +0000 Subject: ubifs: Convert do_writepage() to take a folio Replace the call to SetPageError() with a call to mapping_set_error(). Support large folios by using kmap_local_folio() and remapping each time we cross a page boundary. Saves a lot of hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 56 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 7039c9006f24..70d391b72897 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -898,60 +898,64 @@ static int ubifs_read_folio(struct file *file, struct folio *folio) return 0; } -static int do_writepage(struct page *page, int len) +static int do_writepage(struct folio *folio, size_t len) { - int err = 0, i, blen; + int err = 0, blen; unsigned int block; void *addr; + size_t offset = 0; union ubifs_key key; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; #ifdef UBIFS_DEBUG struct ubifs_inode *ui = ubifs_inode(inode); spin_lock(&ui->ui_lock); - ubifs_assert(c, page->index <= ui->synced_i_size >> PAGE_SHIFT); + ubifs_assert(c, folio->index <= ui->synced_i_size >> PAGE_SHIFT); spin_unlock(&ui->ui_lock); #endif - /* Update radix tree tags */ - set_page_writeback(page); + folio_start_writeback(folio); - addr = kmap(page); - block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; - i = 0; - while (len) { - blen = min_t(int, len, UBIFS_BLOCK_SIZE); + addr = kmap_local_folio(folio, offset); + block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + for (;;) { + blen = min_t(size_t, len, UBIFS_BLOCK_SIZE); data_key_init(c, &key, inode->i_ino, block); err = ubifs_jnl_write_data(c, inode, &key, addr, blen); if (err) break; - if (++i >= UBIFS_BLOCKS_PER_PAGE) + len -= blen; + if (!len) break; block += 1; addr += blen; - len -= blen; + if (folio_test_highmem(folio) && !offset_in_page(addr)) { + kunmap_local(addr - blen); + offset += PAGE_SIZE; + addr = kmap_local_folio(folio, offset); + } } + kunmap_local(addr); if (err) { - SetPageError(page); - ubifs_err(c, "cannot write page %lu of inode %lu, error %d", - page->index, inode->i_ino, err); + mapping_set_error(folio->mapping, err); + ubifs_err(c, "cannot write folio %lu of inode %lu, error %d", + folio->index, inode->i_ino, err); ubifs_ro_mode(c, err); } - ubifs_assert(c, PagePrivate(page)); - if (PageChecked(page)) + ubifs_assert(c, folio->private != NULL); + if (folio_test_checked(folio)) release_new_page_budget(c); else release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - detach_page_private(page); - ClearPageChecked(page); + folio_detach_private(folio); + folio_clear_checked(folio); - kunmap(page); - unlock_page(page); - end_page_writeback(page); + folio_unlock(folio); + folio_end_writeback(folio); return err; } @@ -1039,7 +1043,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, * with this. */ } - return do_writepage(&folio->page, len); + return do_writepage(folio, len); } /* @@ -1058,7 +1062,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, goto out_redirty; } - return do_writepage(&folio->page, len); + return do_writepage(folio, len); out_redirty: /* * folio_redirty_for_writepage() won't call ubifs_dirty_inode() because @@ -1172,7 +1176,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, if (UBIFS_BLOCKS_PER_PAGE_SHIFT) offset = offset_in_folio(folio, new_size); - err = do_writepage(&folio->page, offset); + err = do_writepage(folio, offset); folio_put(folio); if (err) goto out_budg; -- cgit v1.2.3 From 85ffbf555794c2484b9ab440d3bc23638a0cb315 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:49 +0000 Subject: ubifs: Convert ubifs_vm_page_mkwrite() to use a folio Replace six implicit calls to compound_head() with one. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- Documentation/mm/page_cache.rst | 10 ++++++++++ fs/ubifs/file.c | 36 ++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/Documentation/mm/page_cache.rst b/Documentation/mm/page_cache.rst index 75eba7c431b2..138d61f869df 100644 --- a/Documentation/mm/page_cache.rst +++ b/Documentation/mm/page_cache.rst @@ -3,3 +3,13 @@ ========== Page Cache ========== + +The page cache is the primary way that the user and the rest of the kernel +interact with filesystems. It can be bypassed (e.g. with O_DIRECT), +but normal reads, writes and mmaps go through the page cache. + +Folios +====== + +The folio is the unit of memory management within the page cache. +Operations diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 70d391b72897..3c4a98476c23 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1514,14 +1514,14 @@ static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags) */ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec64 now = current_time(inode); struct ubifs_budget_req req = { .new_page = 1 }; int err, update_time; - dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, + dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, folio->index, i_size_read(inode)); ubifs_assert(c, !c->ro_media && !c->ro_mount); @@ -1529,17 +1529,17 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* -EROFS */ /* - * We have not locked @page so far so we may budget for changing the - * page. Note, we cannot do this after we locked the page, because + * We have not locked @folio so far so we may budget for changing the + * folio. Note, we cannot do this after we locked the folio, because * budgeting may cause write-back which would cause deadlock. * - * At the moment we do not know whether the page is dirty or not, so we - * assume that it is not and budget for a new page. We could look at + * At the moment we do not know whether the folio is dirty or not, so we + * assume that it is not and budget for a new folio. We could look at * the @PG_private flag and figure this out, but we may race with write - * back and the page state may change by the time we lock it, so this + * back and the folio state may change by the time we lock it, so this * would need additional care. We do not bother with this at the * moment, although it might be good idea to do. Instead, we allocate - * budget for a new page and amend it later on if the page was in fact + * budget for a new folio and amend it later on if the folio was in fact * dirty. * * The budgeting-related logic of this function is similar to what we @@ -1562,21 +1562,21 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } - lock_page(page); - if (unlikely(page->mapping != inode->i_mapping || - page_offset(page) > i_size_read(inode))) { - /* Page got truncated out from underneath us */ + folio_lock(folio); + if (unlikely(folio->mapping != inode->i_mapping || + folio_pos(folio) >= i_size_read(inode))) { + /* Folio got truncated out from underneath us */ goto sigbus; } - if (PagePrivate(page)) + if (folio->private) release_new_page_budget(c); else { - if (!PageChecked(page)) + if (!folio_test_checked(folio)) ubifs_convert_page_budget(c); - attach_page_private(page, (void *)1); + folio_attach_private(folio, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); - __set_page_dirty_nobuffers(page); + filemap_dirty_folio(folio->mapping, folio); } if (update_time) { @@ -1592,11 +1592,11 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) ubifs_release_dirty_inode_budget(c, ui); } - wait_for_stable_page(page); + folio_wait_stable(folio); return VM_FAULT_LOCKED; sigbus: - unlock_page(page); + folio_unlock(folio); ubifs_release_budget(c, &req); return VM_FAULT_SIGBUS; } -- cgit v1.2.3 From 2ec718435abb42901c83fbf62fc4d638d4078c27 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:50 +0000 Subject: ubifs: Convert write_begin_slow() to use a folio Update to new APIs, removing several calls to compound_head() and including support for large folios. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 3c4a98476c23..cc4d2ec95b70 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -222,16 +222,16 @@ static int write_begin_slow(struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; struct ubifs_budget_req req = { .new_page = 1 }; int err, appending = !!(pos + len > inode->i_size); - struct page *page; + struct folio *folio; dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", inode->i_ino, pos, len, inode->i_size); /* - * At the slow path we have to budget before locking the page, because - * budgeting may force write-back, which would wait on locked pages and - * deadlock if we had the page locked. At this point we do not know - * anything about the page, so assume that this is a new page which is + * At the slow path we have to budget before locking the folio, because + * budgeting may force write-back, which would wait on locked folios and + * deadlock if we had the folio locked. At this point we do not know + * anything about the folio, so assume that this is a new folio which is * written to a hole. This corresponds to largest budget. Later the * budget will be amended if this is not true. */ @@ -243,42 +243,43 @@ static int write_begin_slow(struct address_space *mapping, if (unlikely(err)) return err; - page = grab_cache_page_write_begin(mapping, index); - if (unlikely(!page)) { + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { ubifs_release_budget(c, &req); - return -ENOMEM; + return PTR_ERR(folio); } - if (!PageUptodate(page)) { - if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) - SetPageChecked(page); + if (!folio_test_uptodate(folio)) { + if (pos == folio_pos(folio) && len >= folio_size(folio)) + folio_set_checked(folio); else { - err = do_readpage(page); + err = do_readpage(&folio->page); if (err) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); ubifs_release_budget(c, &req); return err; } } } - if (PagePrivate(page)) + if (folio->private) /* - * The page is dirty, which means it was budgeted twice: + * The folio is dirty, which means it was budgeted twice: * o first time the budget was allocated by the task which - * made the page dirty and set the PG_private flag; + * made the folio dirty and set the private field; * o and then we budgeted for it for the second time at the * very beginning of this function. * - * So what we have to do is to release the page budget we + * So what we have to do is to release the folio budget we * allocated. */ release_new_page_budget(c); - else if (!PageChecked(page)) + else if (!folio_test_checked(folio)) /* - * We are changing a page which already exists on the media. - * This means that changing the page does not make the amount + * We are changing a folio which already exists on the media. + * This means that changing the folio does not make the amount * of indexing information larger, and this part of the budget * which we have already acquired may be released. */ @@ -301,7 +302,7 @@ static int write_begin_slow(struct address_space *mapping, ubifs_release_dirty_inode_budget(c, ui); } - *pagep = page; + *pagep = &folio->page; return 0; } -- cgit v1.2.3 From f60d356e6c5f966af5ab86701c93eb2028a66b31 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:51 +0000 Subject: ubifs: Convert ubifs_write_begin() to use a folio Save eight calls to compound_head() by using the new folio API. Remove a few assumptions that would break with large folios. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index cc4d2ec95b70..13c078bdf156 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -426,7 +426,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; int err, appending = !!(pos + len > inode->i_size); int skipped_read = 0; - struct page *page; + struct folio *folio; ubifs_assert(c, ubifs_inode(inode)->ui_size == inode->i_size); ubifs_assert(c, !c->ro_media && !c->ro_mount); @@ -435,13 +435,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, return -EROFS; /* Try out the fast-path part first */ - page = grab_cache_page_write_begin(mapping, index); - if (unlikely(!page)) - return -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* The page is not loaded from the flash */ - if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) { + if (pos == folio_pos(folio) && len >= folio_size(folio)) { /* * We change whole page so no need to load it. But we * do not know whether this page exists on the media or @@ -451,19 +452,19 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * media. Thus, we are setting the @PG_checked flag * here. */ - SetPageChecked(page); + folio_set_checked(folio); skipped_read = 1; } else { - err = do_readpage(page); + err = do_readpage(&folio->page); if (err) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } } } - err = allocate_budget(c, page, ui, appending); + err = allocate_budget(c, &folio->page, ui, appending); if (unlikely(err)) { ubifs_assert(c, err == -ENOSPC); /* @@ -471,7 +472,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * write all of it, then it is not up to date. */ if (skipped_read) - ClearPageChecked(page); + folio_clear_checked(folio); /* * Budgeting failed which means it would have to force * write-back but didn't, because we set the @fast flag in the @@ -483,8 +484,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); mutex_unlock(&ui->ui_mutex); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return write_begin_slow(mapping, pos, len, pagep); } @@ -495,9 +496,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * with @ui->ui_mutex locked if we are appending pages, and unlocked * otherwise. This is an optimization (slightly hacky though). */ - *pagep = page; + *pagep = &folio->page; return 0; - } /** -- cgit v1.2.3 From ffdff813d5b1fa738b5433c5698108c69cd4b71f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:52 +0000 Subject: ubifs: Convert ubifs_write_end() to use a folio Convert the incoming page pointer to a folio and use it throughout, saving several calls to compound_head(). Also remove some PAGE_SIZE assumptions. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 13c078bdf156..feab95b59b05 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -530,6 +530,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -537,47 +538,47 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, int appending = !!(end_pos > inode->i_size); dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", - inode->i_ino, pos, page->index, len, copied, inode->i_size); + inode->i_ino, pos, folio->index, len, copied, inode->i_size); - if (unlikely(copied < len && len == PAGE_SIZE)) { + if (unlikely(copied < len && !folio_test_uptodate(folio))) { /* - * VFS copied less data to the page that it intended and + * VFS copied less data to the folio than it intended and * declared in its '->write_begin()' call via the @len - * argument. If the page was not up-to-date, and @len was - * @PAGE_SIZE, the 'ubifs_write_begin()' function did + * argument. If the folio was not up-to-date, + * the 'ubifs_write_begin()' function did * not load it from the media (for optimization reasons). This - * means that part of the page contains garbage. So read the - * page now. + * means that part of the folio contains garbage. So read the + * folio now. */ dbg_gen("copied %d instead of %d, read page and repeat", copied, len); - cancel_budget(c, page, ui, appending); - ClearPageChecked(page); + cancel_budget(c, &folio->page, ui, appending); + folio_clear_checked(folio); /* * Return 0 to force VFS to repeat the whole operation, or the * error code if 'do_readpage()' fails. */ - copied = do_readpage(page); + copied = do_readpage(&folio->page); goto out; } - if (len == PAGE_SIZE) - SetPageUptodate(page); + if (len == folio_size(folio)) + folio_mark_uptodate(folio); - if (!PagePrivate(page)) { - attach_page_private(page, (void *)1); + if (!folio->private) { + folio_attach_private(folio, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); - __set_page_dirty_nobuffers(page); + filemap_dirty_folio(mapping, folio); } if (appending) { i_size_write(inode, end_pos); ui->ui_size = end_pos; /* - * Note, we do not set @I_DIRTY_PAGES (which means that the - * inode has dirty pages), this has been done in - * '__set_page_dirty_nobuffers()'. + * We do not set @I_DIRTY_PAGES (which means that + * the inode has dirty pages), this was done in + * filemap_dirty_folio(). */ __mark_inode_dirty(inode, I_DIRTY_DATASYNC); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); @@ -585,8 +586,8 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, } out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return copied; } -- cgit v1.2.3 From b96af1fdb47cf85eaddc3c82413b580b75aae2d2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:53 +0000 Subject: ubifs: Convert do_readpage() to take a folio All the callers now have a folio, so pass it in, and convert do_readpage() to us folios directly. Includes unifying the exit paths from this function and using kmap_local instead of plain kmap. This function should now work with large folios, but this is not tested. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 64 +++++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index feab95b59b05..654af636b11d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -96,36 +96,36 @@ dump: return -EINVAL; } -static int do_readpage(struct page *page) +static int do_readpage(struct folio *folio) { void *addr; int err = 0, i; unsigned int block, beyond; - struct ubifs_data_node *dn; - struct inode *inode = page->mapping->host; + struct ubifs_data_node *dn = NULL; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; loff_t i_size = i_size_read(inode); dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, page->index, i_size, page->flags); - ubifs_assert(c, !PageChecked(page)); - ubifs_assert(c, !PagePrivate(page)); + inode->i_ino, folio->index, i_size, folio->flags); + ubifs_assert(c, !folio_test_checked(folio)); + ubifs_assert(c, !folio->private); - addr = kmap(page); + addr = kmap_local_folio(folio, 0); - block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; if (block >= beyond) { /* Reading beyond inode */ - SetPageChecked(page); - memset(addr, 0, PAGE_SIZE); + folio_set_checked(folio); + addr = folio_zero_tail(folio, 0, addr); goto out; } dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS); if (!dn) { err = -ENOMEM; - goto error; + goto out; } i = 0; @@ -150,39 +150,35 @@ static int do_readpage(struct page *page) memset(addr + ilen, 0, dlen - ilen); } } - if (++i >= UBIFS_BLOCKS_PER_PAGE) + if (++i >= (UBIFS_BLOCKS_PER_PAGE << folio_order(folio))) break; block += 1; addr += UBIFS_BLOCK_SIZE; + if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) { + kunmap_local(addr - UBIFS_BLOCK_SIZE); + addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE); + } } + if (err) { struct ubifs_info *c = inode->i_sb->s_fs_info; if (err == -ENOENT) { /* Not found, so it must be a hole */ - SetPageChecked(page); + folio_set_checked(folio); dbg_gen("hole"); - goto out_free; + err = 0; + } else { + ubifs_err(c, "cannot read page %lu of inode %lu, error %d", + folio->index, inode->i_ino, err); } - ubifs_err(c, "cannot read page %lu of inode %lu, error %d", - page->index, inode->i_ino, err); - goto error; } -out_free: - kfree(dn); out: - SetPageUptodate(page); - ClearPageError(page); - flush_dcache_page(page); - kunmap(page); - return 0; - -error: kfree(dn); - ClearPageUptodate(page); - SetPageError(page); - flush_dcache_page(page); - kunmap(page); + if (!err) + folio_mark_uptodate(folio); + flush_dcache_folio(folio); + kunmap_local(addr); return err; } @@ -254,7 +250,7 @@ static int write_begin_slow(struct address_space *mapping, if (pos == folio_pos(folio) && len >= folio_size(folio)) folio_set_checked(folio); else { - err = do_readpage(&folio->page); + err = do_readpage(folio); if (err) { folio_unlock(folio); folio_put(folio); @@ -455,7 +451,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, folio_set_checked(folio); skipped_read = 1; } else { - err = do_readpage(&folio->page); + err = do_readpage(folio); if (err) { folio_unlock(folio); folio_put(folio); @@ -559,7 +555,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, * Return 0 to force VFS to repeat the whole operation, or the * error code if 'do_readpage()' fails. */ - copied = do_readpage(&folio->page); + copied = do_readpage(folio); goto out; } @@ -895,7 +891,7 @@ static int ubifs_read_folio(struct file *file, struct folio *folio) if (ubifs_bulk_read(page)) return 0; - do_readpage(page); + do_readpage(folio); folio_unlock(folio); return 0; } -- cgit v1.2.3 From a3c2f196cdfc21404e9678204f55ad7be8e7226e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:54 +0000 Subject: ubifs: Convert allocate_budget() to work on a folio The one caller has a folio, so pass it in instead of the page. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 654af636b11d..5542d73db717 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -305,7 +305,7 @@ static int write_begin_slow(struct address_space *mapping, /** * allocate_budget - allocate budget for 'ubifs_write_begin()'. * @c: UBIFS file-system description object - * @page: page to allocate budget for + * @folio: folio to allocate budget for * @ui: UBIFS inode object the page belongs to * @appending: non-zero if the page is appended * @@ -316,15 +316,15 @@ static int write_begin_slow(struct address_space *mapping, * * Returns: %0 in case of success and %-ENOSPC in case of failure. */ -static int allocate_budget(struct ubifs_info *c, struct page *page, +static int allocate_budget(struct ubifs_info *c, struct folio *folio, struct ubifs_inode *ui, int appending) { struct ubifs_budget_req req = { .fast = 1 }; - if (PagePrivate(page)) { + if (folio->private) { if (!appending) /* - * The page is dirty and we are not appending, which + * The folio is dirty and we are not appending, which * means no budget is needed at all. */ return 0; @@ -348,11 +348,11 @@ static int allocate_budget(struct ubifs_info *c, struct page *page, */ req.dirtied_ino = 1; } else { - if (PageChecked(page)) + if (folio_test_checked(folio)) /* * The page corresponds to a hole and does not * exist on the media. So changing it makes - * make the amount of indexing information + * the amount of indexing information * larger, and we have to budget for a new * page. */ @@ -460,7 +460,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, } } - err = allocate_budget(c, &folio->page, ui, appending); + err = allocate_budget(c, folio, ui, appending); if (unlikely(err)) { ubifs_assert(c, err == -ENOSPC); /* -- cgit v1.2.3 From 45d76698d119fffcd4d39d1cfcbfe30a87a7df77 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:55 +0000 Subject: ubifs: Convert cancel_budget() to take a folio The one caller already has a folio, so pass it in instead of the page. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5542d73db717..9cd2e6d37c5d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -499,14 +499,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, /** * cancel_budget - cancel budget. * @c: UBIFS file-system description object - * @page: page to cancel budget for + * @folio: folio to cancel budget for * @ui: UBIFS inode object the page belongs to * @appending: non-zero if the page is appended * * This is a helper function for a page write operation. It unlocks the * @ui->ui_mutex in case of appending. */ -static void cancel_budget(struct ubifs_info *c, struct page *page, +static void cancel_budget(struct ubifs_info *c, struct folio *folio, struct ubifs_inode *ui, int appending) { if (appending) { @@ -514,8 +514,8 @@ static void cancel_budget(struct ubifs_info *c, struct page *page, ubifs_release_dirty_inode_budget(c, ui); mutex_unlock(&ui->ui_mutex); } - if (!PagePrivate(page)) { - if (PageChecked(page)) + if (!folio->private) { + if (folio_test_checked(folio)) release_new_page_budget(c); else release_existing_page_budget(c); @@ -548,7 +548,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, */ dbg_gen("copied %d instead of %d, read page and repeat", copied, len); - cancel_budget(c, &folio->page, ui, appending); + cancel_budget(c, folio, ui, appending); folio_clear_checked(folio); /* -- cgit v1.2.3 From 7f348f8ce51c963d99a66d00ab73717f35dd27e1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:56 +0000 Subject: ubifs: Pass a folio into ubifs_bulk_read() and ubifs_do_bulk_read() This saves a single call to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 9cd2e6d37c5d..f481fca7f53d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -703,15 +703,15 @@ out_err: * ubifs_do_bulk_read - do bulk-read. * @c: UBIFS file-system description object * @bu: bulk-read information - * @page1: first page to read + * @folio1: first folio to read * * Returns: %1 if the bulk-read is done, otherwise %0 is returned. */ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, - struct page *page1) + struct folio *folio1) { - pgoff_t offset = page1->index, end_index; - struct address_space *mapping = page1->mapping; + pgoff_t offset = folio1->index, end_index; + struct address_space *mapping = folio1->mapping; struct inode *inode = mapping->host; struct ubifs_inode *ui = ubifs_inode(inode); int err, page_idx, page_cnt, ret = 0, n = 0; @@ -761,11 +761,11 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, goto out_warn; } - err = populate_page(c, page1, bu, &n); + err = populate_page(c, &folio1->page, bu, &n); if (err) goto out_warn; - unlock_page(page1); + folio_unlock(folio1); ret = 1; isize = i_size_read(inode); @@ -810,7 +810,7 @@ out_bu_off: /** * ubifs_bulk_read - determine whether to bulk-read and, if so, do it. - * @page: page from which to start bulk-read. + * @folio: folio from which to start bulk-read. * * Some flash media are capable of reading sequentially at faster rates. UBIFS * bulk-read facility is designed to take advantage of that, by reading in one @@ -819,12 +819,12 @@ out_bu_off: * * Returns: %1 if a bulk-read is done and %0 otherwise. */ -static int ubifs_bulk_read(struct page *page) +static int ubifs_bulk_read(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); - pgoff_t index = page->index, last_page_read = ui->last_page_read; + pgoff_t index = folio->index, last_page_read = ui->last_page_read; struct bu_info *bu; int err = 0, allocated = 0; @@ -872,8 +872,8 @@ static int ubifs_bulk_read(struct page *page) bu->buf_len = c->max_bu_buf_len; data_key_init(c, &bu->key, inode->i_ino, - page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT); - err = ubifs_do_bulk_read(c, bu, page); + folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT); + err = ubifs_do_bulk_read(c, bu, folio); if (!allocated) mutex_unlock(&c->bu_mutex); @@ -887,9 +887,7 @@ out_unlock: static int ubifs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - - if (ubifs_bulk_read(page)) + if (ubifs_bulk_read(folio)) return 0; do_readpage(folio); folio_unlock(folio); -- cgit v1.2.3 From d06192731c335cb7b62ad0dd211de26e03d56f3f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:57 +0000 Subject: ubifs: Use a folio in ubifs_do_bulk_read() When looking in the page cache, retrieve a folio instead of a page. This would need some work to make it safe for large folios. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f481fca7f53d..11bf9f8cca6c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -775,19 +775,19 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, for (page_idx = 1; page_idx < page_cnt; page_idx++) { pgoff_t page_offset = offset + page_idx; - struct page *page; + struct folio *folio; if (page_offset > end_index) break; - page = pagecache_get_page(mapping, page_offset, + folio = __filemap_get_folio(mapping, page_offset, FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT, ra_gfp_mask); - if (!page) + if (IS_ERR(folio)) break; - if (!PageUptodate(page)) - err = populate_page(c, page, bu, &n); - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) + err = populate_page(c, &folio->page, bu, &n); + folio_unlock(folio); + folio_put(folio); if (err) break; } -- cgit v1.2.3 From a16bfab367c60f1a66be03676a374451dc022b37 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:58 +0000 Subject: ubifs: Convert populate_page() to take a folio Both callers now have a folio, so pass it in. This function contains several assumptions that folios are not large. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 11bf9f8cca6c..a1f46919934c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -590,35 +590,35 @@ out: /** * populate_page - copy data nodes into a page for bulk-read. * @c: UBIFS file-system description object - * @page: page + * @folio: folio * @bu: bulk-read information * @n: next zbranch slot * * Returns: %0 on success and a negative error code on failure. */ -static int populate_page(struct ubifs_info *c, struct page *page, +static int populate_page(struct ubifs_info *c, struct folio *folio, struct bu_info *bu, int *n) { int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; loff_t i_size = i_size_read(inode); unsigned int page_block; void *addr, *zaddr; pgoff_t end_index; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, page->index, i_size, page->flags); + inode->i_ino, folio->index, i_size, folio->flags); - addr = zaddr = kmap(page); + addr = zaddr = kmap_local_folio(folio, 0); end_index = (i_size - 1) >> PAGE_SHIFT; - if (!i_size || page->index > end_index) { + if (!i_size || folio->index > end_index) { hole = 1; - memset(addr, 0, PAGE_SIZE); + addr = folio_zero_tail(folio, 0, addr); goto out_hole; } - page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + page_block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; while (1) { int err, len, out_len, dlen; @@ -667,9 +667,13 @@ static int populate_page(struct ubifs_info *c, struct page *page, break; addr += UBIFS_BLOCK_SIZE; page_block += 1; + if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) { + kunmap_local(addr - UBIFS_BLOCK_SIZE); + addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE); + } } - if (end_index == page->index) { + if (end_index == folio->index) { int len = i_size & (PAGE_SIZE - 1); if (len && len < read) @@ -678,22 +682,19 @@ static int populate_page(struct ubifs_info *c, struct page *page, out_hole: if (hole) { - SetPageChecked(page); + folio_set_checked(folio); dbg_gen("hole"); } - SetPageUptodate(page); - ClearPageError(page); - flush_dcache_page(page); - kunmap(page); + folio_mark_uptodate(folio); + flush_dcache_folio(folio); + kunmap_local(addr); *n = nn; return 0; out_err: - ClearPageUptodate(page); - SetPageError(page); - flush_dcache_page(page); - kunmap(page); + flush_dcache_folio(folio); + kunmap_local(addr); ubifs_err(c, "bad data node (block %u, inode %lu)", page_block, inode->i_ino); return -EINVAL; @@ -761,7 +762,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, goto out_warn; } - err = populate_page(c, &folio1->page, bu, &n); + err = populate_page(c, folio1, bu, &n); if (err) goto out_warn; @@ -785,7 +786,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, if (IS_ERR(folio)) break; if (!folio_test_uptodate(folio)) - err = populate_page(c, &folio->page, bu, &n); + err = populate_page(c, folio, bu, &n); folio_unlock(folio); folio_put(folio); if (err) -- cgit v1.2.3 From eb54235315f4577b035c89a10ca3eb48caab0445 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 24 Jan 2024 10:22:46 +0100 Subject: MAINTAINERS: Add Zhihao Cheng as UBI/UBIFS reviewer Recognizing Zhihao Cheng's valuable contributions, let's officially appoint him as a UBI/UBIFS reviewer. His demonstrated expertise and assistance make him a valuable addition to the MTD community. Cc: Zhihao Cheng Acked-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9ed4d3868539..94ff0f30130b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22471,6 +22471,7 @@ F: include/uapi/misc/uacce/ UBI FILE SYSTEM (UBIFS) M: Richard Weinberger +R: Zhihao Cheng L: linux-mtd@lists.infradead.org S: Supported W: http://www.linux-mtd.infradead.org/doc/ubifs.html @@ -22599,6 +22600,7 @@ F: drivers/ufs/host/ufs-renesas.c UNSORTED BLOCK IMAGES (UBI) M: Richard Weinberger +R: Zhihao Cheng L: linux-mtd@lists.infradead.org S: Supported W: http://www.linux-mtd.infradead.org/ -- cgit v1.2.3 From 68a24aba7c593eafa8fd00f2f76407b9b32b47a9 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 24 Jan 2024 07:37:02 +0100 Subject: ubi: Check for too small LEB size in VTBL code If the LEB size is smaller than a volume table record we cannot have volumes. In this case abort attaching. Cc: Chenyuan Yang Cc: stable@vger.kernel.org Fixes: 801c135ce73d ("UBI: Unsorted Block Images") Reported-by: Chenyuan Yang Closes: https://lore.kernel.org/linux-mtd/1433EB7A-FC89-47D6-8F47-23BE41B263B3@illinois.edu/ Signed-off-by: Richard Weinberger Reviewed-by: Zhihao Cheng --- drivers/mtd/ubi/vtbl.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c index f700f0e4f2ec..6e5489e233dd 100644 --- a/drivers/mtd/ubi/vtbl.c +++ b/drivers/mtd/ubi/vtbl.c @@ -791,6 +791,12 @@ int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_attach_info *ai) * The number of supported volumes is limited by the eraseblock size * and by the UBI_MAX_VOLUMES constant. */ + + if (ubi->leb_size < UBI_VTBL_RECORD_SIZE) { + ubi_err(ubi, "LEB size too small for a volume record"); + return -EINVAL; + } + ubi->vtbl_slots = ubi->leb_size / UBI_VTBL_RECORD_SIZE; if (ubi->vtbl_slots > UBI_MAX_VOLUMES) ubi->vtbl_slots = UBI_MAX_VOLUMES; -- cgit v1.2.3 From 60f16e912a53ad78306ca7cf0c95913293042411 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 13 Feb 2024 10:54:07 +0100 Subject: ubifs: fix sort function prototype The global sort() function expects a callback pointer to a function with two void* arguments, but ubifs has a function with specific object types, which causes a warning in clang-16 and higher: fs/ubifs/lprops.c:1272:9: error: cast from 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, struct ubifs_lp_stats *)' to 'ubifs_lpt_scan_callback' (aka 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, void *)') converts to incompatible function type [-Werror,-Wcast-function-type-strict] 1272 | (ubifs_lpt_scan_callback)scan_check_cb, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Change the prototype to the regular one and cast the object pointers locally instead. Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Signed-off-by: Arnd Bergmann Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/find.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 873e6e1c92b5..1cb79b167a4f 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -726,11 +726,10 @@ out: return err; } -static int cmp_dirty_idx(const struct ubifs_lprops **a, - const struct ubifs_lprops **b) +static int cmp_dirty_idx(const void *a, const void *b) { - const struct ubifs_lprops *lpa = *a; - const struct ubifs_lprops *lpb = *b; + const struct ubifs_lprops *lpa = *(const struct ubifs_lprops **)a; + const struct ubifs_lprops *lpb = *(const struct ubifs_lprops **)b; return lpa->dirty + lpa->free - lpb->dirty - lpb->free; } @@ -754,7 +753,7 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) sizeof(void *) * c->dirty_idx.cnt); /* Sort it so that the dirtiest is now at the end */ sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *), - (int (*)(const void *, const void *))cmp_dirty_idx, NULL); + cmp_dirty_idx, NULL); dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt); if (c->dirty_idx.cnt) dbg_find("dirtiest index LEB is %d with dirty %d and free %d", -- cgit v1.2.3 From ec724e534dfdd592abc5ac066be77ef15c455ccc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 13 Feb 2024 10:54:08 +0100 Subject: ubifs: fix function pointer cast warnings ubifs has a number of callback functions for ubifs_lpt_scan_nolock() using two different prototypes, either passing a struct scan_data or a struct ubifs_lp_stats, but the caller expects a void pointer instead. clang-16 now warns about this: fs/ubifs/find.c:170:9: error: cast from 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, struct scan_data *)' to 'ubifs_lpt_scan_callback' (aka 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, void *)') converts to incompatible function type [-Werror,-Wcast-function-type-strict] 170 | (ubifs_lpt_scan_callback)scan_for_dirty_cb, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fs/ubifs/find.c:449:9: error: cast from 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, struct scan_data *)' to 'ubifs_lpt_scan_callback' (aka 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, void *)') converts to incompatible function type [-Werror,-Wcast-function-type-strict] 449 | (ubifs_lpt_scan_callback)scan_for_free_cb, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Change all of these callback functions to actually take the void * argument that is passed by their caller. Signed-off-by: Arnd Bergmann Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/find.c | 23 ++++++++++++----------- fs/ubifs/lprops.c | 6 +++--- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 1cb79b167a4f..6ebf3c04ac5f 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -82,8 +82,9 @@ static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) */ static int scan_for_dirty_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -166,8 +167,7 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, data.pick_free = pick_free; data.lnum = -1; data.exclude_index = exclude_index; - err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_for_dirty_cb, + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_dirty_cb, &data); if (err) return ERR_PTR(err); @@ -349,8 +349,9 @@ out: */ static int scan_for_free_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -446,7 +447,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, data.pick_free = pick_free; data.lnum = -1; err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_for_free_cb, + scan_for_free_cb, &data); if (err) return ERR_PTR(err); @@ -589,8 +590,9 @@ out: */ static int scan_for_idx_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -625,8 +627,7 @@ static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c) int err; data.lnum = -1; - err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_for_idx_cb, + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_idx_cb, &data); if (err) return ERR_PTR(err); @@ -781,8 +782,9 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) */ static int scan_dirty_idx_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -841,8 +843,7 @@ static int find_dirty_idx_leb(struct ubifs_info *c) if (c->pnodes_have >= c->pnode_cnt) /* All pnodes are in memory, so skip scan */ return -ENOSPC; - err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_dirty_idx_cb, + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_dirty_idx_cb, &data); if (err) return err; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 6d6cd85c2b4c..a11c3dab7e16 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1014,8 +1014,9 @@ out: */ static int scan_check_cb(struct ubifs_info *c, const struct ubifs_lprops *lp, int in_tree, - struct ubifs_lp_stats *lst) + void *arg) { + struct ubifs_lp_stats *lst = arg; struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; @@ -1269,8 +1270,7 @@ int dbg_check_lprops(struct ubifs_info *c) memset(&lst, 0, sizeof(struct ubifs_lp_stats)); err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, - (ubifs_lpt_scan_callback)scan_check_cb, - &lst); + scan_check_cb, &lst); if (err && err != -ENOSPC) goto out; -- cgit v1.2.3 From 788cd161f99638db81b4c9ce836dbdf9a65c7701 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Thu, 11 Jan 2024 14:36:56 +0800 Subject: ubifs: Remove unreachable code in dbg_check_ltab_lnum Because there is no break statement in the dead loop above,it is impossible to execute the 'err=0' statement.Delete the code that will never execute. Fixes: 6fb324a4b0c3 ("UBIFS: allocate ltab checking buffer on demand") Signed-off-by: Kunwu Chan Cc: Kunwu Chan Suggested-by: Richard Weinberger Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/lpt_commit.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index c4d079328b92..07351fdce722 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1646,7 +1646,6 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) len -= node_len; } - err = 0; out: vfree(buf); return err; -- cgit v1.2.3 From 7f174ae4f39e8475adcc09d26c5a43394689ad6c Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 20 Feb 2024 10:49:03 +0800 Subject: ubi: correct the calculation of fastmap size Now that the calculation of fastmap size in ubi_calc_fm_size() is incorrect since it miss each user volume's ubi_fm_eba structure and the Internal UBI volume info. Let's correct the calculation. Cc: stable@vger.kernel.org Signed-off-by: Zhang Yi Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/fastmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/ubi/fastmap.c b/drivers/mtd/ubi/fastmap.c index 2a728c31e6b8..9a4940874be5 100644 --- a/drivers/mtd/ubi/fastmap.c +++ b/drivers/mtd/ubi/fastmap.c @@ -85,9 +85,10 @@ size_t ubi_calc_fm_size(struct ubi_device *ubi) sizeof(struct ubi_fm_scan_pool) + sizeof(struct ubi_fm_scan_pool) + (ubi->peb_count * sizeof(struct ubi_fm_ec)) + - (sizeof(struct ubi_fm_eba) + - (ubi->peb_count * sizeof(__be32))) + - sizeof(struct ubi_fm_volhdr) * UBI_MAX_VOLUMES; + ((sizeof(struct ubi_fm_eba) + + sizeof(struct ubi_fm_volhdr)) * + (UBI_MAX_VOLUMES + UBI_INT_VOL_COUNT)) + + (ubi->peb_count * sizeof(__be32)); return roundup(size, ubi->leb_size); } -- cgit v1.2.3 From fbed4baed046a2815889810c396e333820b164b6 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Sat, 13 Jan 2024 21:06:00 +0800 Subject: ubi: fix slab-out-of-bounds in ubi_eba_get_ldesc+0xfb/0x130 When using the ioctl interface to resize a UBI volume, `ubi_resize_volume` resizes the EBA table first but does not change `vol->reserved_pebs` in the same atomic context, which may cause concurrent access to the EBA table. For example, when a user shrinks UBI volume A by calling `ubi_resize_volume`, while another thread is writing to volume B and triggering wear-leveling, which may call `ubi_write_fastmap`, under these circumstances, KASAN may report a slab-out-of-bounds error in `ubi_eba_get_ldesc+0xfb/0x130`. This patch fixes race conditions in `ubi_resize_volume` and `ubi_update_fastmap` to avoid out-of-bounds reads of `eba_tbl`. First, it ensures that updates to `eba_tbl` and `reserved_pebs` are protected by `vol->volumes_lock`. Second, it implements a rollback mechanism in case of resize failure. It is also worth mentioning that for volume shrinkage failures, since part of the volume has already been shrunk and unmapped, there is no need to recover `{rsvd/avail}_pebs`. ================================================================== BUG: KASAN: slab-out-of-bounds in ubi_eba_get_ldesc+0xfb/0x130 [ubi] Read of size 4 at addr ffff88800f43f570 by task kworker/u16:0/7 CPU: 0 PID: 7 Comm: kworker/u16:0 Not tainted 5.16.0-rc7 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: dump_stack_lvl+0x4d/0x66 print_address_description.constprop.0+0x41/0x60 kasan_report.cold+0x83/0xdf ubi_eba_get_ldesc+0xfb/0x130 [ubi] ubi_update_fastmap.cold+0x60f/0xc7d [ubi] ubi_wl_get_peb+0x25b/0x4f0 [ubi] try_write_vid_and_data+0x9a/0x4d0 [ubi] ubi_eba_write_leb+0x7e4/0x17d0 [ubi] ubi_leb_map+0x1a0/0x2c0 [ubi] ubifs_leb_map+0x139/0x270 [ubifs] ubifs_add_bud_to_log+0xb40/0xf30 [ubifs] make_reservation+0x86e/0xb00 [ubifs] ubifs_jnl_write_data+0x430/0x9d0 [ubifs] do_writepage+0x1d1/0x550 [ubifs] ubifs_writepage+0x37c/0x670 [ubifs] __writepage+0x67/0x170 write_cache_pages+0x259/0xa90 do_writepages+0x277/0x5d0 __writeback_single_inode+0xb8/0x850 writeback_sb_inodes+0x4b3/0xb20 __writeback_inodes_wb+0xc1/0x220 wb_writeback+0x59f/0x740 wb_workfn+0x6d0/0xca0 process_one_work+0x711/0xfc0 worker_thread+0x95/0xd00 kthread+0x3a6/0x490 ret_from_fork+0x1f/0x30 Allocated by task 711: kasan_save_stack+0x1e/0x50 __kasan_kmalloc+0x81/0xa0 ubi_eba_create_table+0x88/0x1a0 [ubi] ubi_resize_volume.cold+0x175/0xae7 [ubi] ubi_cdev_ioctl+0x57f/0x1a60 [ubi] __x64_sys_ioctl+0x13a/0x1c0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Last potentially related work creation: kasan_save_stack+0x1e/0x50 __kasan_record_aux_stack+0xb7/0xc0 call_rcu+0xd6/0x1000 blk_stat_free_callback+0x28/0x30 blk_release_queue+0x8a/0x2e0 kobject_put+0x186/0x4c0 scsi_device_dev_release_usercontext+0x620/0xbd0 execute_in_process_context+0x2f/0x120 device_release+0xa4/0x240 kobject_put+0x186/0x4c0 put_device+0x20/0x30 __scsi_remove_device+0x1c3/0x300 scsi_probe_and_add_lun+0x2140/0x2eb0 __scsi_scan_target+0x1f2/0xbb0 scsi_scan_channel+0x11b/0x1a0 scsi_scan_host_selected+0x24c/0x310 do_scsi_scan_host+0x1e0/0x250 do_scan_async+0x45/0x490 async_run_entry_fn+0xa2/0x530 process_one_work+0x711/0xfc0 worker_thread+0x95/0xd00 kthread+0x3a6/0x490 ret_from_fork+0x1f/0x30 The buggy address belongs to the object at ffff88800f43f500 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 112 bytes inside of 128-byte region [ffff88800f43f500, ffff88800f43f580) The buggy address belongs to the page: page:ffffea00003d0f00 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0xf43c head:ffffea00003d0f00 order:2 compound_mapcount:0 compound_pincount:0 flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff) raw: 001fffff80010200 ffffea000046ba08 ffffea0000457208 ffff88810004d1c0 raw: 0000000000000000 0000000000190019 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88800f43f400: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88800f43f480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc > ffff88800f43f500: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 fc fc ^ ffff88800f43f580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88800f43f600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc The following steps can used to reproduce: Process 1: write and trigger ubi wear-leveling ubimkvol /dev/ubi0 -s 5000MiB -N v1 ubimkvol /dev/ubi0 -s 2000MiB -N v2 ubimkvol /dev/ubi0 -s 10MiB -N v3 mount -t ubifs /dev/ubi0_0 /mnt/ubifs while true; do filename=/mnt/ubifs/$((RANDOM)) dd if=/dev/random of=${filename} bs=1M count=$((RANDOM % 1000)) rm -rf ${filename} sync /mnt/ubifs/ done Process 2: do random resize struct ubi_rsvol_req req; req.vol_id = 1; req.bytes = (rand() % 50) * 512KB; ioctl(fd, UBI_IOCRSVOL, &req); V3: - Fix the commit message error. V2: - Add volumes_lock in ubi_eba_copy_leb() to avoid race caused by updating eba_tbl. V1: - Rebase the patch on the latest mainline. Signed-off-by: Guo Xuenan Signed-off-by: ZhaoLong Wang Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/eba.c | 7 +++++++ drivers/mtd/ubi/vmt.c | 24 +++++++++++++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c index 8d1f0e05892c..e5ac3cd0bbae 100644 --- a/drivers/mtd/ubi/eba.c +++ b/drivers/mtd/ubi/eba.c @@ -1456,7 +1456,14 @@ int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to, } ubi_assert(vol->eba_tbl->entries[lnum].pnum == from); + + /** + * The volumes_lock lock is needed here to prevent the expired old eba_tbl + * being updated when the eba_tbl is copied in the ubi_resize_volume() process. + */ + spin_lock(&ubi->volumes_lock); vol->eba_tbl->entries[lnum].pnum = to; + spin_unlock(&ubi->volumes_lock); out_unlock_buf: mutex_unlock(&ubi->buf_mutex); diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 2c867d16f89f..97294def01eb 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -408,6 +408,7 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) struct ubi_device *ubi = vol->ubi; struct ubi_vtbl_record vtbl_rec; struct ubi_eba_table *new_eba_tbl = NULL; + struct ubi_eba_table *old_eba_tbl = NULL; int vol_id = vol->vol_id; if (ubi->ro_mode) @@ -453,10 +454,13 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) err = -ENOSPC; goto out_free; } + ubi->avail_pebs -= pebs; ubi->rsvd_pebs += pebs; ubi_eba_copy_table(vol, new_eba_tbl, vol->reserved_pebs); - ubi_eba_replace_table(vol, new_eba_tbl); + old_eba_tbl = vol->eba_tbl; + vol->eba_tbl = new_eba_tbl; + vol->reserved_pebs = reserved_pebs; spin_unlock(&ubi->volumes_lock); } @@ -471,7 +475,9 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) ubi->avail_pebs -= pebs; ubi_update_reserved(ubi); ubi_eba_copy_table(vol, new_eba_tbl, reserved_pebs); - ubi_eba_replace_table(vol, new_eba_tbl); + old_eba_tbl = vol->eba_tbl; + vol->eba_tbl = new_eba_tbl; + vol->reserved_pebs = reserved_pebs; spin_unlock(&ubi->volumes_lock); } @@ -493,7 +499,6 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) if (err) goto out_acc; - vol->reserved_pebs = reserved_pebs; if (vol->vol_type == UBI_DYNAMIC_VOLUME) { vol->used_ebs = reserved_pebs; vol->last_eb_bytes = vol->usable_leb_size; @@ -501,19 +506,24 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) (long long)vol->used_ebs * vol->usable_leb_size; } + /* destroy old table */ + ubi_eba_destroy_table(old_eba_tbl); ubi_volume_notify(ubi, vol, UBI_VOLUME_RESIZED); self_check_volumes(ubi); return err; out_acc: + spin_lock(&ubi->volumes_lock); + vol->reserved_pebs = reserved_pebs - pebs; if (pebs > 0) { - spin_lock(&ubi->volumes_lock); ubi->rsvd_pebs -= pebs; ubi->avail_pebs += pebs; - spin_unlock(&ubi->volumes_lock); + ubi_eba_copy_table(vol, old_eba_tbl, vol->reserved_pebs); + } else { + ubi_eba_copy_table(vol, old_eba_tbl, reserved_pebs); } - return err; - + vol->eba_tbl = old_eba_tbl; + spin_unlock(&ubi->volumes_lock); out_free: ubi_eba_destroy_table(new_eba_tbl); return err; -- cgit v1.2.3 From 9277b3a64953c09e88a33adf59fb085e0a87d357 Mon Sep 17 00:00:00 2001 From: ZhaoLong Wang Date: Sat, 13 Jan 2024 21:06:01 +0800 Subject: ubi: Correct the number of PEBs after a volume resize failure In the error handling path `out_acc` of `ubi_resize_volume()`, when `pebs < 0`, it indicates that the volume table record failed to update when the volume was shrunk. In this case, the number of `ubi->avail_pebs` and `ubi->rsvd_pebs` should be restored to their previous values to prevent the UBI layer from reporting an incorrect number of available PEBs. Signed-off-by: ZhaoLong Wang Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/vmt.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 97294def01eb..990571287e84 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -515,13 +515,12 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) out_acc: spin_lock(&ubi->volumes_lock); vol->reserved_pebs = reserved_pebs - pebs; - if (pebs > 0) { - ubi->rsvd_pebs -= pebs; - ubi->avail_pebs += pebs; + ubi->rsvd_pebs -= pebs; + ubi->avail_pebs += pebs; + if (pebs > 0) ubi_eba_copy_table(vol, old_eba_tbl, vol->reserved_pebs); - } else { + else ubi_eba_copy_table(vol, old_eba_tbl, reserved_pebs); - } vol->eba_tbl = old_eba_tbl; spin_unlock(&ubi->volumes_lock); out_free: -- cgit v1.2.3 From 31a9d5f3290c5483d41c6b5c8ebc0add130da770 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 8 Jan 2024 10:41:04 +0800 Subject: ubifs: dbg_check_idx_size: Fix kmemleak if loading znode failed If function dbg_check_idx_size() failed by loading znode in mounting process, there are two problems: 1. Allocated znodes won't be freed, which causes kmemleak in kernel: ubifs_mount dbg_check_idx_size dbg_walk_index c->zroot.znode = ubifs_load_znode child = ubifs_load_znode // failed // Loaded znodes won't be freed in error handling path. 2. Global variable ubifs_clean_zn_cnt is not decreased, because ubifs_tnc_close() is not invoked in error handling path, which triggers a warning in ubifs_exit(): WARNING: CPU: 1 PID: 1576 at fs/ubifs/super.c:2486 ubifs_exit Modules linked in: zstd ubifs(-) ubi nandsim CPU: 1 PID: 1576 Comm: rmmod Not tainted 6.7.0-rc6 Call Trace: ubifs_exit+0xca/0xc70 [ubifs] __do_sys_delete_module+0x29a/0x4a0 do_syscall_64+0x6f/0x140 Fix it by adding error handling path in dbg_check_idx_size() to release tnc tree. Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Suggested-by: Richard Weinberger Signed-off-by: Richard Weinberger --- fs/ubifs/debug.c | 9 +++++++-- fs/ubifs/tnc.c | 9 +-------- fs/ubifs/tnc_misc.c | 22 ++++++++++++++++++++++ fs/ubifs/ubifs.h | 1 + 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index d013c5b3f1ed..ac77ac1fd73e 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -1742,17 +1742,22 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) err = dbg_walk_index(c, NULL, add_size, &calc); if (err) { ubifs_err(c, "error %d while walking the index", err); - return err; + goto out_err; } if (calc != idx_size) { ubifs_err(c, "index size check failed: calculated size is %lld, should be %lld", calc, idx_size); dump_stack(); - return -EINVAL; + err = -EINVAL; + goto out_err; } return 0; + +out_err: + ubifs_destroy_tnc_tree(c); + return err; } /** diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index f4728e65d1bd..45cacdcd4746 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -3116,14 +3116,7 @@ static void tnc_destroy_cnext(struct ubifs_info *c) void ubifs_tnc_close(struct ubifs_info *c) { tnc_destroy_cnext(c); - if (c->zroot.znode) { - long n, freed; - - n = atomic_long_read(&c->clean_zn_cnt); - freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode); - ubifs_assert(c, freed == n); - atomic_long_sub(n, &ubifs_clean_zn_cnt); - } + ubifs_destroy_tnc_tree(c); kfree(c->gap_lebs); kfree(c->ilebs); destroy_old_idx(c); diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index 4d686e34e64d..d3f8a6aa1f49 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -250,6 +250,28 @@ long ubifs_destroy_tnc_subtree(const struct ubifs_info *c, } } +/** + * ubifs_destroy_tnc_tree - destroy all znodes connected to the TNC tree. + * @c: UBIFS file-system description object + * + * This function destroys the whole TNC tree and updates clean global znode + * count. + */ +void ubifs_destroy_tnc_tree(struct ubifs_info *c) +{ + long n, freed; + + if (!c->zroot.znode) + return; + + n = atomic_long_read(&c->clean_zn_cnt); + freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode); + ubifs_assert(c, freed == n); + atomic_long_sub(n, &ubifs_clean_zn_cnt); + + c->zroot.znode = NULL; +} + /** * read_znode - read an indexing node from flash and fill znode. * @c: UBIFS file-system description object diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 3916dc4f30ca..6eba287ae66c 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1903,6 +1903,7 @@ struct ubifs_znode *ubifs_tnc_postorder_next(const struct ubifs_info *c, struct ubifs_znode *znode); long ubifs_destroy_tnc_subtree(const struct ubifs_info *c, struct ubifs_znode *zr); +void ubifs_destroy_tnc_tree(struct ubifs_info *c); struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr, struct ubifs_znode *parent, int iip); -- cgit v1.2.3 From 6379b44cdcd67f5f5d986b73953e99700591edfa Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 8 Jan 2024 10:41:05 +0800 Subject: ubifs: ubifs_symlink: Fix memleak of inode->i_link in error path For error handling path in ubifs_symlink(), inode will be marked as bad first, then iput() is invoked. If inode->i_link is initialized by fscrypt_encrypt_symlink() in encryption scenario, inode->i_link won't be freed by callchain ubifs_free_inode -> fscrypt_free_inode in error handling path, because make_bad_inode() has changed 'inode->i_mode' as 'S_IFREG'. Following kmemleak is easy to be reproduced by injecting error in ubifs_jnl_update() when doing symlink in encryption scenario: unreferenced object 0xffff888103da3d98 (size 8): comm "ln", pid 1692, jiffies 4294914701 (age 12.045s) backtrace: kmemdup+0x32/0x70 __fscrypt_encrypt_symlink+0xed/0x1c0 ubifs_symlink+0x210/0x300 [ubifs] vfs_symlink+0x216/0x360 do_symlinkat+0x11a/0x190 do_syscall_64+0x3b/0xe0 There are two ways fixing it: 1. Remove make_bad_inode() in error handling path. We can do that because ubifs_evict_inode() will do same processes for good symlink inode and bad symlink inode, for inode->i_nlink checking is before is_bad_inode(). 2. Free inode->i_link before marking inode bad. Method 2 is picked, it has less influence, personally, I think. Cc: stable@vger.kernel.org Fixes: 2c58d548f570 ("fscrypt: cache decrypted symlink target in ->i_link") Signed-off-by: Zhihao Cheng Suggested-by: Eric Biggers Reviewed-by: Eric Biggers Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e413a9cf8ee3..6b3db00d9b12 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1134,6 +1134,8 @@ out_cancel: dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); out_inode: + /* Free inode->i_link before inode is marked as bad. */ + fscrypt_free_inode(inode); make_bad_inode(inode); iput(inode); out_fname: -- cgit v1.2.3 From 556c19f563b64dd5463b0030d19c8681f4f7446e Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 22 Jan 2024 14:31:03 +0800 Subject: ubifs: Queue up space reservation tasks if retrying many times Recently we catched ENOSPC returned by make_reservation() while doing fsstress on UBIFS, we got following information when it occurred (See details in Link): UBIFS error (ubi0:0 pid 3640152): make_reservation [ubifs]: cannot reserve 112 bytes in jhead 2, error -28 CPU: 2 PID: 3640152 Comm: kworker/u16:2 Tainted: G B W Hardware name: Hisilicon PhosphorHi1230 EMU (DT) Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call trace: dump_stack+0x114/0x198 make_reservation+0x564/0x610 [ubifs] ubifs_jnl_write_data+0x328/0x48c [ubifs] do_writepage+0x2a8/0x3e4 [ubifs] ubifs_writepage+0x16c/0x374 [ubifs] generic_writepages+0xb4/0x114 do_writepages+0xcc/0x11c writeback_sb_inodes+0x2d0/0x564 wb_writeback+0x20c/0x2b4 wb_workfn+0x404/0x510 process_one_work+0x304/0x4ac worker_thread+0x31c/0x4e4 kthread+0x23c/0x290 Budgeting info: data budget sum 17576, total budget sum 17768 budg_data_growth 4144, budg_dd_growth 13432, budg_idx_growth 192 min_idx_lebs 13, old_idx_sz 988640, uncommitted_idx 0 page_budget 4144, inode_budget 160, dent_budget 312 nospace 0, nospace_rp 0 dark_wm 8192, dead_wm 4096, max_idx_node_sz 192 freeable_cnt 0, calc_idx_sz 988640, idx_gc_cnt 0 dirty_pg_cnt 4, dirty_zn_cnt 0, clean_zn_cnt 4811 gc_lnum 21, ihead_lnum 14 jhead 0 (GC) LEB 16 jhead 1 (base) LEB 34 jhead 2 (data) LEB 23 bud LEB 16 bud LEB 23 bud LEB 34 old bud LEB 33 old bud LEB 31 old bud LEB 15 commit state 4 Budgeting predictions: available: 33832, outstanding 17576, free 15356 (pid 3640152) start dumping LEB properties (pid 3640152) Lprops statistics: empty_lebs 3, idx_lebs 11 taken_empty_lebs 1, total_free 1253376, total_dirty 2445736 total_used 3438712, total_dark 65536, total_dead 17248 LEB 15 free 0 dirty 248000 used 5952 (taken) LEB 16 free 110592 dirty 896 used 142464 (taken, jhead 0 (GC)) LEB 21 free 253952 dirty 0 used 0 (taken, GC LEB) LEB 23 free 0 dirty 248104 used 5848 (taken, jhead 2 (data)) LEB 29 free 253952 dirty 0 used 0 (empty) LEB 33 free 0 dirty 253952 used 0 (taken) LEB 34 free 217088 dirty 36544 used 320 (taken, jhead 1 (base)) LEB 37 free 253952 dirty 0 used 0 (empty) OTHERS: index lebs, zero-available non-index lebs According to the budget algorithm, there are 5 LEBs reserved for budget: three journal heads(16,23,34), 1 GC LEB(21) and 1 deletion LEB(can be used in make_reservation()). There are 2 empty LEBs used for index nodes, which is calculated as min_idx_lebs - idx_lebs = 2. In theory, LEB 15 and 33 should be reclaimed as free state after committing, but it is now in taken state. After looking the realization of reserve_space(), there's a possible situation: LEB 15: free 2000 dirty 248000 used 3952 (jhead 2) LEB 23: free 2000 dirty 248104 used 3848 (bud, taken) LEB 33: free 2000 dirty 251952 used 0 (bud, taken) wb_workfn wb_workfn_2 do_writepage // write 3000 bytes ubifs_jnl_write_data make_reservation reserve_space ubifs_garbage_collect ubifs_find_dirty_leb // ret ENOSPC, dirty LEBs are taken nospc_retries++ // 1 ubifs_run_commit do_commit LEB 15: free 2000 dirty 248000 used 3952 (jhead 2) LEB 23: free 2000 dirty 248104 used 3848 (dirty) LEB 33: free 2000 dirty 251952 used 0 (dirty) do_writepage // write 2000 bytes for 3 times ubifs_jnl_write_data // grabs 15\23\33 LEB 15: free 0 dirty 248000 used 5952 (bud, taken) LEB 23: free 0 dirty 248104 used 5848 (jhead 2) LEB 33: free 0 dirty 253952 used 0 (bud, taken) reserve_space ubifs_garbage_collect ubifs_find_dirty_leb // ret ENOSPC, dirty LEBs are taken if (nospc_retries++ < 2) // false ubifs_ro_mode ! Fetch a reproducer in Link. The dirty LEBs could be grabbed by other threads, which fails finding dirty LEBs of GC in current thread, so make_reservation() could try many times to invoke GC&&committing, but current realization limits the times of retrying as 'nospc_retries'(twice). Fix it by adding a wait queue, start queuing up space reservation tasks when someone task has retried gc + commit for many times. Then there is only one task making space reservation at any time, and it can always make success under the premise of correct budgeting. Link: https://bugzilla.kernel.org/show_bug.cgi?id=218164 Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Signed-off-by: Zhang Yi Signed-off-by: Richard Weinberger --- fs/ubifs/journal.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++------- fs/ubifs/super.c | 2 + fs/ubifs/ubifs.h | 4 ++ 3 files changed, 155 insertions(+), 22 deletions(-) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index f0a5538c84b0..74aee92433d7 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -292,6 +292,96 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, return err; } +/** + * __queue_and_wait - queue a task and wait until the task is waked up. + * @c: UBIFS file-system description object + * + * This function adds current task in queue and waits until the task is waked + * up. This function should be called with @c->reserve_space_wq locked. + */ +static void __queue_and_wait(struct ubifs_info *c) +{ + DEFINE_WAIT(wait); + + __add_wait_queue_entry_tail_exclusive(&c->reserve_space_wq, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&c->reserve_space_wq.lock); + + schedule(); + finish_wait(&c->reserve_space_wq, &wait); +} + +/** + * wait_for_reservation - try queuing current task to wait until waked up. + * @c: UBIFS file-system description object + * + * This function queues current task to wait until waked up, if queuing is + * started(@c->need_wait_space is not %0). Returns %true if current task is + * added in queue, otherwise %false is returned. + */ +static bool wait_for_reservation(struct ubifs_info *c) +{ + if (likely(atomic_read(&c->need_wait_space) == 0)) + /* Quick path to check whether queuing is started. */ + return false; + + spin_lock(&c->reserve_space_wq.lock); + if (atomic_read(&c->need_wait_space) == 0) { + /* Queuing is not started, don't queue current task. */ + spin_unlock(&c->reserve_space_wq.lock); + return false; + } + + __queue_and_wait(c); + return true; +} + +/** + * wake_up_reservation - wake up first task in queue or stop queuing. + * @c: UBIFS file-system description object + * + * This function wakes up the first task in queue if it exists, or stops + * queuing if no tasks in queue. + */ +static void wake_up_reservation(struct ubifs_info *c) +{ + spin_lock(&c->reserve_space_wq.lock); + if (waitqueue_active(&c->reserve_space_wq)) + wake_up_locked(&c->reserve_space_wq); + else + /* + * Compared with wait_for_reservation(), set @c->need_wait_space + * under the protection of wait queue lock, which can avoid that + * @c->need_wait_space is set to 0 after new task queued. + */ + atomic_set(&c->need_wait_space, 0); + spin_unlock(&c->reserve_space_wq.lock); +} + +/** + * wake_up_reservation - add current task in queue or start queuing. + * @c: UBIFS file-system description object + * + * This function starts queuing if queuing is not started, otherwise adds + * current task in queue. + */ +static void add_or_start_queue(struct ubifs_info *c) +{ + spin_lock(&c->reserve_space_wq.lock); + if (atomic_cmpxchg(&c->need_wait_space, 0, 1) == 0) { + /* Starts queuing, task can go on directly. */ + spin_unlock(&c->reserve_space_wq.lock); + return; + } + + /* + * There are at least two tasks have retried more than 32 times + * at certain point, first task has started queuing, just queue + * the left tasks. + */ + __queue_and_wait(c); +} + /** * make_reservation - reserve journal space. * @c: UBIFS file-system description object @@ -311,33 +401,27 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, static int make_reservation(struct ubifs_info *c, int jhead, int len) { int err, cmt_retries = 0, nospc_retries = 0; + bool blocked = wait_for_reservation(c); again: down_read(&c->commit_sem); err = reserve_space(c, jhead, len); - if (!err) + if (!err) { /* c->commit_sem will get released via finish_reservation(). */ - return 0; + goto out_wake_up; + } up_read(&c->commit_sem); if (err == -ENOSPC) { /* * GC could not make any progress. We should try to commit - * once because it could make some dirty space and GC would - * make progress, so make the error -EAGAIN so that the below + * because it could make some dirty space and GC would make + * progress, so make the error -EAGAIN so that the below * will commit and re-try. */ - if (nospc_retries++ < 2) { - dbg_jnl("no space, retry"); - err = -EAGAIN; - } - - /* - * This means that the budgeting is incorrect. We always have - * to be able to write to the media, because all operations are - * budgeted. Deletions are not budgeted, though, but we reserve - * an extra LEB for them. - */ + nospc_retries++; + dbg_jnl("no space, retry"); + err = -EAGAIN; } if (err != -EAGAIN) @@ -349,15 +433,37 @@ again: */ if (cmt_retries > 128) { /* - * This should not happen unless the journal size limitations - * are too tough. + * This should not happen unless: + * 1. The journal size limitations are too tough. + * 2. The budgeting is incorrect. We always have to be able to + * write to the media, because all operations are budgeted. + * Deletions are not budgeted, though, but we reserve an + * extra LEB for them. */ - ubifs_err(c, "stuck in space allocation"); + ubifs_err(c, "stuck in space allocation, nospc_retries %d", + nospc_retries); err = -ENOSPC; goto out; - } else if (cmt_retries > 32) - ubifs_warn(c, "too many space allocation re-tries (%d)", - cmt_retries); + } else if (cmt_retries > 32) { + /* + * It's almost impossible to happen, unless there are many tasks + * making reservation concurrently and someone task has retried + * gc + commit for many times, generated available space during + * this period are grabbed by other tasks. + * But if it happens, start queuing up all tasks that will make + * space reservation, then there is only one task making space + * reservation at any time, and it can always make success under + * the premise of correct budgeting. + */ + ubifs_warn(c, "too many space allocation cmt_retries (%d) " + "nospc_retries (%d), start queuing tasks", + cmt_retries, nospc_retries); + + if (!blocked) { + blocked = true; + add_or_start_queue(c); + } + } dbg_jnl("-EAGAIN, commit and retry (retried %d times)", cmt_retries); @@ -365,7 +471,7 @@ again: err = ubifs_run_commit(c); if (err) - return err; + goto out_wake_up; goto again; out: @@ -380,6 +486,27 @@ out: cmt_retries = dbg_check_lprops(c); up_write(&c->commit_sem); } +out_wake_up: + if (blocked) { + /* + * Only tasks that have ever started queuing or ever been queued + * can wake up other queued tasks, which can make sure that + * there is only one task waked up to make space reservation. + * For example: + * task A task B task C + * make_reservation make_reservation + * reserve_space // 0 + * wake_up_reservation + * atomic_cmpxchg // 0, start queuing + * reserve_space + * wait_for_reservation + * __queue_and_wait + * add_wait_queue + * if (blocked) // false + * // So that task C won't be waked up to race with task B + */ + wake_up_reservation(c); + } return err; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 09e270d6ed02..571c9dc92b94 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2151,6 +2151,8 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) mutex_init(&c->bu_mutex); mutex_init(&c->write_reserve_mutex); init_waitqueue_head(&c->cmt_wq); + init_waitqueue_head(&c->reserve_space_wq); + atomic_set(&c->need_wait_space, 0); c->buds = RB_ROOT; c->old_idx = RB_ROOT; c->size_tree = RB_ROOT; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 6eba287ae66c..1f3ea879d93a 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1047,6 +1047,8 @@ struct ubifs_debug_info; * @bg_bud_bytes: number of bud bytes when background commit is initiated * @old_buds: buds to be released after commit ends * @max_bud_cnt: maximum number of buds + * @need_wait_space: Non %0 means space reservation tasks need to wait in queue + * @reserve_space_wq: wait queue to sleep on if @need_wait_space is not %0 * * @commit_sem: synchronizes committer with other processes * @cmt_state: commit state @@ -1305,6 +1307,8 @@ struct ubifs_info { long long bg_bud_bytes; struct list_head old_buds; int max_bud_cnt; + atomic_t need_wait_space; + wait_queue_head_t reserve_space_wq; struct rw_semaphore commit_sem; int cmt_state; -- cgit v1.2.3 From e17f38b736691de1cf9e842c748e2960341c9870 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:32:00 +0000 Subject: dt-bindings: mtd: add basic bindings for UBI Add basic bindings for UBI devices and volumes. Signed-off-by: Daniel Golle Reviewed-by: Rob Herring Signed-off-by: Richard Weinberger --- .../bindings/mtd/partitions/linux,ubi.yaml | 65 ++++++++++++++++++++++ .../bindings/mtd/partitions/ubi-volume.yaml | 35 ++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml create mode 100644 Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml diff --git a/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml b/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml new file mode 100644 index 000000000000..7084a1945b31 --- /dev/null +++ b/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mtd/partitions/linux,ubi.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Unsorted Block Images + +description: | + UBI ("Unsorted Block Images") is a volume management system for raw + flash devices which manages multiple logical volumes on a single + physical flash device and spreads the I/O load (i.e wear-leveling) + across the whole flash chip. + +maintainers: + - Daniel Golle + +allOf: + - $ref: partition.yaml# + +properties: + compatible: + const: linux,ubi + + volumes: + type: object + description: UBI Volumes + + patternProperties: + "^ubi-volume-.*$": + $ref: /schemas/mtd/partitions/ubi-volume.yaml# + + unevaluatedProperties: false + +required: + - compatible + +unevaluatedProperties: false + +examples: + - | + partitions { + compatible = "fixed-partitions"; + #address-cells = <1>; + #size-cells = <1>; + + partition@0 { + reg = <0x0 0x100000>; + label = "bootloader"; + read-only; + }; + + partition@100000 { + reg = <0x100000 0x1ff00000>; + label = "ubi"; + compatible = "linux,ubi"; + + volumes { + ubi-volume-caldata { + volid = <2>; + volname = "rf"; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml b/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml new file mode 100644 index 000000000000..1e3f04dedc01 --- /dev/null +++ b/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mtd/partitions/ubi-volume.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: UBI volume + +description: | + This binding describes a single UBI volume. Volumes can be matches either + by their ID or their name, or both. + +maintainers: + - Daniel Golle + +properties: + volid: + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Match UBI volume ID + + volname: + $ref: /schemas/types.yaml#/definitions/string + description: + Match UBI volume ID + +anyOf: + - required: + - volid + + - required: + - volname + +# This is a generic file other binding inherit from and extend +additionalProperties: true -- cgit v1.2.3 From a1de28dd203176bb9cf62576a90d761aa57fe924 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:32:11 +0000 Subject: dt-bindings: mtd: ubi-volume: allow UBI volumes to provide NVMEM UBI volumes may be used to contain NVMEM bits, typically device MAC addresses or wireless radio calibration data. Signed-off-by: Daniel Golle Reviewed-by: Rob Herring Signed-off-by: Richard Weinberger --- .../devicetree/bindings/mtd/partitions/linux,ubi.yaml | 10 ++++++++++ .../devicetree/bindings/mtd/partitions/ubi-volume.yaml | 5 +++++ 2 files changed, 15 insertions(+) diff --git a/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml b/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml index 7084a1945b31..27e1ac1f252e 100644 --- a/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml +++ b/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml @@ -59,6 +59,16 @@ examples: ubi-volume-caldata { volid = <2>; volname = "rf"; + + nvmem-layout { + compatible = "fixed-layout"; + #address-cells = <1>; + #size-cells = <1>; + + eeprom@0 { + reg = <0x0 0x1000>; + }; + }; }; }; }; diff --git a/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml b/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml index 1e3f04dedc01..19736b26056b 100644 --- a/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml +++ b/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml @@ -24,6 +24,11 @@ properties: description: Match UBI volume ID + nvmem-layout: + $ref: /schemas/nvmem/layouts/nvmem-layout.yaml# + description: + This container may reference an NVMEM layout parser. + anyOf: - required: - volid -- cgit v1.2.3 From 762d73cd930e3073e927b2ec0811519bde2c8fb4 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:32:35 +0000 Subject: mtd: ubi: block: use notifier to create ubiblock from parameter Use UBI_VOLUME_ADDED notification to create ubiblock device specified on kernel cmdline or module parameter. This makes thing more simple and has the advantage that ubiblock devices on volumes which are not present at the time the ubi module is probed will still be created. Suggested-by: Zhihao Cheng Signed-off-by: Daniel Golle Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/block.c | 136 ++++++++++++++++++++++++------------------------ drivers/mtd/ubi/kapi.c | 54 +++++++++++++------ drivers/mtd/ubi/ubi.h | 1 + 3 files changed, 106 insertions(+), 85 deletions(-) diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 654bd7372cd8..b52ff32f624a 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -65,10 +65,10 @@ struct ubiblock_pdu { }; /* Numbers of elements set in the @ubiblock_param array */ -static int ubiblock_devs __initdata; +static int ubiblock_devs; /* MTD devices specification parameters */ -static struct ubiblock_param ubiblock_param[UBIBLOCK_MAX_DEVICES] __initdata; +static struct ubiblock_param ubiblock_param[UBIBLOCK_MAX_DEVICES]; struct ubiblock { struct ubi_volume_desc *desc; @@ -534,6 +534,70 @@ static int ubiblock_resize(struct ubi_volume_info *vi) return 0; } +static bool +match_volume_desc(struct ubi_volume_info *vi, const char *name, int ubi_num, int vol_id) +{ + int err, len, cur_ubi_num, cur_vol_id; + + if (ubi_num == -1) { + /* No ubi num, name must be a vol device path */ + err = ubi_get_num_by_path(name, &cur_ubi_num, &cur_vol_id); + if (err || vi->ubi_num != cur_ubi_num || vi->vol_id != cur_vol_id) + return false; + + return true; + } + + if (vol_id == -1) { + /* Got ubi_num, but no vol_id, name must be volume name */ + if (vi->ubi_num != ubi_num) + return false; + + len = strnlen(name, UBI_VOL_NAME_MAX + 1); + if (len < 1 || vi->name_len != len) + return false; + + if (strcmp(name, vi->name)) + return false; + + return true; + } + + if (vi->ubi_num != ubi_num) + return false; + + if (vi->vol_id != vol_id) + return false; + + return true; +} + +static void +ubiblock_create_from_param(struct ubi_volume_info *vi) +{ + int i, ret = 0; + struct ubiblock_param *p; + + /* + * Iterate over ubiblock cmdline parameters. If a parameter matches the + * newly added volume create the ubiblock device for it. + */ + for (i = 0; i < ubiblock_devs; i++) { + p = &ubiblock_param[i]; + + if (!match_volume_desc(vi, p->name, p->ubi_num, p->vol_id)) + continue; + + ret = ubiblock_create(vi); + if (ret) { + pr_err( + "UBI: block: can't add '%s' volume on ubi%d_%d, err=%d\n", + vi->name, p->ubi_num, p->vol_id, ret); + } + break; + } +} + static int ubiblock_notify(struct notifier_block *nb, unsigned long notification_type, void *ns_ptr) { @@ -541,10 +605,7 @@ static int ubiblock_notify(struct notifier_block *nb, switch (notification_type) { case UBI_VOLUME_ADDED: - /* - * We want to enforce explicit block device creation for - * volumes, so when a volume is added we do nothing. - */ + ubiblock_create_from_param(&nt->vi); break; case UBI_VOLUME_REMOVED: ubiblock_remove(&nt->vi); @@ -570,56 +631,6 @@ static struct notifier_block ubiblock_notifier = { .notifier_call = ubiblock_notify, }; -static struct ubi_volume_desc * __init -open_volume_desc(const char *name, int ubi_num, int vol_id) -{ - if (ubi_num == -1) - /* No ubi num, name must be a vol device path */ - return ubi_open_volume_path(name, UBI_READONLY); - else if (vol_id == -1) - /* No vol_id, must be vol_name */ - return ubi_open_volume_nm(ubi_num, name, UBI_READONLY); - else - return ubi_open_volume(ubi_num, vol_id, UBI_READONLY); -} - -static void __init ubiblock_create_from_param(void) -{ - int i, ret = 0; - struct ubiblock_param *p; - struct ubi_volume_desc *desc; - struct ubi_volume_info vi; - - /* - * If there is an error creating one of the ubiblocks, continue on to - * create the following ubiblocks. This helps in a circumstance where - * the kernel command-line specifies multiple block devices and some - * may be broken, but we still want the working ones to come up. - */ - for (i = 0; i < ubiblock_devs; i++) { - p = &ubiblock_param[i]; - - desc = open_volume_desc(p->name, p->ubi_num, p->vol_id); - if (IS_ERR(desc)) { - pr_err( - "UBI: block: can't open volume on ubi%d_%d, err=%ld\n", - p->ubi_num, p->vol_id, PTR_ERR(desc)); - continue; - } - - ubi_get_volume_info(desc, &vi); - ubi_close_volume(desc); - - ret = ubiblock_create(&vi); - if (ret) { - pr_err( - "UBI: block: can't add '%s' volume on ubi%d_%d, err=%d\n", - vi.name, p->ubi_num, p->vol_id, ret); - continue; - } - } -} - static void ubiblock_remove_all(void) { struct ubiblock *next; @@ -645,18 +656,7 @@ int __init ubiblock_init(void) if (ubiblock_major < 0) return ubiblock_major; - /* - * Attach block devices from 'block=' module param. - * Even if one block device in the param list fails to come up, - * still allow the module to load and leave any others up. - */ - ubiblock_create_from_param(); - - /* - * Block devices are only created upon user requests, so we ignore - * existing volumes. - */ - ret = ubi_register_volume_notifier(&ubiblock_notifier, 1); + ret = ubi_register_volume_notifier(&ubiblock_notifier, 0); if (ret) goto err_unreg; return 0; diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index 5db653eacbd4..fbf3a7fe2af7 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -279,6 +279,41 @@ struct ubi_volume_desc *ubi_open_volume_nm(int ubi_num, const char *name, } EXPORT_SYMBOL_GPL(ubi_open_volume_nm); +/** + * ubi_get_num_by_path - get UBI device and volume number from device path + * @pathname: volume character device node path + * @ubi_num: pointer to UBI device number to be set + * @vol_id: pointer to UBI volume ID to be set + * + * Returns 0 on success and sets ubi_num and vol_id, returns error otherwise. + */ +int ubi_get_num_by_path(const char *pathname, int *ubi_num, int *vol_id) +{ + int error; + struct path path; + struct kstat stat; + + error = kern_path(pathname, LOOKUP_FOLLOW, &path); + if (error) + return error; + + error = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); + path_put(&path); + if (error) + return error; + + if (!S_ISCHR(stat.mode)) + return -EINVAL; + + *ubi_num = ubi_major2num(MAJOR(stat.rdev)); + *vol_id = MINOR(stat.rdev) - 1; + + if (*vol_id < 0 || *ubi_num < 0) + return -ENODEV; + + return 0; +} + /** * ubi_open_volume_path - open UBI volume by its character device node path. * @pathname: volume character device node path @@ -290,32 +325,17 @@ EXPORT_SYMBOL_GPL(ubi_open_volume_nm); struct ubi_volume_desc *ubi_open_volume_path(const char *pathname, int mode) { int error, ubi_num, vol_id; - struct path path; - struct kstat stat; dbg_gen("open volume %s, mode %d", pathname, mode); if (!pathname || !*pathname) return ERR_PTR(-EINVAL); - error = kern_path(pathname, LOOKUP_FOLLOW, &path); - if (error) - return ERR_PTR(error); - - error = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); - path_put(&path); + error = ubi_get_num_by_path(pathname, &ubi_num, &vol_id); if (error) return ERR_PTR(error); - if (!S_ISCHR(stat.mode)) - return ERR_PTR(-EINVAL); - - ubi_num = ubi_major2num(MAJOR(stat.rdev)); - vol_id = MINOR(stat.rdev) - 1; - - if (vol_id >= 0 && ubi_num >= 0) - return ubi_open_volume(ubi_num, vol_id, mode); - return ERR_PTR(-ENODEV); + return ubi_open_volume(ubi_num, vol_id, mode); } EXPORT_SYMBOL_GPL(ubi_open_volume_path); diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 0b42bb45dd84..a588381c50ad 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -955,6 +955,7 @@ void ubi_free_internal_volumes(struct ubi_device *ubi); void ubi_do_get_device_info(struct ubi_device *ubi, struct ubi_device_info *di); void ubi_do_get_volume_info(struct ubi_device *ubi, struct ubi_volume *vol, struct ubi_volume_info *vi); +int ubi_get_num_by_path(const char *pathname, int *ubi_num, int *vol_id); /* scan.c */ int ubi_compare_lebs(struct ubi_device *ubi, const struct ubi_ainf_peb *aeb, int pnum, const struct ubi_vid_hdr *vid_hdr); -- cgit v1.2.3 From 927c145208b04490fb40c5c88a1086b592bfac25 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:33:14 +0000 Subject: mtd: ubi: attach from device tree Introduce device tree compatible 'linux,ubi' and attach compatible MTD devices using the MTD add notifier. This is needed for a UBI device to be available early at boot (and not only after late_initcall), so volumes on them can be used eg. as NVMEM providers for other drivers. Signed-off-by: Daniel Golle Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/build.c | 135 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 96 insertions(+), 39 deletions(-) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 7d4ff1193db6..8c3f763e4ddb 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include "ubi.h" @@ -1219,43 +1220,43 @@ static struct mtd_info * __init open_mtd_device(const char *mtd_dev) return mtd; } -static int __init ubi_init(void) +static void ubi_notify_add(struct mtd_info *mtd) { - int err, i, k; + struct device_node *np = mtd_get_of_node(mtd); + int err; - /* Ensure that EC and VID headers have correct size */ - BUILD_BUG_ON(sizeof(struct ubi_ec_hdr) != 64); - BUILD_BUG_ON(sizeof(struct ubi_vid_hdr) != 64); + if (!of_device_is_compatible(np, "linux,ubi")) + return; - if (mtd_devs > UBI_MAX_DEVICES) { - pr_err("UBI error: too many MTD devices, maximum is %d\n", - UBI_MAX_DEVICES); - return -EINVAL; - } + /* + * we are already holding &mtd_table_mutex, but still need + * to bump refcount + */ + err = __get_mtd_device(mtd); + if (err) + return; - /* Create base sysfs directory and sysfs files */ - err = class_register(&ubi_class); + /* called while holding mtd_table_mutex */ + mutex_lock_nested(&ubi_devices_mutex, SINGLE_DEPTH_NESTING); + err = ubi_attach_mtd_dev(mtd, UBI_DEV_NUM_AUTO, 0, 0, false, false); + mutex_unlock(&ubi_devices_mutex); if (err < 0) - return err; - - err = misc_register(&ubi_ctrl_cdev); - if (err) { - pr_err("UBI error: cannot register device\n"); - goto out; - } + __put_mtd_device(mtd); +} - ubi_wl_entry_slab = kmem_cache_create("ubi_wl_entry_slab", - sizeof(struct ubi_wl_entry), - 0, 0, NULL); - if (!ubi_wl_entry_slab) { - err = -ENOMEM; - goto out_dev_unreg; - } +static void ubi_notify_remove(struct mtd_info *mtd) +{ + /* do nothing for now */ +} - err = ubi_debugfs_init(); - if (err) - goto out_slab; +static struct mtd_notifier ubi_mtd_notifier = { + .add = ubi_notify_add, + .remove = ubi_notify_remove, +}; +static int __init ubi_init_attach(void) +{ + int err, i, k; /* Attach MTD devices */ for (i = 0; i < mtd_devs; i++) { @@ -1304,25 +1305,79 @@ static int __init ubi_init(void) } } + return 0; + +out_detach: + for (k = 0; k < i; k++) + if (ubi_devices[k]) { + mutex_lock(&ubi_devices_mutex); + ubi_detach_mtd_dev(ubi_devices[k]->ubi_num, 1); + mutex_unlock(&ubi_devices_mutex); + } + return err; +} +#ifndef CONFIG_MTD_UBI_MODULE +late_initcall(ubi_init_attach); +#endif + +static int __init ubi_init(void) +{ + int err; + + /* Ensure that EC and VID headers have correct size */ + BUILD_BUG_ON(sizeof(struct ubi_ec_hdr) != 64); + BUILD_BUG_ON(sizeof(struct ubi_vid_hdr) != 64); + + if (mtd_devs > UBI_MAX_DEVICES) { + pr_err("UBI error: too many MTD devices, maximum is %d\n", + UBI_MAX_DEVICES); + return -EINVAL; + } + + /* Create base sysfs directory and sysfs files */ + err = class_register(&ubi_class); + if (err < 0) + return err; + + err = misc_register(&ubi_ctrl_cdev); + if (err) { + pr_err("UBI error: cannot register device\n"); + goto out; + } + + ubi_wl_entry_slab = kmem_cache_create("ubi_wl_entry_slab", + sizeof(struct ubi_wl_entry), + 0, 0, NULL); + if (!ubi_wl_entry_slab) { + err = -ENOMEM; + goto out_dev_unreg; + } + + err = ubi_debugfs_init(); + if (err) + goto out_slab; + err = ubiblock_init(); if (err) { pr_err("UBI error: block: cannot initialize, error %d\n", err); /* See comment above re-ubi_is_module(). */ if (ubi_is_module()) - goto out_detach; + goto out_slab; + } + + register_mtd_user(&ubi_mtd_notifier); + + if (ubi_is_module()) { + err = ubi_init_attach(); + if (err) + goto out_mtd_notifier; } return 0; -out_detach: - for (k = 0; k < i; k++) - if (ubi_devices[k]) { - mutex_lock(&ubi_devices_mutex); - ubi_detach_mtd_dev(ubi_devices[k]->ubi_num, 1); - mutex_unlock(&ubi_devices_mutex); - } - ubi_debugfs_exit(); +out_mtd_notifier: + unregister_mtd_user(&ubi_mtd_notifier); out_slab: kmem_cache_destroy(ubi_wl_entry_slab); out_dev_unreg: @@ -1332,13 +1387,15 @@ out: pr_err("UBI error: cannot initialize UBI, error %d\n", err); return err; } -late_initcall(ubi_init); +device_initcall(ubi_init); + static void __exit ubi_exit(void) { int i; ubiblock_exit(); + unregister_mtd_user(&ubi_mtd_notifier); for (i = 0; i < UBI_MAX_DEVICES; i++) if (ubi_devices[i]) { -- cgit v1.2.3 From 7e84c961b2eb062d2f47037dcca52dcd1d3615b5 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:33:24 +0000 Subject: mtd: ubi: introduce pre-removal notification for UBI volumes Introduce a new notification type UBI_VOLUME_SHUTDOWN to inform users that a volume is just about to be removed. This is needed because users (such as the NVMEM subsystem) expect that at the time their removal function is called, the parenting device is still available (for removal of sysfs nodes, for example, in case of NVMEM which otherwise WARNs on volume removal). Signed-off-by: Daniel Golle Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/build.c | 19 ++++++++++++++----- drivers/mtd/ubi/kapi.c | 2 +- drivers/mtd/ubi/ubi.h | 2 ++ drivers/mtd/ubi/vmt.c | 17 +++++++++++++++-- include/linux/mtd/ubi.h | 2 ++ 5 files changed, 34 insertions(+), 8 deletions(-) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 8c3f763e4ddb..a7e3a6246c0e 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -93,7 +93,7 @@ static struct ubi_device *ubi_devices[UBI_MAX_DEVICES]; /* Serializes UBI devices creations and removals */ DEFINE_MUTEX(ubi_devices_mutex); -/* Protects @ubi_devices and @ubi->ref_count */ +/* Protects @ubi_devices, @ubi->ref_count and @ubi->is_dead */ static DEFINE_SPINLOCK(ubi_devices_lock); /* "Show" method for files in '//class/ubi/' */ @@ -261,6 +261,9 @@ struct ubi_device *ubi_get_device(int ubi_num) spin_lock(&ubi_devices_lock); ubi = ubi_devices[ubi_num]; + if (ubi && ubi->is_dead) + ubi = NULL; + if (ubi) { ubi_assert(ubi->ref_count >= 0); ubi->ref_count += 1; @@ -298,7 +301,7 @@ struct ubi_device *ubi_get_by_major(int major) spin_lock(&ubi_devices_lock); for (i = 0; i < UBI_MAX_DEVICES; i++) { ubi = ubi_devices[i]; - if (ubi && MAJOR(ubi->cdev.dev) == major) { + if (ubi && !ubi->is_dead && MAJOR(ubi->cdev.dev) == major) { ubi_assert(ubi->ref_count >= 0); ubi->ref_count += 1; get_device(&ubi->dev); @@ -327,7 +330,7 @@ int ubi_major2num(int major) for (i = 0; i < UBI_MAX_DEVICES; i++) { struct ubi_device *ubi = ubi_devices[i]; - if (ubi && MAJOR(ubi->cdev.dev) == major) { + if (ubi && !ubi->is_dead && MAJOR(ubi->cdev.dev) == major) { ubi_num = ubi->ubi_num; break; } @@ -514,7 +517,7 @@ static void ubi_free_volumes_from(struct ubi_device *ubi, int from) int i; for (i = from; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) { - if (!ubi->volumes[i]) + if (!ubi->volumes[i] || ubi->volumes[i]->is_dead) continue; ubi_eba_replace_table(ubi->volumes[i], NULL); ubi_fastmap_destroy_checkmap(ubi->volumes[i]); @@ -1099,7 +1102,6 @@ int ubi_detach_mtd_dev(int ubi_num, int anyway) return -EINVAL; spin_lock(&ubi_devices_lock); - put_device(&ubi->dev); ubi->ref_count -= 1; if (ubi->ref_count) { if (!anyway) { @@ -1110,6 +1112,13 @@ int ubi_detach_mtd_dev(int ubi_num, int anyway) ubi_err(ubi, "%s reference count %d, destroy anyway", ubi->ubi_name, ubi->ref_count); } + ubi->is_dead = true; + spin_unlock(&ubi_devices_lock); + + ubi_notify_all(ubi, UBI_VOLUME_SHUTDOWN, NULL); + + spin_lock(&ubi_devices_lock); + put_device(&ubi->dev); ubi_devices[ubi_num] = NULL; spin_unlock(&ubi_devices_lock); diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index fbf3a7fe2af7..f1ea8677467f 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -152,7 +152,7 @@ struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode) spin_lock(&ubi->volumes_lock); vol = ubi->volumes[vol_id]; - if (!vol) + if (!vol || vol->is_dead) goto out_unlock; err = -EBUSY; diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index a588381c50ad..32009a24869e 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -337,6 +337,7 @@ struct ubi_volume { int writers; int exclusive; int metaonly; + bool is_dead; int reserved_pebs; int vol_type; @@ -561,6 +562,7 @@ struct ubi_device { spinlock_t volumes_lock; int ref_count; int image_seq; + bool is_dead; int rsvd_pebs; int avail_pebs; diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 990571287e84..eaf8328f6fc3 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -59,7 +59,7 @@ static ssize_t vol_attribute_show(struct device *dev, struct ubi_device *ubi = vol->ubi; spin_lock(&ubi->volumes_lock); - if (!ubi->volumes[vol->vol_id]) { + if (!ubi->volumes[vol->vol_id] || ubi->volumes[vol->vol_id]->is_dead) { spin_unlock(&ubi->volumes_lock); return -ENODEV; } @@ -189,7 +189,7 @@ int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req) /* Ensure that the name is unique */ for (i = 0; i < ubi->vtbl_slots; i++) - if (ubi->volumes[i] && + if (ubi->volumes[i] && !ubi->volumes[i]->is_dead && ubi->volumes[i]->name_len == req->name_len && !strcmp(ubi->volumes[i]->name, req->name)) { ubi_err(ubi, "volume \"%s\" exists (ID %d)", @@ -352,6 +352,19 @@ int ubi_remove_volume(struct ubi_volume_desc *desc, int no_vtbl) err = -EBUSY; goto out_unlock; } + + /* + * Mark volume as dead at this point to prevent that anyone + * can take a reference to the volume from now on. + * This is necessary as we have to release the spinlock before + * calling ubi_volume_notify. + */ + vol->is_dead = true; + spin_unlock(&ubi->volumes_lock); + + ubi_volume_notify(ubi, vol, UBI_VOLUME_SHUTDOWN); + + spin_lock(&ubi->volumes_lock); ubi->volumes[vol_id] = NULL; spin_unlock(&ubi->volumes_lock); diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h index a529347fd75b..562f92504f2b 100644 --- a/include/linux/mtd/ubi.h +++ b/include/linux/mtd/ubi.h @@ -192,6 +192,7 @@ struct ubi_device_info { * or a volume was removed) * @UBI_VOLUME_RESIZED: a volume has been re-sized * @UBI_VOLUME_RENAMED: a volume has been re-named + * @UBI_VOLUME_SHUTDOWN: a volume is going to removed, shutdown users * @UBI_VOLUME_UPDATED: data has been written to a volume * * These constants define which type of event has happened when a volume @@ -202,6 +203,7 @@ enum { UBI_VOLUME_REMOVED, UBI_VOLUME_RESIZED, UBI_VOLUME_RENAMED, + UBI_VOLUME_SHUTDOWN, UBI_VOLUME_UPDATED, }; -- cgit v1.2.3 From 51932f9fc4871619e93abf4de6f1282ec23d936c Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:33:38 +0000 Subject: mtd: ubi: populate ubi volume fwnode Look for the 'volumes' subnode of an MTD partition attached to a UBI device and attach matching child nodes to UBI volumes. This allows UBI volumes to be referenced in device tree, e.g. for use as NVMEM providers. Signed-off-by: Daniel Golle Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/vmt.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index eaf8328f6fc3..5a3558bbb903 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -124,6 +124,31 @@ static void vol_release(struct device *dev) kfree(vol); } +static struct fwnode_handle *find_volume_fwnode(struct ubi_volume *vol) +{ + struct fwnode_handle *fw_vols, *fw_vol; + const char *volname; + u32 volid; + + fw_vols = device_get_named_child_node(vol->dev.parent->parent, "volumes"); + if (!fw_vols) + return NULL; + + fwnode_for_each_child_node(fw_vols, fw_vol) { + if (!fwnode_property_read_string(fw_vol, "volname", &volname) && + strncmp(volname, vol->name, vol->name_len)) + continue; + + if (!fwnode_property_read_u32(fw_vol, "volid", &volid) && + vol->vol_id != volid) + continue; + + return fw_vol; + } + + return NULL; +} + /** * ubi_create_volume - create volume. * @ubi: UBI device description object @@ -223,6 +248,7 @@ int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req) vol->name_len = req->name_len; memcpy(vol->name, req->name, vol->name_len); vol->ubi = ubi; + device_set_node(&vol->dev, find_volume_fwnode(vol)); /* * Finish all pending erases because there may be some LEBs belonging @@ -614,6 +640,7 @@ int ubi_add_volume(struct ubi_device *ubi, struct ubi_volume *vol) vol->dev.class = &ubi_class; vol->dev.groups = volume_dev_groups; dev_set_name(&vol->dev, "%s_%d", ubi->ubi_name, vol->vol_id); + device_set_node(&vol->dev, find_volume_fwnode(vol)); err = device_register(&vol->dev); if (err) { cdev_del(&vol->cdev); -- cgit v1.2.3 From 3ce485803da1b79b2692b6d0c2792829292ad838 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:33:48 +0000 Subject: mtd: ubi: provide NVMEM layer over UBI volumes In an ideal world we would like UBI to be used where ever possible on a NAND chip. And with UBI support in ARM Trusted Firmware and U-Boot it is possible to achieve an (almost-)all-UBI flash layout. Hence the need for a way to also use UBI volumes to store board-level constants, such as MAC addresses and calibration data of wireless interfaces. Add UBI volume NVMEM driver module exposing UBI volumes as NVMEM providers. Allow UBI devices to have a "volumes" firmware subnode with volumes which may be compatible with "nvmem-cells". Access to UBI volumes via the NVMEM interface at this point is read-only, and it is slow, opening and closing the UBI volume for each access due to limitations of the NVMEM provider API. Signed-off-by: Daniel Golle Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/Kconfig | 13 ++++ drivers/mtd/ubi/Makefile | 1 + drivers/mtd/ubi/nvmem.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 202 insertions(+) create mode 100644 drivers/mtd/ubi/nvmem.c diff --git a/drivers/mtd/ubi/Kconfig b/drivers/mtd/ubi/Kconfig index 7499a540121e..e28a3af83c0e 100644 --- a/drivers/mtd/ubi/Kconfig +++ b/drivers/mtd/ubi/Kconfig @@ -113,4 +113,17 @@ config MTD_UBI_FAULT_INJECTION testing purposes. If in doubt, say "N". + +config MTD_UBI_NVMEM + tristate "UBI virtual NVMEM" + default n + depends on NVMEM + help + This option enabled an additional driver exposing UBI volumes as NVMEM + providers, intended for platforms where UBI is part of the firmware + specification and used to store also e.g. MAC addresses or board- + specific Wi-Fi calibration data. + + If in doubt, say "N". + endif # MTD_UBI diff --git a/drivers/mtd/ubi/Makefile b/drivers/mtd/ubi/Makefile index 543673605ca7..4b51aaf00d1a 100644 --- a/drivers/mtd/ubi/Makefile +++ b/drivers/mtd/ubi/Makefile @@ -7,3 +7,4 @@ ubi-$(CONFIG_MTD_UBI_FASTMAP) += fastmap.o ubi-$(CONFIG_MTD_UBI_BLOCK) += block.o obj-$(CONFIG_MTD_UBI_GLUEBI) += gluebi.o +obj-$(CONFIG_MTD_UBI_NVMEM) += nvmem.o diff --git a/drivers/mtd/ubi/nvmem.c b/drivers/mtd/ubi/nvmem.c new file mode 100644 index 000000000000..b7a93c495d17 --- /dev/null +++ b/drivers/mtd/ubi/nvmem.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023 Daniel Golle + */ + +/* UBI NVMEM provider */ +#include "ubi.h" +#include +#include + +/* List of all NVMEM devices */ +static LIST_HEAD(nvmem_devices); +static DEFINE_MUTEX(devices_mutex); + +struct ubi_nvmem { + struct nvmem_device *nvmem; + int ubi_num; + int vol_id; + int usable_leb_size; + struct list_head list; +}; + +static int ubi_nvmem_reg_read(void *priv, unsigned int from, + void *val, size_t bytes) +{ + int err = 0, lnum = from, offs, bytes_left = bytes, to_read; + struct ubi_nvmem *unv = priv; + struct ubi_volume_desc *desc; + + desc = ubi_open_volume(unv->ubi_num, unv->vol_id, UBI_READONLY); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + offs = do_div(lnum, unv->usable_leb_size); + while (bytes_left) { + to_read = unv->usable_leb_size - offs; + + if (to_read > bytes_left) + to_read = bytes_left; + + err = ubi_read(desc, lnum, val, offs, to_read); + if (err) + break; + + lnum += 1; + offs = 0; + bytes_left -= to_read; + val += to_read; + } + ubi_close_volume(desc); + + if (err) + return err; + + return bytes_left == 0 ? 0 : -EIO; +} + +static int ubi_nvmem_add(struct ubi_volume_info *vi) +{ + struct device_node *np = dev_of_node(vi->dev); + struct nvmem_config config = {}; + struct ubi_nvmem *unv; + int ret; + + if (!np) + return 0; + + if (!of_get_child_by_name(np, "nvmem-layout")) + return 0; + + if (WARN_ON_ONCE(vi->usable_leb_size <= 0) || + WARN_ON_ONCE(vi->size <= 0)) + return -EINVAL; + + unv = kzalloc(sizeof(struct ubi_nvmem), GFP_KERNEL); + if (!unv) + return -ENOMEM; + + config.id = NVMEM_DEVID_NONE; + config.dev = vi->dev; + config.name = dev_name(vi->dev); + config.owner = THIS_MODULE; + config.priv = unv; + config.reg_read = ubi_nvmem_reg_read; + config.size = vi->usable_leb_size * vi->size; + config.word_size = 1; + config.stride = 1; + config.read_only = true; + config.root_only = true; + config.ignore_wp = true; + config.of_node = np; + + unv->ubi_num = vi->ubi_num; + unv->vol_id = vi->vol_id; + unv->usable_leb_size = vi->usable_leb_size; + unv->nvmem = nvmem_register(&config); + if (IS_ERR(unv->nvmem)) { + ret = dev_err_probe(vi->dev, PTR_ERR(unv->nvmem), + "Failed to register NVMEM device\n"); + kfree(unv); + return ret; + } + + mutex_lock(&devices_mutex); + list_add_tail(&unv->list, &nvmem_devices); + mutex_unlock(&devices_mutex); + + return 0; +} + +static void ubi_nvmem_remove(struct ubi_volume_info *vi) +{ + struct ubi_nvmem *unv_c, *unv = NULL; + + mutex_lock(&devices_mutex); + list_for_each_entry(unv_c, &nvmem_devices, list) + if (unv_c->ubi_num == vi->ubi_num && unv_c->vol_id == vi->vol_id) { + unv = unv_c; + break; + } + + if (!unv) { + mutex_unlock(&devices_mutex); + return; + } + + list_del(&unv->list); + mutex_unlock(&devices_mutex); + nvmem_unregister(unv->nvmem); + kfree(unv); +} + +/** + * nvmem_notify - UBI notification handler. + * @nb: registered notifier block + * @l: notification type + * @ns_ptr: pointer to the &struct ubi_notification object + */ +static int nvmem_notify(struct notifier_block *nb, unsigned long l, + void *ns_ptr) +{ + struct ubi_notification *nt = ns_ptr; + + switch (l) { + case UBI_VOLUME_RESIZED: + ubi_nvmem_remove(&nt->vi); + fallthrough; + case UBI_VOLUME_ADDED: + ubi_nvmem_add(&nt->vi); + break; + case UBI_VOLUME_SHUTDOWN: + ubi_nvmem_remove(&nt->vi); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block nvmem_notifier = { + .notifier_call = nvmem_notify, +}; + +static int __init ubi_nvmem_init(void) +{ + return ubi_register_volume_notifier(&nvmem_notifier, 0); +} + +static void __exit ubi_nvmem_exit(void) +{ + struct ubi_nvmem *unv, *tmp; + + mutex_lock(&devices_mutex); + list_for_each_entry_safe(unv, tmp, &nvmem_devices, list) { + nvmem_unregister(unv->nvmem); + list_del(&unv->list); + kfree(unv); + } + mutex_unlock(&devices_mutex); + + ubi_unregister_volume_notifier(&nvmem_notifier); +} + +module_init(ubi_nvmem_init); +module_exit(ubi_nvmem_exit); +MODULE_DESCRIPTION("NVMEM layer over UBI volumes"); +MODULE_AUTHOR("Daniel Golle"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From b8a77b9a5f9c2ba313f2beef8440b6f9f69768e7 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Thu, 29 Feb 2024 03:47:24 +0000 Subject: mtd: ubi: fix NVMEM over UBI volumes on 32-bit systems A compiler warning related to sizeof(int) != 8 when calling do_div() is triggered when building on 32-bit platforms. Address this by using integer types having a well-defined size. Fixes: 3ce485803da1 ("mtd: ubi: provide NVMEM layer over UBI volumes") Signed-off-by: Daniel Golle Reviewed-by: Zhihao Cheng Tested-by: Randy Dunlap Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/nvmem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/ubi/nvmem.c b/drivers/mtd/ubi/nvmem.c index b7a93c495d17..8aeb9c428e51 100644 --- a/drivers/mtd/ubi/nvmem.c +++ b/drivers/mtd/ubi/nvmem.c @@ -23,9 +23,12 @@ struct ubi_nvmem { static int ubi_nvmem_reg_read(void *priv, unsigned int from, void *val, size_t bytes) { - int err = 0, lnum = from, offs, bytes_left = bytes, to_read; + size_t to_read, bytes_left = bytes; struct ubi_nvmem *unv = priv; struct ubi_volume_desc *desc; + uint32_t offs; + uint64_t lnum = from; + int err = 0; desc = ubi_open_volume(unv->ubi_num, unv->vol_id, UBI_READONLY); if (IS_ERR(desc)) -- cgit v1.2.3