294 files changed, 4304 insertions, 31610 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 89fdbefd1075..4bc7dd420874 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -162,7 +162,6 @@ menu "DOS/FAT/EXFAT/NT Filesystems"
 
 source "fs/fat/Kconfig"
 source "fs/exfat/Kconfig"
-source "fs/ntfs/Kconfig"
 source "fs/ntfs3/Kconfig"
 
 endmenu
@@ -174,6 +173,13 @@ source "fs/proc/Kconfig"
 source "fs/kernfs/Kconfig"
 source "fs/sysfs/Kconfig"
 
+config FS_PID
+	bool "Pseudo filesystem for process file descriptors"
+	depends on 64BIT
+	default y
+	help
+	  Pidfs implements advanced features for process file descriptors.
+
 config TMPFS
 	bool "Tmpfs virtual memory file system support (former shm fs)"
 	depends on SHMEM
diff --git a/fs/Makefile b/fs/Makefile
index c09016257f05..6ecc9b0a53f2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -15,7 +15,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		pnode.o splice.o sync.o utimes.o d_path.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
 		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
-		kernel_read_file.o mnt_idmapping.o remap_range.o
+		kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
 
 obj-$(CONFIG_BUFFER_HEAD)	+= buffer.o mpage.o
 obj-$(CONFIG_PROC_FS)		+= proc_namespace.o
@@ -91,7 +91,6 @@ obj-y				+= unicode/
 obj-$(CONFIG_SYSV_FS)		+= sysv/
 obj-$(CONFIG_SMBFS)		+= smb/
 obj-$(CONFIG_HPFS_FS)		+= hpfs/
-obj-$(CONFIG_NTFS_FS)		+= ntfs/
 obj-$(CONFIG_NTFS3_FS)		+= ntfs3/
 obj-$(CONFIG_UFS_FS)		+= ufs/
 obj-$(CONFIG_EFS_FS)		+= efs/
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 60685ec76d98..2e612834329a 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -105,6 +105,7 @@ struct affs_sb_info {
 	int work_queued;		/* non-zero delayed work is queued */
 	struct delayed_work sb_work;	/* superblock flush delayed work */
 	spinlock_t work_lock;		/* protects sb_work and work_queued */
+	struct rcu_head rcu;
 };
 
 #define AFFS_MOUNT_SF_INTL		0x0001 /* International filesystem. */
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 58b391446ae1..b56a95cf414a 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -640,7 +640,7 @@ static void affs_kill_sb(struct super_block *sb)
 		affs_brelse(sbi->s_root_bh);
 		kfree(sbi->s_prefix);
 		mutex_destroy(&sbi->s_bmlock);
-		kfree(sbi);
+		kfree_rcu(sbi, rcu);
 	}
 }
 
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index c14533ef108f..8a67fc427e74 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -124,7 +124,7 @@ static void afs_dir_read_cleanup(struct afs_read *req)
 		if (xas_retry(&xas, folio))
 			continue;
 		BUG_ON(xa_is_value(folio));
-		ASSERTCMP(folio_file_mapping(folio), ==, mapping);
+		ASSERTCMP(folio->mapping, ==, mapping);
 
 		folio_put(folio);
 	}
@@ -202,12 +202,12 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
 		if (xas_retry(&xas, folio))
 			continue;
 
-		BUG_ON(folio_file_mapping(folio) != mapping);
+		BUG_ON(folio->mapping != mapping);
 
 		size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio));
 		for (offset = 0; offset < size; offset += sizeof(*block)) {
 			block = kmap_local_folio(folio, offset);
-			pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block);
+			pr_warn("[%02lx] %32phN\n", folio->index + offset, block);
 			kunmap_local(block);
 		}
 	}
@@ -233,7 +233,7 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
 		if (xas_retry(&xas, folio))
 			continue;
 
-		BUG_ON(folio_file_mapping(folio) != mapping);
+		BUG_ON(folio->mapping != mapping);
 
 		if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) {
 			afs_dir_dump(dvnode, req);
@@ -474,6 +474,16 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
 			continue;
 		}
 
+		/* Don't expose silly rename entries to userspace. */
+		if (nlen > 6 &&
+		    dire->u.name[0] == '.' &&
+		    ctx->actor != afs_lookup_filldir &&
+		    ctx->actor != afs_lookup_one_filldir &&
+		    memcmp(dire->u.name, ".__afs", 6) == 0) {
+			ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
+			continue;
+		}
+
 		/* found the next entry */
 		if (!dir_emit(ctx, dire->u.name, nlen,
 			      ntohl(dire->u.vnode),
@@ -708,6 +718,8 @@ static void afs_do_lookup_success(struct afs_operation *op)
 			break;
 		}
 
+		if (vp->scb.status.abort_code)
+			trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code);
 		if (!vp->scb.have_status && !vp->scb.have_error)
 			continue;
 
@@ -897,12 +909,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
 		afs_begin_vnode_operation(op);
 		afs_wait_for_operation(op);
 	}
-	inode = ERR_PTR(afs_op_error(op));
 
 out_op:
 	if (!afs_op_error(op)) {
-		inode = &op->file[1].vnode->netfs.inode;
-		op->file[1].vnode = NULL;
+		if (op->file[1].scb.status.abort_code) {
+			afs_op_accumulate_error(op, -ECONNABORTED,
+						op->file[1].scb.status.abort_code);
+		} else {
+			inode = &op->file[1].vnode->netfs.inode;
+			op->file[1].vnode = NULL;
+		}
 	}
 
 	if (op->file[0].scb.have_status)
@@ -2022,7 +2038,7 @@ static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags)
 {
 	struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
 
-	_enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio));
+	_enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index);
 
 	folio_detach_private(folio);
 
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index d3bc4a2d7085..c4d2711e20ad 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -258,16 +258,7 @@ const struct inode_operations afs_dynroot_inode_operations = {
 	.lookup		= afs_dynroot_lookup,
 };
 
-/*
- * Dirs in the dynamic root don't need revalidation.
- */
-static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags)
-{
-	return 1;
-}
-
 const struct dentry_operations afs_dynroot_dentry_operations = {
-	.d_revalidate	= afs_dynroot_d_revalidate,
 	.d_delete	= always_delete_dentry,
 	.d_release	= afs_d_release,
 	.d_automount	= afs_d_automount,
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 3d33b221d9ca..ef2cc8f565d2 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -417,13 +417,17 @@ static void afs_add_open_mmap(struct afs_vnode *vnode)
 
 static void afs_drop_open_mmap(struct afs_vnode *vnode)
 {
-	if (!atomic_dec_and_test(&vnode->cb_nr_mmap))
+	if (atomic_add_unless(&vnode->cb_nr_mmap, -1, 1))
 		return;
 
 	down_write(&vnode->volume->open_mmaps_lock);
 
-	if (atomic_read(&vnode->cb_nr_mmap) == 0)
+	read_seqlock_excl(&vnode->cb_lock);
+	// This is the only place where ->cb_nr_mmap may hit 0;
+	// see __afs_break_callback() for the other side.
+	if (atomic_dec_and_test(&vnode->cb_nr_mmap))
 		list_del_init(&vnode->cb_mmap_link);
+	read_sequnlock_excl(&vnode->cb_lock);
 
 	up_write(&vnode->volume->open_mmaps_lock);
 	flush_work(&vnode->cb_work);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 9c03fcf7ffaa..6ce5a612937c 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -321,8 +321,7 @@ struct afs_net {
 	struct list_head	fs_probe_slow;	/* List of afs_server to probe at 5m intervals */
 	struct hlist_head	fs_proc;	/* procfs servers list */
 
-	struct hlist_head	fs_addresses4;	/* afs_server (by lowest IPv4 addr) */
-	struct hlist_head	fs_addresses6;	/* afs_server (by lowest IPv6 addr) */
+	struct hlist_head	fs_addresses;	/* List of afs_server that have addresses */
 	seqlock_t		fs_addr_lock;	/* For fs_addresses[46] */
 
 	struct work_struct	fs_manager;
@@ -561,8 +560,7 @@ struct afs_server {
 	struct afs_server __rcu	*uuid_next;	/* Next server with same UUID */
 	struct afs_server	*uuid_prev;	/* Previous server with same UUID */
 	struct list_head	probe_link;	/* Link in net->fs_probe_list */
-	struct hlist_node	addr4_link;	/* Link in net->fs_addresses4 */
-	struct hlist_node	addr6_link;	/* Link in net->fs_addresses6 */
+	struct hlist_node	addr_link;	/* Link in net->fs_addresses */
 	struct hlist_node	proc_link;	/* Link in net->fs_proc */
 	struct list_head	volumes;	/* RCU list of afs_server_entry objects */
 	struct afs_server	*gc_next;	/* Next server in manager's list */
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 1b3bd21c168a..a14f6013e316 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -90,8 +90,7 @@ static int __net_init afs_net_init(struct net *net_ns)
 	INIT_LIST_HEAD(&net->fs_probe_slow);
 	INIT_HLIST_HEAD(&net->fs_proc);
 
-	INIT_HLIST_HEAD(&net->fs_addresses4);
-	INIT_HLIST_HEAD(&net->fs_addresses6);
+	INIT_HLIST_HEAD(&net->fs_addresses);
 	seqlock_init(&net->fs_addr_lock);
 
 	INIT_WORK(&net->fs_manager, afs_manage_servers);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 3bd02571f30d..15eab053af6d 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -166,7 +166,7 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v)
 
 	if (!preflist) {
 		seq_puts(m, "NO PREFS\n");
-		return 0;
+		goto out;
 	}
 
 	seq_printf(m, "PROT SUBNET                                      PRIOR (v=%u n=%u/%u/%u)\n",
@@ -191,7 +191,8 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v)
 		}
 	}
 
-	rcu_read_lock();
+out:
+	rcu_read_unlock();
 	return 0;
 }
 
diff --git a/fs/afs/server.c b/fs/afs/server.c
index e169121f603e..038f9d0ae3af 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -38,7 +38,7 @@ struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer
 		seq++; /* 2 on the 1st/lockless path, otherwise odd */
 		read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
 
-		hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
+		hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) {
 			estate = rcu_dereference(server->endpoint_state);
 			alist = estate->addresses;
 			for (i = 0; i < alist->nr_addrs; i++)
@@ -177,10 +177,8 @@ added_dup:
 	 * bit, but anything we might want to do gets messy and memory
 	 * intensive.
 	 */
-	if (alist->nr_ipv4 > 0)
-		hlist_add_head_rcu(&server->addr4_link, &net->fs_addresses4);
-	if (alist->nr_addrs > alist->nr_ipv4)
-		hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6);
+	if (alist->nr_addrs > 0)
+		hlist_add_head_rcu(&server->addr_link, &net->fs_addresses);
 
 	write_sequnlock(&net->fs_addr_lock);
 
@@ -511,10 +509,8 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
 
 			list_del(&server->probe_link);
 			hlist_del_rcu(&server->proc_link);
-			if (!hlist_unhashed(&server->addr4_link))
-				hlist_del_rcu(&server->addr4_link);
-			if (!hlist_unhashed(&server->addr6_link))
-				hlist_del_rcu(&server->addr6_link);
+			if (!hlist_unhashed(&server->addr_link))
+				hlist_del_rcu(&server->addr_link);
 		}
 		write_sequnlock(&net->fs_lock);
 
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 020ecd45e476..af3a3f57c1b3 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -353,7 +353,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
 {
 	struct afs_server_list *new, *old, *discard;
 	struct afs_vldb_entry *vldb;
-	char idbuf[16];
+	char idbuf[24];
 	int ret, idsz;
 
 	_enter("");
@@ -361,7 +361,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
 	/* We look up an ID by passing it as a decimal string in the
 	 * operation's name parameter.
 	 */
-	idsz = sprintf(idbuf, "%llu", volume->vid);
+	idsz = snprintf(idbuf, sizeof(idbuf), "%llu", volume->vid);
 
 	vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz);
 	if (IS_ERR(vldb)) {
diff --git a/fs/aio.c b/fs/aio.c
index bb2ff48991f3..9cdaa2faa536 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -589,13 +589,24 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 
 void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 {
-	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
-	struct kioctx *ctx = req->ki_ctx;
+	struct aio_kiocb *req;
+	struct kioctx *ctx;
 	unsigned long flags;
 
+	/*
+	 * kiocb didn't come from aio or is neither a read nor a write, hence
+	 * ignore it.
+	 */
+	if (!(iocb->ki_flags & IOCB_AIO_RW))
+		return;
+
+	req = container_of(iocb, struct aio_kiocb, rw);
+
 	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
 		return;
 
+	ctx = req->ki_ctx;
+
 	spin_lock_irqsave(&ctx->ctx_lock, flags);
 	list_add_tail(&req->ki_list, &ctx->active_reqs);
 	req->ki_cancel = cancel;
@@ -1509,7 +1520,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	req->ki_complete = aio_complete_rw;
 	req->private = NULL;
 	req->ki_pos = iocb->aio_offset;
-	req->ki_flags = req->ki_filp->f_iocb_flags;
+	req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
 	if (iocb->aio_flags & IOCB_FLAG_RESFD)
 		req->ki_flags |= IOCB_EVENTFD;
 	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
diff --git a/fs/attr.c b/fs/attr.c
index 5a13f0c8495f..49d23b5dbab4 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -352,7 +352,7 @@ int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
 EXPORT_SYMBOL(may_setattr);
 
 /**
- * notify_change - modify attributes of a filesytem object
+ * notify_change - modify attributes of a filesystem object
  * @idmap:	idmap of the mount the inode was found from
  * @dentry:	object affected
  * @attr:	new attributes
diff --git a/fs/backing-file.c b/fs/backing-file.c
index a681f38d84d8..740185198db3 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -325,9 +325,7 @@ EXPORT_SYMBOL_GPL(backing_file_mmap);
 
 static int __init backing_aio_init(void)
 {
-	backing_aio_cachep = kmem_cache_create("backing_aio",
-					       sizeof(struct backing_aio),
-					       0, SLAB_HWCACHE_ALIGN, NULL);
+	backing_aio_cachep = KMEM_CACHE(backing_aio, SLAB_HWCACHE_ALIGN);
 	if (!backing_aio_cachep)
 		return -ENOMEM;
 
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 10704f2d3af5..fd3e175d8342 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -1715,7 +1715,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
 		 * This works without any other locks because this is the only
 		 * thread that removes items from the need_discard tree
 		 */
-		bch2_trans_unlock(trans);
+		bch2_trans_unlock_long(trans);
 		blkdev_issue_discard(ca->disk_sb.bdev,
 				     k.k->p.offset * ca->mi.bucket_size,
 				     ca->mi.bucket_size,
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index b4dc319bcb2b..569b97904da4 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -68,9 +68,11 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer
 
 void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
 {
-	prt_str(out, "bucket=");
-	bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
-	prt_str(out, " ");
+	if (bch2_dev_exists2(c, k.k->p.inode)) {
+		prt_str(out, "bucket=");
+		bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
+		prt_str(out, " ");
+	}
 
 	bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
 }
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index b80c6c9efd8c..69d0d60d50e3 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -1249,6 +1249,18 @@ static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
 	return stdio;
 }
 
+static inline unsigned metadata_replicas_required(struct bch_fs *c)
+{
+	return min(c->opts.metadata_replicas,
+		   c->opts.metadata_replicas_required);
+}
+
+static inline unsigned data_replicas_required(struct bch_fs *c)
+{
+	return min(c->opts.data_replicas,
+		   c->opts.data_replicas_required);
+}
+
 #define BKEY_PADDED_ONSTACK(key, pad)				\
 	struct { struct bkey_i key; __u64 key ## _pad[pad]; }
 
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 5467a8635be1..3ef338df82f5 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2156,7 +2156,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
 		 * isn't monotonically increasing before FILTER_SNAPSHOTS, and
 		 * that's what we check against in extents mode:
 		 */
-		if (k.k->p.inode > end.inode)
+		if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
+			     ? bkey_gt(k.k->p, end)
+			     : k.k->p.inode > end.inode))
 			goto end;
 
 		if (iter->update_path &&
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index bed75c93c069..684397442338 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -92,7 +92,7 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
 			continue;
 
 		bch2_btree_trans_to_text(out, i->trans);
-		bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1);
+		bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
 	}
 }
 
@@ -227,7 +227,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
 			prt_printf(&buf, "backtrace:");
 			prt_newline(&buf);
 			printbuf_indent_add(&buf, 2);
-			bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2);
+			bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
 			printbuf_indent_sub(&buf, 2);
 			prt_newline(&buf);
 		}
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 17a5938aa71a..4530b14ff2c3 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -280,7 +280,8 @@ retry:
 				      writepoint_ptr(&c->btree_write_point),
 				      &devs_have,
 				      res->nr_replicas,
-				      c->opts.metadata_replicas_required,
+				      min(res->nr_replicas,
+					  c->opts.metadata_replicas_required),
 				      watermark, 0, cl, &wp);
 	if (unlikely(ret))
 		return ERR_PTR(ret);
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index cadda9bbe4a4..7bdba8507fc9 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -627,7 +627,7 @@ restart:
 		prt_printf(&i->buf, "backtrace:");
 		prt_newline(&i->buf);
 		printbuf_indent_add(&i->buf, 2);
-		bch2_prt_task_backtrace(&i->buf, task, 0);
+		bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
 		printbuf_indent_sub(&i->buf, 2);
 		prt_newline(&i->buf);
 
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 73c12e565af5..27710cdd5710 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -303,18 +303,6 @@ void bch2_readahead(struct readahead_control *ractl)
 	darray_exit(&readpages_iter.folios);
 }
 
-static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
-			     subvol_inum inum, struct folio *folio)
-{
-	bch2_folio_create(folio, __GFP_NOFAIL);
-
-	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
-	rbio->bio.bi_iter.bi_sector = folio_sector(folio);
-	BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
-	bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0));
-}
-
 static void bch2_read_single_folio_end_io(struct bio *bio)
 {
 	complete(bio->bi_private);
@@ -329,6 +317,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
 	int ret;
 	DECLARE_COMPLETION_ONSTACK(done);
 
+	if (!bch2_folio_create(folio, GFP_KERNEL))
+		return -ENOMEM;
+
 	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
 
 	rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
@@ -336,7 +327,11 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
 	rbio->bio.bi_private = &done;
 	rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
 
-	__bchfs_readfolio(c, rbio, inode_inum(inode), folio);
+	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
+	rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+	BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+	bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
 	wait_for_completion(&done);
 
 	ret = blk_status_to_errno(rbio->bio.bi_status);
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index e3b219e19e10..33cb6da3a5ad 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -88,6 +88,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
 		return ret;
 
 	shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+	if (shorten >= iter->count)
+		shorten = 0;
 	iter->count -= shorten;
 
 	bio = bio_alloc_bioset(NULL,
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index dc52918d06ef..8c70123b6a0c 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -79,7 +79,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
 			continue;
 
 		bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
-						    REQ_OP_FLUSH,
+						    REQ_OP_WRITE|REQ_PREFLUSH,
 						    GFP_KERNEL,
 						    &c->nocow_flush_bioset),
 				   struct nocow_flush, bio);
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 3a4c24c28e7f..3dc8630ff9fe 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -455,6 +455,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
 	if (IS_ERR(victim))
 		return PTR_ERR(victim);
 
+	dir = d_inode(path.dentry);
 	if (victim->d_sb->s_fs_info != c) {
 		ret = -EXDEV;
 		goto err;
@@ -463,14 +464,13 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
 		ret = -ENOENT;
 		goto err;
 	}
-	dir = d_inode(path.dentry);
 	ret = __bch2_unlink(dir, victim, true);
 	if (!ret) {
 		fsnotify_rmdir(dir, victim);
 		d_delete(victim);
 	}
-	inode_unlock(dir);
 err:
+	inode_unlock(dir);
 	dput(victim);
 	path_put(&path);
 	return ret;
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index ec419b8e2c43..77ae65542db9 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -435,7 +435,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
 		bch2_subvol_is_ro(c, inode->ei_subvol) ?:
 		__bch2_link(c, inode, dir, dentry);
 	if (unlikely(ret))
-		return ret;
+		return bch2_err_class(ret);
 
 	ihold(&inode->v);
 	d_instantiate(dentry, &inode->v);
@@ -487,8 +487,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
 	struct bch_inode_info *dir= to_bch_ei(vdir);
 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
 
-	return bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+	int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
 		__bch2_unlink(vdir, dentry, false);
+	return bch2_err_class(ret);
 }
 
 static int bch2_symlink(struct mnt_idmap *idmap,
@@ -523,7 +524,7 @@ static int bch2_symlink(struct mnt_idmap *idmap,
 	return 0;
 err:
 	iput(&inode->v);
-	return ret;
+	return bch2_err_class(ret);
 }
 
 static int bch2_mkdir(struct mnt_idmap *idmap,
@@ -641,7 +642,7 @@ err:
 			   src_inode,
 			   dst_inode);
 
-	return ret;
+	return bch2_err_class(ret);
 }
 
 static void bch2_setattr_copy(struct mnt_idmap *idmap,
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 4f0ecd605675..6a760777bafb 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -119,22 +119,19 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
 	if (!ret)
 		*snapshot = iter.pos.snapshot;
 err:
-	bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
 
-static int __lookup_dirent(struct btree_trans *trans,
+static int lookup_dirent_in_snapshot(struct btree_trans *trans,
 			   struct bch_hash_info hash_info,
 			   subvol_inum dir, struct qstr *name,
-			   u64 *target, unsigned *type)
+			   u64 *target, unsigned *type, u32 snapshot)
 {
 	struct btree_iter iter;
 	struct bkey_s_c_dirent d;
-	int ret;
-
-	ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
-			       &hash_info, dir, name, 0);
+	int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
+			       &hash_info, dir, name, 0, snapshot);
 	if (ret)
 		return ret;
 
@@ -225,15 +222,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
 
 	struct bch_inode_unpacked root_inode;
 	struct bch_hash_info root_hash_info;
-	ret = lookup_inode(trans, root_inum.inum, &root_inode, &snapshot);
+	u32 root_inode_snapshot = snapshot;
+	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
 	bch_err_msg(c, ret, "looking up root inode");
 	if (ret)
 		return ret;
 
 	root_hash_info = bch2_hash_info_init(c, &root_inode);
 
-	ret = __lookup_dirent(trans, root_hash_info, root_inum,
-			      &lostfound_str, &inum, &d_type);
+	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
+			      &lostfound_str, &inum, &d_type, snapshot);
 	if (bch2_err_matches(ret, ENOENT))
 		goto create_lostfound;
 
@@ -250,7 +248,10 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
 	 * The bch2_check_dirents pass has already run, dangling dirents
 	 * shouldn't exist here:
 	 */
-	return lookup_inode(trans, inum, lostfound, &snapshot);
+	ret = lookup_inode(trans, inum, lostfound, &snapshot);
+	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
+		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
+	return ret;
 
 create_lostfound:
 	/*
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index ef3a53f9045a..2c098ac017b3 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -1564,6 +1564,7 @@ CLOSURE_CALLBACK(bch2_write)
 	BUG_ON(!op->write_point.v);
 	BUG_ON(bkey_eq(op->pos, POS_MAX));
 
+	op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
 	op->start_time = local_clock();
 	bch2_keylist_init(&op->insert_keys, op->inline_keys);
 	wbio_init(bio)->put_bio = false;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index d71d26e39521..bc890776eb57 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -233,7 +233,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
 		prt_str(&pbuf, "entry size: ");
 		prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
 		prt_newline(&pbuf);
-		bch2_prt_task_backtrace(&pbuf, current, 1);
+		bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT);
 		trace_journal_entry_close(c, pbuf.buf);
 		printbuf_exit(&pbuf);
 	}
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 04a1e79a5ed3..47805193f18c 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1478,6 +1478,8 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
 		c->opts.foreground_target;
 	unsigned i, replicas = 0, replicas_want =
 		READ_ONCE(c->opts.metadata_replicas);
+	unsigned replicas_need = min_t(unsigned, replicas_want,
+				       READ_ONCE(c->opts.metadata_replicas_required));
 
 	rcu_read_lock();
 retry:
@@ -1526,7 +1528,7 @@ done:
 
 	BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
 
-	return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
+	return replicas >= replicas_need ? 0 : -EROFS;
 }
 
 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
@@ -1988,7 +1990,8 @@ CLOSURE_CALLBACK(bch2_journal_write)
 			percpu_ref_get(&ca->io_ref);
 
 			bio = ca->journal.bio;
-			bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+			bio_reset(bio, ca->disk_sb.bdev,
+				  REQ_OP_WRITE|REQ_PREFLUSH);
 			bio->bi_end_io		= journal_write_endio;
 			bio->bi_private		= ca;
 			closure_bio_submit(bio, cl);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 820d25e19e5f..c33dca641575 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -205,7 +205,7 @@ void bch2_journal_space_available(struct journal *j)
 
 	j->can_discard = can_discard;
 
-	if (nr_online < c->opts.metadata_replicas_required) {
+	if (nr_online < metadata_replicas_required(c)) {
 		ret = JOURNAL_ERR_insufficient_devices;
 		goto out;
 	}
@@ -892,9 +892,11 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
 					 journal_seq_pin(j, seq)->devs);
 		seq++;
 
-		spin_unlock(&j->lock);
-		ret = bch2_mark_replicas(c, &replicas.e);
-		spin_lock(&j->lock);
+		if (replicas.e.nr_devs) {
+			spin_unlock(&j->lock);
+			ret = bch2_mark_replicas(c, &replicas.e);
+			spin_lock(&j->lock);
+		}
 	}
 	spin_unlock(&j->lock);
 err:
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
index b2be565bb8f2..64df11ab422b 100644
--- a/fs/bcachefs/mean_and_variance.h
+++ b/fs/bcachefs/mean_and_variance.h
@@ -17,7 +17,7 @@
  * Rust and rustc has issues with u128.
  */
 
-#if defined(__SIZEOF_INT128__) && defined(__KERNEL__)
+#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC)
 
 typedef struct {
 	unsigned __int128 v;
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
index accf246c3233..b27d22925929 100644
--- a/fs/bcachefs/printbuf.c
+++ b/fs/bcachefs/printbuf.c
@@ -56,6 +56,7 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
 
 		va_copy(args2, args);
 		len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+		va_end(args2);
 	} while (len + 1 >= printbuf_remaining(out) &&
 		 !bch2_printbuf_make_room(out, len + 1));
 
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 9127d0e3ca2f..21e13bb4335b 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -577,8 +577,9 @@ u64 bch2_recovery_passes_from_stable(u64 v)
 
 static bool check_version_upgrade(struct bch_fs *c)
 {
-	unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version);
 	unsigned latest_version	= bcachefs_metadata_version_current;
+	unsigned latest_compatible = min(latest_version,
+					 bch2_latest_compatible_version(c->sb.version));
 	unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
 	unsigned new_version = 0;
 
@@ -597,7 +598,7 @@ static bool check_version_upgrade(struct bch_fs *c)
 			new_version = latest_version;
 			break;
 		case BCH_VERSION_UPGRADE_none:
-			new_version = old_version;
+			new_version = min(old_version, latest_version);
 			break;
 		}
 	}
@@ -774,7 +775,7 @@ int bch2_fs_recovery(struct bch_fs *c)
 		goto err;
 	}
 
-	if (!(c->opts.nochanges && c->opts.norecovery)) {
+	if (!c->opts.nochanges) {
 		mutex_lock(&c->sb_lock);
 		bool write_sb = false;
 
@@ -804,7 +805,7 @@ int bch2_fs_recovery(struct bch_fs *c)
 		if (bch2_check_version_downgrade(c)) {
 			struct printbuf buf = PRINTBUF;
 
-			prt_str(&buf, "Version downgrade required:\n");
+			prt_str(&buf, "Version downgrade required:");
 
 			__le64 passes = ext->recovery_passes_required[0];
 			bch2_sb_set_downgrade(c,
@@ -812,7 +813,7 @@ int bch2_fs_recovery(struct bch_fs *c)
 					BCH_VERSION_MINOR(c->sb.version));
 			passes = ext->recovery_passes_required[0] & ~passes;
 			if (passes) {
-				prt_str(&buf, "  running recovery passes: ");
+				prt_str(&buf, "\n  running recovery passes: ");
 				prt_bitflags(&buf, bch2_recovery_passes,
 					     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
 			}
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index a45354d2acde..eff5ce18c69c 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -421,7 +421,7 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
 	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
 	for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
 		m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
-	m->errors_reset_time = ktime_get_real_seconds();
+	m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());
 
 	bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 45f67e8b29eb..ac6ba04d5521 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -728,7 +728,7 @@ static int check_snapshot(struct btree_trans *trans,
 		return 0;
 
 	memset(&s, 0, sizeof(s));
-	memcpy(&s, k.v, bkey_val_bytes(k.k));
+	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
 
 	id = le32_to_cpu(s.parent);
 	if (id) {
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 89fdb7c21134..fcaa5a888744 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -160,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s
 }
 
 static __always_inline int
-bch2_hash_lookup(struct btree_trans *trans,
+bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
 		 struct btree_iter *iter,
 		 const struct bch_hash_desc desc,
 		 const struct bch_hash_info *info,
 		 subvol_inum inum, const void *key,
-		 unsigned flags)
+		 unsigned flags, u32 snapshot)
 {
 	struct bkey_s_c k;
-	u32 snapshot;
 	int ret;
 
-	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-	if (ret)
-		return ret;
-
 	for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
 			   SPOS(inum.inum, desc.hash_key(info, key), snapshot),
 			   POS(inum.inum, U64_MAX),
@@ -195,6 +190,19 @@ bch2_hash_lookup(struct btree_trans *trans,
 }
 
 static __always_inline int
+bch2_hash_lookup(struct btree_trans *trans,
+		 struct btree_iter *iter,
+		 const struct bch_hash_desc desc,
+		 const struct bch_hash_info *info,
+		 subvol_inum inum, const void *key,
+		 unsigned flags)
+{
+	u32 snapshot;
+	return  bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+		bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
+}
+
+static __always_inline int
 bch2_hash_hole(struct btree_trans *trans,
 	       struct btree_iter *iter,
 	       const struct bch_hash_desc desc,
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index d60c7d27a047..36988add581f 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -717,7 +717,7 @@ retry:
 
 	if (IS_ERR(sb->bdev_handle)) {
 		ret = PTR_ERR(sb->bdev_handle);
-		goto out;
+		goto err;
 	}
 	sb->bdev = sb->bdev_handle->bdev;
 
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index b9911402b175..6b23e11825e6 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1428,10 +1428,10 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
 
 		required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
 			       ? c->opts.metadata_replicas
-			       : c->opts.metadata_replicas_required,
+			       : metadata_replicas_required(c),
 			       !(flags & BCH_FORCE_IF_DATA_DEGRADED)
 			       ? c->opts.data_replicas
-			       : c->opts.data_replicas_required);
+			       : data_replicas_required(c));
 
 		return nr_rw >= required;
 	case BCH_MEMBER_STATE_failed:
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index b1c867aa2b58..9220d7de10db 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -53,9 +53,9 @@ int bch2_run_thread_with_file(struct thread_with_file *thr,
 	if (ret)
 		goto err;
 
-	fd_install(fd, file);
 	get_task_struct(thr->task);
 	wake_up_process(thr->task);
+	fd_install(fd, file);
 	return fd;
 err:
 	if (fd >= 0)
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index a135136adeee..3a32faa86b5c 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -272,14 +272,14 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
 	console_unlock();
 }
 
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr)
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
+			gfp_t gfp)
 {
 #ifdef CONFIG_STACKTRACE
 	unsigned nr_entries = 0;
-	int ret = 0;
 
 	stack->nr = 0;
-	ret = darray_make_room(stack, 32);
+	int ret = darray_make_room_gfp(stack, 32, gfp);
 	if (ret)
 		return ret;
 
@@ -289,7 +289,7 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigne
 	do {
 		nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
 	} while (nr_entries == stack->size &&
-		 !(ret = darray_make_room(stack, stack->size * 2)));
+		 !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp)));
 
 	stack->nr = nr_entries;
 	up_read(&task->signal->exec_update_lock);
@@ -308,10 +308,10 @@ void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
 	}
 }
 
-int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr)
+int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
 {
 	bch_stacktrace stack = { 0 };
-	int ret = bch2_save_backtrace(&stack, task, skipnr + 1);
+	int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
 
 	bch2_prt_backtrace(out, &stack);
 	darray_exit(&stack);
@@ -418,14 +418,15 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
 		bch2_quantiles_update(&stats->quantiles, duration);
 	}
 
-	if (time_after64(end, stats->last_event)) {
+	if (stats->last_event && time_after64(end, stats->last_event)) {
 		freq = end - stats->last_event;
 		mean_and_variance_update(&stats->freq_stats, freq);
 		mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
 		stats->max_freq = max(stats->max_freq, freq);
 		stats->min_freq = min(stats->min_freq, freq);
-		stats->last_event = end;
 	}
+
+	stats->last_event = end;
 }
 
 static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index df67bf55fe2b..b414736d59a5 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -348,9 +348,9 @@ void bch2_prt_u64_base2(struct printbuf *, u64);
 void bch2_print_string_as_lines(const char *prefix, const char *lines);
 
 typedef DARRAY(unsigned long) bch_stacktrace;
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned);
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
 void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
-int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned);
+int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t);
 
 static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
 {
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index a9be9ac99222..378d9103a207 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1455,6 +1455,7 @@ out:
  */
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 {
+	LIST_HEAD(retry_list);
 	struct btrfs_block_group *block_group;
 	struct btrfs_space_info *space_info;
 	struct btrfs_trans_handle *trans;
@@ -1476,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 
 	spin_lock(&fs_info->unused_bgs_lock);
 	while (!list_empty(&fs_info->unused_bgs)) {
+		u64 used;
 		int trimming;
 
 		block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1511,9 +1513,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 			goto next;
 		}
 
+		spin_lock(&space_info->lock);
 		spin_lock(&block_group->lock);
-		if (block_group->reserved || block_group->pinned ||
-		    block_group->used || block_group->ro ||
+		if (btrfs_is_block_group_used(block_group) || block_group->ro ||
 		    list_is_singular(&block_group->list)) {
 			/*
 			 * We want to bail if we made new allocations or have
@@ -1523,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 			 */
 			trace_btrfs_skip_unused_block_group(block_group);
 			spin_unlock(&block_group->lock);
+			spin_unlock(&space_info->lock);
 			up_write(&space_info->groups_sem);
 			goto next;
 		}
+
+		/*
+		 * The block group may be unused but there may be space reserved
+		 * accounting with the existence of that block group, that is,
+		 * space_info->bytes_may_use was incremented by a task but no
+		 * space was yet allocated from the block group by the task.
+		 * That space may or may not be allocated, as we are generally
+		 * pessimistic about space reservation for metadata as well as
+		 * for data when using compression (as we reserve space based on
+		 * the worst case, when data can't be compressed, and before
+		 * actually attempting compression, before starting writeback).
+		 *
+		 * So check if the total space of the space_info minus the size
+		 * of this block group is less than the used space of the
+		 * space_info - if that's the case, then it means we have tasks
+		 * that might be relying on the block group in order to allocate
+		 * extents, and add back the block group to the unused list when
+		 * we finish, so that we retry later in case no tasks ended up
+		 * needing to allocate extents from the block group.
+		 */
+		used = btrfs_space_info_used(space_info, true);
+		if (space_info->total_bytes - block_group->length < used) {
+			/*
+			 * Add a reference for the list, compensate for the ref
+			 * drop under the "next" label for the
+			 * fs_info->unused_bgs list.
+			 */
+			btrfs_get_block_group(block_group);
+			list_add_tail(&block_group->bg_list, &retry_list);
+
+			trace_btrfs_skip_unused_block_group(block_group);
+			spin_unlock(&block_group->lock);
+			spin_unlock(&space_info->lock);
+			up_write(&space_info->groups_sem);
+			goto next;
+		}
+
 		spin_unlock(&block_group->lock);
+		spin_unlock(&space_info->lock);
 
 		/* We don't want to force the issue, only flip if it's ok. */
 		ret = inc_block_group_ro(block_group, 0);
@@ -1650,12 +1691,16 @@ next:
 		btrfs_put_block_group(block_group);
 		spin_lock(&fs_info->unused_bgs_lock);
 	}
+	list_splice_tail(&retry_list, &fs_info->unused_bgs);
 	spin_unlock(&fs_info->unused_bgs_lock);
 	mutex_unlock(&fs_info->reclaim_bgs_lock);
 	return;
 
 flip_async:
 	btrfs_end_transaction(trans);
+	spin_lock(&fs_info->unused_bgs_lock);
+	list_splice_tail(&retry_list, &fs_info->unused_bgs);
+	spin_unlock(&fs_info->unused_bgs_lock);
 	mutex_unlock(&fs_info->reclaim_bgs_lock);
 	btrfs_put_block_group(block_group);
 	btrfs_discard_punt_unused_bgs_list(fs_info);
@@ -2684,6 +2729,37 @@ next:
 		btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
 		list_del_init(&block_group->bg_list);
 		clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+
+		/*
+		 * If the block group is still unused, add it to the list of
+		 * unused block groups. The block group may have been created in
+		 * order to satisfy a space reservation, in which case the
+		 * extent allocation only happens later. But often we don't
+		 * actually need to allocate space that we previously reserved,
+		 * so the block group may become unused for a long time. For
+		 * example for metadata we generally reserve space for a worst
+		 * possible scenario, but then don't end up allocating all that
+		 * space or none at all (due to no need to COW, extent buffers
+		 * were already COWed in the current transaction and still
+		 * unwritten, tree heights lower than the maximum possible
+		 * height, etc). For data we generally reserve the exact amount
+		 * of space we are going to allocate later, the exception is
+		 * when using compression, as we must reserve space based on the
+		 * uncompressed data size, because the compression is only done
+		 * when writeback is triggered and we don't know how much space
+		 * we are actually going to need, so we reserve the uncompressed
+		 * size because the data may be uncompressible in the worst case.
+		 */
+		if (ret == 0) {
+			bool used;
+
+			spin_lock(&block_group->lock);
+			used = btrfs_is_block_group_used(block_group);
+			spin_unlock(&block_group->lock);
+
+			if (!used)
+				btrfs_mark_bg_unused(block_group);
+		}
 	}
 	btrfs_trans_release_chunk_metadata(trans);
 }
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index c4a1f01cc1c2..962b11983901 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -257,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
 	return (block_group->start + block_group->length);
 }
 
+static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
+{
+	lockdep_assert_held(&bg->lock);
+
+	return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
+}
+
 static inline bool btrfs_is_block_group_data_only(
 					struct btrfs_block_group *block_group)
 {
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index ceb5f586a2d5..1043a8142351 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -494,7 +494,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
 
 	block_rsv = get_block_rsv(trans, root);
 
-	if (unlikely(block_rsv->size == 0))
+	if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
 		goto try_reserve;
 again:
 	ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index b0bd12b8652f..43a9a6b5a79f 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -101,4 +101,36 @@ static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
 	return data_race(rsv->full);
 }
 
+/*
+ * Get the reserved amount of a block reserve in a context where getting a
+ * stale value is acceptable, instead of accessing it directly and triggering
+ * a data race warning from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_reserved(struct btrfs_block_rsv *rsv)
+{
+	u64 ret;
+
+	spin_lock(&rsv->lock);
+	ret = rsv->reserved;
+	spin_unlock(&rsv->lock);
+
+	return ret;
+}
+
+/*
+ * Get the size of a block reserve in a context where getting a stale value is
+ * acceptable, instead of accessing it directly and triggering a data race
+ * warning from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv)
+{
+	u64 ret;
+
+	spin_lock(&rsv->lock);
+	ret = rsv->size;
+	spin_unlock(&rsv->lock);
+
+	return ret;
+}
+
 #endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 193168214eeb..68345f73d429 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -141,16 +141,16 @@ static int compression_decompress_bio(struct list_head *ws,
 }
 
 static int compression_decompress(int type, struct list_head *ws,
-               const u8 *data_in, struct page *dest_page,
-               unsigned long start_byte, size_t srclen, size_t destlen)
+		const u8 *data_in, struct page *dest_page,
+		unsigned long dest_pgoff, size_t srclen, size_t destlen)
 {
 	switch (type) {
 	case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
-						start_byte, srclen, destlen);
+						dest_pgoff, srclen, destlen);
 	case BTRFS_COMPRESS_LZO:  return lzo_decompress(ws, data_in, dest_page,
-						start_byte, srclen, destlen);
+						dest_pgoff, srclen, destlen);
 	case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
-						start_byte, srclen, destlen);
+						dest_pgoff, srclen, destlen);
 	case BTRFS_COMPRESS_NONE:
 	default:
 		/*
@@ -1037,14 +1037,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
  * start_byte tells us the offset into the compressed data we're interested in
  */
 int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
-		     unsigned long start_byte, size_t srclen, size_t destlen)
+		     unsigned long dest_pgoff, size_t srclen, size_t destlen)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
 	struct list_head *workspace;
+	const u32 sectorsize = fs_info->sectorsize;
 	int ret;
 
+	/*
+	 * The full destination page range should not exceed the page size.
+	 * And the @destlen should not exceed sectorsize, as this is only called for
+	 * inline file extents, which should not exceed sectorsize.
+	 */
+	ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
+
 	workspace = get_workspace(type, 0);
 	ret = compression_decompress(type, workspace, data_in, dest_page,
-				     start_byte, srclen, destlen);
+				     dest_pgoff, srclen, destlen);
 	put_workspace(type, workspace);
 
 	return ret;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 93cc92974dee..afd7e50d073d 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -148,7 +148,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 		unsigned long *total_in, unsigned long *total_out);
 int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
 int zlib_decompress(struct list_head *ws, const u8 *data_in,
-		struct page *dest_page, unsigned long start_byte, size_t srclen,
+		struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
 		size_t destlen);
 struct list_head *zlib_alloc_workspace(unsigned int level);
 void zlib_free_workspace(struct list_head *ws);
@@ -159,7 +159,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
 		unsigned long *total_in, unsigned long *total_out);
 int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
 int lzo_decompress(struct list_head *ws, const u8 *data_in,
-		struct page *dest_page, unsigned long start_byte, size_t srclen,
+		struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
 		size_t destlen);
 struct list_head *lzo_alloc_workspace(unsigned int level);
 void lzo_free_workspace(struct list_head *ws);
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index c276b136ab63..5b0b64571418 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -1046,7 +1046,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 			goto add;
 
 		/* Skip too large extent */
-		if (range_len >= extent_thresh)
+		if (em->len >= extent_thresh)
 			goto next;
 
 		/*
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 2833e8ef4c09..acf9f4b6c044 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 	u64 reserve_size = 0;
 	u64 qgroup_rsv_size = 0;
-	u64 csum_leaves;
 	unsigned outstanding_extents;
 
 	lockdep_assert_held(&inode->lock);
@@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 						outstanding_extents);
 		reserve_size += btrfs_calc_metadata_size(fs_info, 1);
 	}
-	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
-						 inode->csum_bytes);
-	reserve_size += btrfs_calc_insert_metadata_size(fs_info,
-							csum_leaves);
+	if (!(inode->flags & BTRFS_INODE_NODATASUM)) {
+		u64 csum_leaves;
+
+		csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
+		reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
+	}
 	/*
 	 * For qgroup rsv, the calculation is very simple:
 	 * account one nodesize for each outstanding extent
@@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 	spin_unlock(&block_rsv->lock);
 }
 
-static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
+static void calc_inode_reservations(struct btrfs_inode *inode,
 				    u64 num_bytes, u64 disk_num_bytes,
 				    u64 *meta_reserve, u64 *qgroup_reserve)
 {
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	u64 nr_extents = count_max_extents(fs_info, num_bytes);
-	u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+	u64 csum_leaves;
 	u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
 
+	if (inode->flags & BTRFS_INODE_NODATASUM)
+		csum_leaves = 0;
+	else
+		csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+
 	*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
 						nr_extents + csum_leaves);
 
@@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
 	 * everything out and try again, which is bad.  This way we just
 	 * over-reserve slightly, and clean up the mess when we are done.
 	 */
-	calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+	calc_inode_reservations(inode, num_bytes, disk_num_bytes,
 				&meta_reserve, &qgroup_reserve);
 	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
 						 noflush);
@@ -359,7 +366,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
 	nr_extents = count_max_extents(fs_info, num_bytes);
 	spin_lock(&inode->lock);
 	btrfs_mod_outstanding_extents(inode, nr_extents);
-	inode->csum_bytes += disk_num_bytes;
+	if (!(inode->flags & BTRFS_INODE_NODATASUM))
+		inode->csum_bytes += disk_num_bytes;
 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 	spin_unlock(&inode->lock);
 
@@ -393,7 +401,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
 
 	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
 	spin_lock(&inode->lock);
-	inode->csum_bytes -= num_bytes;
+	if (!(inode->flags & BTRFS_INODE_NODATASUM))
+		inode->csum_bytes -= num_bytes;
 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 	spin_unlock(&inode->lock);
 
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1502d664c892..79c4293ddf37 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -725,6 +725,23 @@ leave:
 	return ret;
 }
 
+static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args)
+{
+	if (args->start.srcdevid == 0) {
+		if (memchr(args->start.srcdev_name, 0,
+			   sizeof(args->start.srcdev_name)) == NULL)
+			return -ENAMETOOLONG;
+	} else {
+		args->start.srcdev_name[0] = 0;
+	}
+
+	if (memchr(args->start.tgtdev_name, 0,
+		   sizeof(args->start.tgtdev_name)) == NULL)
+	    return -ENAMETOOLONG;
+
+	return 0;
+}
+
 int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
 			    struct btrfs_ioctl_dev_replace_args *args)
 {
@@ -737,10 +754,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
 	default:
 		return -EINVAL;
 	}
-
-	if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
-	    args->start.tgtdev_name[0] == '\0')
-		return -EINVAL;
+	ret = btrfs_check_replace_dev_names(args);
+	if (ret < 0)
+		return ret;
 
 	ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
 					args->start.srcdevid,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c6907d533fe8..c843563914ca 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1307,12 +1307,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
  *
  * @objectid:	root id
  * @anon_dev:	preallocated anonymous block device number for new roots,
- * 		pass 0 for new allocation.
+ *		pass NULL for a new allocation.
  * @check_ref:	whether to check root item references, If true, return -ENOENT
  *		for orphan roots
  */
 static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
-					     u64 objectid, dev_t anon_dev,
+					     u64 objectid, dev_t *anon_dev,
 					     bool check_ref)
 {
 	struct btrfs_root *root;
@@ -1336,8 +1336,17 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
 again:
 	root = btrfs_lookup_fs_root(fs_info, objectid);
 	if (root) {
-		/* Shouldn't get preallocated anon_dev for cached roots */
-		ASSERT(!anon_dev);
+		/*
+		 * Some other caller may have read out the newly inserted
+		 * subvolume already (for things like backref walk etc).  Not
+		 * that common but still possible.  In that case, we just need
+		 * to free the anon_dev.
+		 */
+		if (unlikely(anon_dev && *anon_dev)) {
+			free_anon_bdev(*anon_dev);
+			*anon_dev = 0;
+		}
+
 		if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
 			btrfs_put_root(root);
 			return ERR_PTR(-ENOENT);
@@ -1357,7 +1366,7 @@ again:
 		goto fail;
 	}
 
-	ret = btrfs_init_fs_root(root, anon_dev);
+	ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0);
 	if (ret)
 		goto fail;
 
@@ -1393,7 +1402,7 @@ fail:
 	 * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
 	 * and once again by our caller.
 	 */
-	if (anon_dev)
+	if (anon_dev && *anon_dev)
 		root->anon_dev = 0;
 	btrfs_put_root(root);
 	return ERR_PTR(ret);
@@ -1409,7 +1418,7 @@ fail:
 struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
 				     u64 objectid, bool check_ref)
 {
-	return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
+	return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref);
 }
 
 /*
@@ -1417,11 +1426,11 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
  * the anonymous block device id
  *
  * @objectid:	tree objectid
- * @anon_dev:	if zero, allocate a new anonymous block device or use the
- *		parameter value
+ * @anon_dev:	if NULL, allocate a new anonymous block device or use the
+ *		parameter value if not NULL
  */
 struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
-					 u64 objectid, dev_t anon_dev)
+					 u64 objectid, dev_t *anon_dev)
 {
 	return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
 }
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 9413726b329b..eb3473d1c1ac 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -61,7 +61,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
 struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
 				     u64 objectid, bool check_ref);
 struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
-					 u64 objectid, dev_t anon_dev);
+					 u64 objectid, dev_t *anon_dev);
 struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
 						 struct btrfs_path *path,
 						 u64 objectid);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f396aba92c57..8e8cc1111277 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1260,7 +1260,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
 	u64 bytes_left, end;
 	u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
 
-	if (WARN_ON(start != aligned_start)) {
+	/* Adjust the range to be aligned to 512B sectors if necessary. */
+	if (start != aligned_start) {
 		len -= aligned_start - start;
 		len = round_down(len, 1 << SECTOR_SHIFT);
 		start = aligned_start;
@@ -4298,6 +4299,42 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+				    struct find_free_extent_ctl *ffe_ctl)
+{
+	if (ffe_ctl->for_treelog) {
+		spin_lock(&fs_info->treelog_bg_lock);
+		if (fs_info->treelog_bg)
+			ffe_ctl->hint_byte = fs_info->treelog_bg;
+		spin_unlock(&fs_info->treelog_bg_lock);
+	} else if (ffe_ctl->for_data_reloc) {
+		spin_lock(&fs_info->relocation_bg_lock);
+		if (fs_info->data_reloc_bg)
+			ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+		spin_unlock(&fs_info->relocation_bg_lock);
+	} else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+		struct btrfs_block_group *block_group;
+
+		spin_lock(&fs_info->zone_active_bgs_lock);
+		list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
+			/*
+			 * No lock is OK here because avail is monotonically
+			 * decreasing, and this is just a hint.
+			 */
+			u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+
+			if (block_group_bits(block_group, ffe_ctl->flags) &&
+			    avail >= ffe_ctl->num_bytes) {
+				ffe_ctl->hint_byte = block_group->start;
+				break;
+			}
+		}
+		spin_unlock(&fs_info->zone_active_bgs_lock);
+	}
+
+	return 0;
+}
+
 static int prepare_allocation(struct btrfs_fs_info *fs_info,
 			      struct find_free_extent_ctl *ffe_ctl,
 			      struct btrfs_space_info *space_info,
@@ -4308,19 +4345,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
 		return prepare_allocation_clustered(fs_info, ffe_ctl,
 						    space_info, ins);
 	case BTRFS_EXTENT_ALLOC_ZONED:
-		if (ffe_ctl->for_treelog) {
-			spin_lock(&fs_info->treelog_bg_lock);
-			if (fs_info->treelog_bg)
-				ffe_ctl->hint_byte = fs_info->treelog_bg;
-			spin_unlock(&fs_info->treelog_bg_lock);
-		}
-		if (ffe_ctl->for_data_reloc) {
-			spin_lock(&fs_info->relocation_bg_lock);
-			if (fs_info->data_reloc_bg)
-				ffe_ctl->hint_byte = fs_info->data_reloc_bg;
-			spin_unlock(&fs_info->relocation_bg_lock);
-		}
-		return 0;
+		return prepare_allocation_zoned(fs_info, ffe_ctl);
 	default:
 		BUG();
 	}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cfd2967f04a2..8b4bef05e222 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2480,6 +2480,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
 				struct fiemap_cache *cache,
 				u64 offset, u64 phys, u64 len, u32 flags)
 {
+	u64 cache_end;
 	int ret = 0;
 
 	/* Set at the end of extent_fiemap(). */
@@ -2489,15 +2490,102 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
 		goto assign;
 
 	/*
-	 * Sanity check, extent_fiemap() should have ensured that new
-	 * fiemap extent won't overlap with cached one.
-	 * Not recoverable.
+	 * When iterating the extents of the inode, at extent_fiemap(), we may
+	 * find an extent that starts at an offset behind the end offset of the
+	 * previous extent we processed. This happens if fiemap is called
+	 * without FIEMAP_FLAG_SYNC and there are ordered extents completing
+	 * while we call btrfs_next_leaf() (through fiemap_next_leaf_item()).
 	 *
-	 * NOTE: Physical address can overlap, due to compression
+	 * For example we are in leaf X processing its last item, which is the
+	 * file extent item for file range [512K, 1M[, and after
+	 * btrfs_next_leaf() releases the path, there's an ordered extent that
+	 * completes for the file range [768K, 2M[, and that results in trimming
+	 * the file extent item so that it now corresponds to the file range
+	 * [512K, 768K[ and a new file extent item is inserted for the file
+	 * range [768K, 2M[, which may end up as the last item of leaf X or as
+	 * the first item of the next leaf - in either case btrfs_next_leaf()
+	 * will leave us with a path pointing to the new extent item, for the
+	 * file range [768K, 2M[, since that's the first key that follows the
+	 * last one we processed. So in order not to report overlapping extents
+	 * to user space, we trim the length of the previously cached extent and
+	 * emit it.
+	 *
+	 * Upon calling btrfs_next_leaf() we may also find an extent with an
+	 * offset smaller than or equal to cache->offset, and this happens
+	 * when we had a hole or prealloc extent with several delalloc ranges in
+	 * it, but after btrfs_next_leaf() released the path, delalloc was
+	 * flushed and the resulting ordered extents were completed, so we can
+	 * now have found a file extent item for an offset that is smaller than
+	 * or equal to what we have in cache->offset. We deal with this as
+	 * described below.
 	 */
-	if (cache->offset + cache->len > offset) {
-		WARN_ON(1);
-		return -EINVAL;
+	cache_end = cache->offset + cache->len;
+	if (cache_end > offset) {
+		if (offset == cache->offset) {
+			/*
+			 * We cached a delalloc range (found in the io tree) for
+			 * a hole or prealloc extent and we have now found a
+			 * file extent item for the same offset. What we have
+			 * now is more recent and up to date, so discard what
+			 * we had in the cache and use what we have just found.
+			 */
+			goto assign;
+		} else if (offset > cache->offset) {
+			/*
+			 * The extent range we previously found ends after the
+			 * offset of the file extent item we found and that
+			 * offset falls somewhere in the middle of that previous
+			 * extent range. So adjust the range we previously found
+			 * to end at the offset of the file extent item we have
+			 * just found, since this extent is more up to date.
+			 * Emit that adjusted range and cache the file extent
+			 * item we have just found. This corresponds to the case
+			 * where a previously found file extent item was split
+			 * due to an ordered extent completing.
+			 */
+			cache->len = offset - cache->offset;
+			goto emit;
+		} else {
+			const u64 range_end = offset + len;
+
+			/*
+			 * The offset of the file extent item we have just found
+			 * is behind the cached offset. This means we were
+			 * processing a hole or prealloc extent for which we
+			 * have found delalloc ranges (in the io tree), so what
+			 * we have in the cache is the last delalloc range we
+			 * found while the file extent item we found can be
+			 * either for a whole delalloc range we previously
+			 * emitted or only a part of that range.
+			 *
+			 * We have two cases here:
+			 *
+			 * 1) The file extent item's range ends at or behind the
+			 *    cached extent's end. In this case just ignore the
+			 *    current file extent item because we don't want to
+			 *    overlap with previous ranges that may have been
+			 *    emitted already;
+			 *
+			 * 2) The file extent item starts behind the currently
+			 *    cached extent but its end offset goes beyond the
+			 *    end offset of the cached extent. We don't want to
+			 *    overlap with a previous range that may have been
+			 *    emitted already, so we emit the currently cached
+			 *    extent and then partially store the current file
+			 *    extent item's range in the cache, for the subrange
+			 *    going from the cached extent's end to the end of
+			 *    the file extent item.
+			 */
+			if (range_end <= cache_end)
+				return 0;
+
+			if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
+				phys += cache_end - offset;
+
+			offset = cache_end;
+			len = range_end - cache_end;
+			goto emit;
+		}
 	}
 
 	/*
@@ -2517,6 +2605,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
 		return 0;
 	}
 
+emit:
 	/* Not mergeable, need to submit cached one */
 	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
 				      cache->len, cache->flags);
@@ -2689,16 +2778,34 @@ static int fiemap_process_hole(struct btrfs_inode *inode,
 	 * it beyond i_size.
 	 */
 	while (cur_offset < end && cur_offset < i_size) {
+		struct extent_state *cached_state = NULL;
 		u64 delalloc_start;
 		u64 delalloc_end;
 		u64 prealloc_start;
+		u64 lockstart;
+		u64 lockend;
 		u64 prealloc_len = 0;
 		bool delalloc;
 
+		lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize);
+		lockend = round_up(end, inode->root->fs_info->sectorsize);
+
+		/*
+		 * We are only locking for the delalloc range because that's the
+		 * only thing that can change here.  With fiemap we have a lock
+		 * on the inode, so no buffered or direct writes can happen.
+		 *
+		 * However mmaps and normal page writeback will cause this to
+		 * change arbitrarily.  We have to lock the extent lock here to
+		 * make sure that nobody messes with the tree while we're doing
+		 * btrfs_find_delalloc_in_range.
+		 */
+		lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
 		delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
 							delalloc_cached_state,
 							&delalloc_start,
 							&delalloc_end);
+		unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
 		if (!delalloc)
 			break;
 
@@ -2866,15 +2973,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
 		  u64 start, u64 len)
 {
 	const u64 ino = btrfs_ino(inode);
-	struct extent_state *cached_state = NULL;
 	struct extent_state *delalloc_cached_state = NULL;
 	struct btrfs_path *path;
 	struct fiemap_cache cache = { 0 };
 	struct btrfs_backref_share_check_ctx *backref_ctx;
 	u64 last_extent_end;
 	u64 prev_extent_end;
-	u64 lockstart;
-	u64 lockend;
+	u64 range_start;
+	u64 range_end;
+	const u64 sectorsize = inode->root->fs_info->sectorsize;
 	bool stopped = false;
 	int ret;
 
@@ -2885,22 +2992,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
 		goto out;
 	}
 
-	lockstart = round_down(start, inode->root->fs_info->sectorsize);
-	lockend = round_up(start + len, inode->root->fs_info->sectorsize);
-	prev_extent_end = lockstart;
-
-	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
-	lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+	range_start = round_down(start, sectorsize);
+	range_end = round_up(start + len, sectorsize);
+	prev_extent_end = range_start;
 
 	ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
 	if (ret < 0)
-		goto out_unlock;
+		goto out;
 	btrfs_release_path(path);
 
 	path->reada = READA_FORWARD;
-	ret = fiemap_search_slot(inode, path, lockstart);
+	ret = fiemap_search_slot(inode, path, range_start);
 	if (ret < 0) {
-		goto out_unlock;
+		goto out;
 	} else if (ret > 0) {
 		/*
 		 * No file extent item found, but we may have delalloc between
@@ -2910,7 +3014,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
 		goto check_eof_delalloc;
 	}
 
-	while (prev_extent_end < lockend) {
+	while (prev_extent_end < range_end) {
 		struct extent_buffer *leaf = path->nodes[0];
 		struct btrfs_file_extent_item *ei;
 		struct btrfs_key key;
@@ -2933,21 +3037,21 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
 		 * The first iteration can leave us at an extent item that ends
 		 * before our range's start. Move to the next item.
 		 */
-		if (extent_end <= lockstart)
+		if (extent_end <= range_start)
 			goto next_item;
 
 		backref_ctx->curr_leaf_bytenr = leaf->start;
 
 		/* We have in implicit hole (NO_HOLES feature enabled). */
 		if (prev_extent_end < key.offset) {
-			const u64 range_end = min(key.offset, lockend) - 1;
+			const u64 hole_end = min(key.offset, range_end) - 1;
 
 			ret = fiemap_process_hole(inode, fieinfo, &cache,
 						  &delalloc_cached_state,
 						  backref_ctx, 0, 0, 0,
-						  prev_extent_end, range_end);
+						  prev_extent_end, hole_end);
 			if (ret < 0) {
-				goto out_unlock;
+				goto out;
 			} else if (ret > 0) {
 				/* fiemap_fill_next_extent() told us to stop. */
 				stopped = true;
@@ -2955,7 +3059,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
 			}
 
 			/* We've reached the end of the fiemap range, stop. */
-			if (key.offset >= lockend) {
+			if (key.offset >= range_end) {
 				stopped = true;
 				break;
 			}
@@ -3003,7 +3107,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
 								  extent_gen,
 								  backref_ctx);
 				if (ret < 0)
-					goto out_unlock;
+					goto out;
 				else if (ret > 0)
 					flags |= FIEMAP_EXTENT_SHARED;
 			}
@@ -3014,7 +3118,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
 		}
 
 		if (ret < 0) {
-			goto out_unlock;
+			goto out;
 		} else if (ret > 0) {
 			/* fiemap_fill_next_extent() told us to stop. */
 			stopped = true;
@@ -3025,12 +3129,12 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
 next_item:
 		if (fatal_signal_pending(current)) {
 			ret = -EINTR;
-			goto out_unlock;
+			goto out;
 		}
 
 		ret = fiemap_next_leaf_item(inode, path);
 		if (ret < 0) {
-			goto out_unlock;
+			goto out;
 		} else if (ret > 0) {
 			/* No more file extent items for this inode. */
 			break;
@@ -3049,29 +3153,41 @@ check_eof_delalloc:
 	btrfs_free_path(path);
 	path = NULL;
 
-	if (!stopped && prev_extent_end < lockend) {
+	if (!stopped && prev_extent_end < range_end) {
 		ret = fiemap_process_hole(inode, fieinfo, &cache,
 					  &delalloc_cached_state, backref_ctx,
-					  0, 0, 0, prev_extent_end, lockend - 1);
+					  0, 0, 0, prev_extent_end, range_end - 1);
 		if (ret < 0)
-			goto out_unlock;
-		prev_extent_end = lockend;
+			goto out;
+		prev_extent_end = range_end;
 	}
 
 	if (cache.cached && cache.offset + cache.len >= last_extent_end) {
 		const u64 i_size = i_size_read(&inode->vfs_inode);
 
 		if (prev_extent_end < i_size) {
+			struct extent_state *cached_state = NULL;
 			u64 delalloc_start;
 			u64 delalloc_end;
+			u64 lockstart;
+			u64 lockend;
 			bool delalloc;
 
+			lockstart = round_down(prev_extent_end, sectorsize);
+			lockend = round_up(i_size, sectorsize);
+
+			/*
+			 * See the comment in fiemap_process_hole as to why
+			 * we're doing the locking here.
+			 */
+			lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
 			delalloc = btrfs_find_delalloc_in_range(inode,
 								prev_extent_end,
 								i_size - 1,
 								&delalloc_cached_state,
 								&delalloc_start,
 								&delalloc_end);
+			unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
 			if (!delalloc)
 				cache.flags |= FIEMAP_EXTENT_LAST;
 		} else {
@@ -3080,10 +3196,6 @@ check_eof_delalloc:
 	}
 
 	ret = emit_last_fiemap_cache(fieinfo, &cache);
-
-out_unlock:
-	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
-	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 out:
 	free_extent_state(delalloc_cached_state);
 	btrfs_free_backref_share_ctx(backref_ctx);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 809b11472a80..4795738d5785 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3184,8 +3184,23 @@ out:
 			unwritten_start += logical_len;
 		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
 
-		/* Drop extent maps for the part of the extent we didn't write. */
-		btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
+		/*
+		 * Drop extent maps for the part of the extent we didn't write.
+		 *
+		 * We have an exception here for the free_space_inode, this is
+		 * because when we do btrfs_get_extent() on the free space inode
+		 * we will search the commit root.  If this is a new block group
+		 * we won't find anything, and we will trip over the assert in
+		 * writepage where we do ASSERT(em->block_start !=
+		 * EXTENT_MAP_HOLE).
+		 *
+		 * Theoretically we could also skip this for any NOCOW extent as
+		 * we don't mess with the extent map tree in the NOCOW case, but
+		 * for now simply skip this if we are the free space inode.
+		 */
+		if (!btrfs_is_free_space_inode(inode))
+			btrfs_drop_extent_map_range(inode, unwritten_start,
+						    end, false);
 
 		/*
 		 * If the ordered extent had an IOERR or something else went
@@ -4458,6 +4473,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
 	u64 root_flags;
 	int ret;
 
+	down_write(&fs_info->subvol_sem);
+
 	/*
 	 * Don't allow to delete a subvolume with send in progress. This is
 	 * inside the inode lock so the error handling that has to drop the bit
@@ -4469,25 +4486,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
 		btrfs_warn(fs_info,
 			   "attempt to delete subvolume %llu during send",
 			   dest->root_key.objectid);
-		return -EPERM;
+		ret = -EPERM;
+		goto out_up_write;
 	}
 	if (atomic_read(&dest->nr_swapfiles)) {
 		spin_unlock(&dest->root_item_lock);
 		btrfs_warn(fs_info,
 			   "attempt to delete subvolume %llu with active swapfile",
 			   root->root_key.objectid);
-		return -EPERM;
+		ret = -EPERM;
+		goto out_up_write;
 	}
 	root_flags = btrfs_root_flags(&dest->root_item);
 	btrfs_set_root_flags(&dest->root_item,
 			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
 	spin_unlock(&dest->root_item_lock);
 
-	down_write(&fs_info->subvol_sem);
-
 	ret = may_destroy_subvol(dest);
 	if (ret)
-		goto out_up_write;
+		goto out_undead;
 
 	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
 	/*
@@ -4497,7 +4514,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
 	 */
 	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
 	if (ret)
-		goto out_up_write;
+		goto out_undead;
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
@@ -4563,15 +4580,17 @@ out_end_trans:
 	inode->i_flags |= S_DEAD;
 out_release:
 	btrfs_subvolume_release_metadata(root, &block_rsv);
-out_up_write:
-	up_write(&fs_info->subvol_sem);
+out_undead:
 	if (ret) {
 		spin_lock(&dest->root_item_lock);
 		root_flags = btrfs_root_flags(&dest->root_item);
 		btrfs_set_root_flags(&dest->root_item,
 				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
 		spin_unlock(&dest->root_item_lock);
-	} else {
+	}
+out_up_write:
+	up_write(&fs_info->subvol_sem);
+	if (!ret) {
 		d_invalidate(dentry);
 		btrfs_prune_dentries(dest);
 		ASSERT(dest->send_in_progress == 0);
@@ -7816,6 +7835,7 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			u64 start, u64 len)
 {
+	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
 	int	ret;
 
 	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
@@ -7841,7 +7861,26 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			return ret;
 	}
 
-	return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
+	btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+	/*
+	 * We did an initial flush to avoid holding the inode's lock while
+	 * triggering writeback and waiting for the completion of IO and ordered
+	 * extents. Now after we locked the inode we do it again, because it's
+	 * possible a new write may have happened in between those two steps.
+	 */
+	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
+		if (ret) {
+			btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+			return ret;
+		}
+	}
+
+	ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
+	btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+	return ret;
 }
 
 static int btrfs_writepages(struct address_space *mapping,
@@ -10269,6 +10308,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
 		return -EINVAL;
 
+	/*
+	 * Compressed extents should always have checksums, so error out if we
+	 * have a NOCOW file or inode was created while mounted with NODATASUM.
+	 */
+	if (inode->flags & BTRFS_INODE_NODATASUM)
+		return -EINVAL;
+
 	orig_count = iov_iter_count(from);
 
 	/* The extent size must be sane. */
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 41b479861b3c..9d1eac15e09e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -721,7 +721,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
 	free_extent_buffer(leaf);
 	leaf = NULL;
 
-	new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
+	new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev);
 	if (IS_ERR(new_root)) {
 		ret = PTR_ERR(new_root);
 		btrfs_abort_transaction(trans, ret);
@@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 		return -EOPNOTSUPP;
 	}
 
+	if (btrfs_root_refs(&root->root_item) == 0)
+		return -ENOENT;
+
 	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 		return -EINVAL;
 
@@ -2608,6 +2611,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 				ret = -EFAULT;
 				goto out;
 			}
+			if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) {
+				ret = -EOPNOTSUPP;
+				goto out;
+			}
 			/* compression requires us to start the IO */
 			if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
 				range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
@@ -3808,6 +3815,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
 		goto out;
 	}
 
+	if (sa->create && is_fstree(sa->qgroupid)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 1131d5a29d61..e43bc0fdc74e 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 }
 
 int lzo_decompress(struct list_head *ws, const u8 *data_in,
-		struct page *dest_page, unsigned long start_byte, size_t srclen,
+		struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
 		size_t destlen)
 {
 	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
+	const u32 sectorsize = fs_info->sectorsize;
 	size_t in_len;
 	size_t out_len;
 	size_t max_segment_len = WORKSPACE_BUF_LENGTH;
 	int ret = 0;
-	char *kaddr;
-	unsigned long bytes;
 
 	if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
 		return -EUCLEAN;
@@ -451,7 +451,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
 	}
 	data_in += LZO_LEN;
 
-	out_len = PAGE_SIZE;
+	out_len = sectorsize;
 	ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
 	if (ret != LZO_E_OK) {
 		pr_warn("BTRFS: decompress failed!\n");
@@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
 		goto out;
 	}
 
-	if (out_len < start_byte) {
+	ASSERT(out_len <= sectorsize);
+	memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len);
+	/* Early end, considered as an error. */
+	if (unlikely(out_len < destlen)) {
 		ret = -EIO;
-		goto out;
+		memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len);
 	}
-
-	/*
-	 * the caller is already checking against PAGE_SIZE, but lets
-	 * move this check closer to the memcpy/memset
-	 */
-	destlen = min_t(unsigned long, destlen, PAGE_SIZE);
-	bytes = min_t(unsigned long, destlen, out_len - start_byte);
-
-	kaddr = kmap_local_page(dest_page);
-	memcpy(kaddr, workspace->buf + start_byte, bytes);
-
-	/*
-	 * btrfs_getblock is doing a zero on the tail of the page too,
-	 * but this will cover anything missing from the decompressed
-	 * data.
-	 */
-	if (bytes < destlen)
-		memset(kaddr+bytes, 0, destlen-bytes);
-	kunmap_local(kaddr);
 out:
 	return ret;
 }
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 63b426cc7798..5470e1cdf10c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1736,6 +1736,15 @@ out:
 	return ret;
 }
 
+static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
+{
+	return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
+		qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
+		qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
+		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
+		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
+}
+
 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1755,6 +1764,11 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
 		goto out;
 	}
 
+	if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
 	/* Check if there are no children of this qgroup */
 	if (!list_empty(&qgroup->members)) {
 		ret = -EBUSY;
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 6486f0d7e993..8c4fc98ca9ce 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
 out_unlock:
 	spin_unlock(&fs_info->ref_verify_lock);
 out:
-	if (ret)
+	if (ret) {
+		btrfs_free_ref_cache(fs_info);
 		btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+	}
 	return ret;
 }
 
@@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
 		}
 	}
 	if (ret) {
-		btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
 		btrfs_free_ref_cache(fs_info);
+		btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
 	}
 	btrfs_free_path(path);
 	return ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a01807cbd4d4..0123d2728923 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1098,12 +1098,22 @@ out:
 static void scrub_read_endio(struct btrfs_bio *bbio)
 {
 	struct scrub_stripe *stripe = bbio->private;
+	struct bio_vec *bvec;
+	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+	int num_sectors;
+	u32 bio_size = 0;
+	int i;
+
+	ASSERT(sector_nr < stripe->nr_sectors);
+	bio_for_each_bvec_all(bvec, &bbio->bio, i)
+		bio_size += bvec->bv_len;
+	num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
 
 	if (bbio->bio.bi_status) {
-		bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
-		bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
+		bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
+		bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
 	} else {
-		bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+		bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
 	}
 	bio_put(&bbio->bio);
 	if (atomic_dec_and_test(&stripe->pending_io)) {
@@ -1636,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
 {
 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
 	struct btrfs_bio *bbio = NULL;
+	unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+				      stripe->bg->length - stripe->logical) >>
+				  fs_info->sectorsize_bits;
 	u64 stripe_len = BTRFS_STRIPE_LEN;
 	int mirror = stripe->mirror_num;
 	int i;
@@ -1646,6 +1659,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
 		struct page *page = scrub_stripe_get_page(stripe, i);
 		unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
 
+		/* We're beyond the chunk boundary, no need to read anymore. */
+		if (i >= nr_sectors)
+			break;
+
 		/* The current sector cannot be merged, submit the bio. */
 		if (bbio &&
 		    ((i > 0 &&
@@ -1701,6 +1718,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
 {
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	struct btrfs_bio *bbio;
+	unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+				      stripe->bg->length - stripe->logical) >>
+				  fs_info->sectorsize_bits;
 	int mirror = stripe->mirror_num;
 
 	ASSERT(stripe->bg);
@@ -1715,14 +1735,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
 	bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
 			       scrub_read_endio, stripe);
 
-	/* Read the whole stripe. */
 	bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
-	for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
+	/* Read the whole range inside the chunk boundary. */
+	for (unsigned int cur = 0; cur < nr_sectors; cur++) {
+		struct page *page = scrub_stripe_get_page(stripe, cur);
+		unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
 		int ret;
 
-		ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
+		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
 		/* We should have allocated enough bio vectors. */
-		ASSERT(ret == PAGE_SIZE);
+		ASSERT(ret == fs_info->sectorsize);
 	}
 	atomic_inc(&stripe->pending_io);
 
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 4e36550618e5..e48a063ef085 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6705,11 +6705,20 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
 				if (ret)
 					goto out;
 			}
-			if (sctx->cur_inode_last_extent <
-			    sctx->cur_inode_size) {
-				ret = send_hole(sctx, sctx->cur_inode_size);
-				if (ret)
+			if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
+				ret = range_is_hole_in_parent(sctx,
+						      sctx->cur_inode_last_extent,
+						      sctx->cur_inode_size);
+				if (ret < 0) {
 					goto out;
+				} else if (ret == 0) {
+					ret = send_hole(sctx, sctx->cur_inode_size);
+					if (ret < 0)
+						goto out;
+				} else {
+					/* Range is already a hole, skip. */
+					ret = 0;
+				}
 			}
 		}
 		if (need_truncate) {
@@ -8111,7 +8120,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
 	}
 
 	if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
-		ret = -EINVAL;
+		ret = -EOPNOTSUPP;
 		goto out;
 	}
 
@@ -8205,8 +8214,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
 		goto out;
 	}
 
-	sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
-				     arg->clone_sources_count + 1,
+	sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
+				     sizeof(*sctx->clone_roots),
 				     GFP_KERNEL);
 	if (!sctx->clone_roots) {
 		ret = -ENOMEM;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 571bb13587d5..3b54eb583474 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -856,7 +856,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
 static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
 				    struct btrfs_space_info *space_info)
 {
-	u64 global_rsv_size = fs_info->global_block_rsv.reserved;
+	const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
 	u64 ordered, delalloc;
 	u64 thresh;
 	u64 used;
@@ -956,8 +956,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
 	ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
 	delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
 	if (ordered >= delalloc)
-		used += fs_info->delayed_refs_rsv.reserved +
-			fs_info->delayed_block_rsv.reserved;
+		used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) +
+			btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv);
 	else
 		used += space_info->bytes_may_use - global_rsv_size;
 
@@ -1173,7 +1173,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
 		enum btrfs_flush_state flush;
 		u64 delalloc_size = 0;
 		u64 to_reclaim, block_rsv_size;
-		u64 global_rsv_size = global_rsv->reserved;
+		const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);
 
 		loops++;
 
@@ -1185,9 +1185,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
 		 * assume it's tied up in delalloc reservations.
 		 */
 		block_rsv_size = global_rsv_size +
-			delayed_block_rsv->reserved +
-			delayed_refs_rsv->reserved +
-			trans_rsv->reserved;
+			btrfs_block_rsv_reserved(delayed_block_rsv) +
+			btrfs_block_rsv_reserved(delayed_refs_rsv) +
+			btrfs_block_rsv_reserved(trans_rsv);
 		if (block_rsv_size < space_info->bytes_may_use)
 			delalloc_size = space_info->bytes_may_use - block_rsv_size;
 
@@ -1207,16 +1207,16 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
 			to_reclaim = delalloc_size;
 			flush = FLUSH_DELALLOC;
 		} else if (space_info->bytes_pinned >
-			   (delayed_block_rsv->reserved +
-			    delayed_refs_rsv->reserved)) {
+			   (btrfs_block_rsv_reserved(delayed_block_rsv) +
+			    btrfs_block_rsv_reserved(delayed_refs_rsv))) {
 			to_reclaim = space_info->bytes_pinned;
 			flush = COMMIT_TRANS;
-		} else if (delayed_block_rsv->reserved >
-			   delayed_refs_rsv->reserved) {
-			to_reclaim = delayed_block_rsv->reserved;
+		} else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
+			   btrfs_block_rsv_reserved(delayed_refs_rsv)) {
+			to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv);
 			flush = FLUSH_DELAYED_ITEMS_NR;
 		} else {
-			to_reclaim = delayed_refs_rsv->reserved;
+			to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv);
 			flush = FLUSH_DELAYED_REFS_NR;
 		}
 
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 93511d54abf8..0e49dab8dad2 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -475,7 +475,8 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
 
 	spin_lock_irqsave(&subpage->lock, flags);
 	bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
-	folio_start_writeback(folio);
+	if (!folio_test_writeback(folio))
+		folio_start_writeback(folio);
 	spin_unlock_irqrestore(&subpage->lock, flags);
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 896acfda1789..101f786963d4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1457,6 +1457,14 @@ static int btrfs_reconfigure(struct fs_context *fc)
 
 	btrfs_info_to_ctx(fs_info, &old_ctx);
 
+	/*
+	 * This is our "bind mount" trick, we don't want to allow the user to do
+	 * anything other than mount a different ro/rw and a different subvol,
+	 * all of the mount options should be maintained.
+	 */
+	if (mount_reconfigure)
+		ctx->mount_opt = old_ctx.mount_opt;
+
 	sync_filesystem(sb);
 	set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5b3333ceef04..bf8e64c766b6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
 					u64 num_bytes,
 					u64 *delayed_refs_bytes)
 {
-	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
 	struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
-	u64 extra_delayed_refs_bytes = 0;
-	u64 bytes;
+	u64 bytes = num_bytes + *delayed_refs_bytes;
 	int ret;
 
 	/*
-	 * If there's a gap between the size of the delayed refs reserve and
-	 * its reserved space, than some tasks have added delayed refs or bumped
-	 * its size otherwise (due to block group creation or removal, or block
-	 * group item update). Also try to allocate that gap in order to prevent
-	 * using (and possibly abusing) the global reserve when committing the
-	 * transaction.
-	 */
-	if (flush == BTRFS_RESERVE_FLUSH_ALL &&
-	    !btrfs_block_rsv_full(delayed_refs_rsv)) {
-		spin_lock(&delayed_refs_rsv->lock);
-		if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
-			extra_delayed_refs_bytes = delayed_refs_rsv->size -
-				delayed_refs_rsv->reserved;
-		spin_unlock(&delayed_refs_rsv->lock);
-	}
-
-	bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
-
-	/*
 	 * We want to reserve all the bytes we may need all at once, so we only
 	 * do 1 enospc flushing cycle per transaction start.
 	 */
 	ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
-	if (ret == 0) {
-		if (extra_delayed_refs_bytes > 0)
-			btrfs_migrate_to_delayed_refs_rsv(fs_info,
-							  extra_delayed_refs_bytes);
-		return 0;
-	}
-
-	if (extra_delayed_refs_bytes > 0) {
-		bytes -= extra_delayed_refs_bytes;
-		ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
-		if (ret == 0)
-			return 0;
-	}
 
 	/*
 	 * If we are an emergency flush, which can steal from the global block
 	 * reserve, then attempt to not reserve space for the delayed refs, as
 	 * we will consume space for them from the global block reserve.
 	 */
-	if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+	if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
 		bytes -= *delayed_refs_bytes;
 		*delayed_refs_bytes = 0;
 		ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
@@ -1868,7 +1834,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	}
 
 	key.offset = (u64)-1;
-	pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
+	pending->snap = btrfs_get_new_fs_root(fs_info, objectid, &pending->anon_dev);
 	if (IS_ERR(pending->snap)) {
 		ret = PTR_ERR(pending->snap);
 		pending->snap = NULL;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 50fdc69fdddf..6eccf8496486 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1436,7 +1436,7 @@ static int check_extent_item(struct extent_buffer *leaf,
 		if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
 			extent_err(leaf, slot,
 "inline ref item overflows extent item, ptr %lu iref size %u end %lu",
-				   ptr, inline_type, end);
+				   ptr, btrfs_extent_inline_ref_size(inline_type), end);
 			return -EUCLEAN;
 		}
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4c32497311d2..d67785be2c77 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3087,7 +3087,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
 	map = btrfs_find_chunk_map(fs_info, logical, length);
 
 	if (unlikely(!map)) {
-		read_unlock(&fs_info->mapping_tree_lock);
 		btrfs_crit(fs_info,
 			   "unable to find chunk map for logical %llu length %llu",
 			   logical, length);
@@ -3095,7 +3094,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
 	}
 
 	if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
-		read_unlock(&fs_info->mapping_tree_lock);
 		btrfs_crit(fs_info,
 			   "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
 			   logical, logical + length, map->start,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 36cf1f0e338e..8da66ea699e8 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -354,18 +354,13 @@ done:
 }
 
 int zlib_decompress(struct list_head *ws, const u8 *data_in,
-		struct page *dest_page, unsigned long start_byte, size_t srclen,
+		struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
 		size_t destlen)
 {
 	struct workspace *workspace = list_entry(ws, struct workspace, list);
 	int ret = 0;
 	int wbits = MAX_WBITS;
-	unsigned long bytes_left;
-	unsigned long total_out = 0;
-	unsigned long pg_offset = 0;
-
-	destlen = min_t(unsigned long, destlen, PAGE_SIZE);
-	bytes_left = destlen;
+	unsigned long to_copy;
 
 	workspace->strm.next_in = data_in;
 	workspace->strm.avail_in = srclen;
@@ -390,60 +385,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
 		return -EIO;
 	}
 
-	while (bytes_left > 0) {
-		unsigned long buf_start;
-		unsigned long buf_offset;
-		unsigned long bytes;
-
-		ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
-		if (ret != Z_OK && ret != Z_STREAM_END)
-			break;
-
-		buf_start = total_out;
-		total_out = workspace->strm.total_out;
-
-		if (total_out == buf_start) {
-			ret = -EIO;
-			break;
-		}
-
-		if (total_out <= start_byte)
-			goto next;
-
-		if (total_out > start_byte && buf_start < start_byte)
-			buf_offset = start_byte - buf_start;
-		else
-			buf_offset = 0;
-
-		bytes = min(PAGE_SIZE - pg_offset,
-			    PAGE_SIZE - (buf_offset % PAGE_SIZE));
-		bytes = min(bytes, bytes_left);
+	/*
+	 * Everything (in/out buf) should be at most one sector, there should
+	 * be no need to switch any input/output buffer.
+	 */
+	ret = zlib_inflate(&workspace->strm, Z_FINISH);
+	to_copy = min(workspace->strm.total_out, destlen);
+	if (ret != Z_STREAM_END)
+		goto out;
 
-		memcpy_to_page(dest_page, pg_offset,
-			       workspace->buf + buf_offset, bytes);
+	memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy);
 
-		pg_offset += bytes;
-		bytes_left -= bytes;
-next:
-		workspace->strm.next_out = workspace->buf;
-		workspace->strm.avail_out = workspace->buf_size;
-	}
-
-	if (ret != Z_STREAM_END && bytes_left != 0)
+out:
+	if (unlikely(to_copy != destlen)) {
+		pr_warn_ratelimited("BTRFS: inflate failed, decompressed=%lu expected=%zu\n",
+					to_copy, destlen);
 		ret = -EIO;
-	else
+	} else {
 		ret = 0;
+	}
 
 	zlib_inflateEnd(&workspace->strm);
 
-	/*
-	 * this should only happen if zlib returned fewer bytes than we
-	 * expected.  btrfs_get_block is responsible for zeroing from the
-	 * end of the inline extent (destlen) to the end of the page
-	 */
-	if (pg_offset < destlen) {
-		memzero_page(dest_page, pg_offset, destlen - pg_offset);
-	}
+	if (unlikely(to_copy < destlen))
+		memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy);
 	return ret;
 }
 
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 5bd76813b23f..5f750fa53a2b 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1639,6 +1639,15 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 	}
 
 out:
+	/* Reject non SINGLE data profiles without RST */
+	if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
+	    (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+	    !fs_info->stripe_root) {
+		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+			  btrfs_bg_type_to_raid_name(map->type));
+		return -EINVAL;
+	}
+
 	if (cache->alloc_offset > cache->zone_capacity) {
 		btrfs_err(fs_info,
 "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
@@ -1670,6 +1679,7 @@ out:
 	}
 	bitmap_free(active);
 	kfree(zone_info);
+	btrfs_free_chunk_map(map);
 
 	return ret;
 }
@@ -2055,6 +2065,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 
 	map = block_group->physical_map;
 
+	spin_lock(&fs_info->zone_active_bgs_lock);
 	spin_lock(&block_group->lock);
 	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
 		ret = true;
@@ -2067,7 +2078,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 		goto out_unlock;
 	}
 
-	spin_lock(&fs_info->zone_active_bgs_lock);
 	for (i = 0; i < map->num_stripes; i++) {
 		struct btrfs_zoned_device_info *zinfo;
 		int reserved = 0;
@@ -2087,20 +2097,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 		 */
 		if (atomic_read(&zinfo->active_zones_left) <= reserved) {
 			ret = false;
-			spin_unlock(&fs_info->zone_active_bgs_lock);
 			goto out_unlock;
 		}
 
 		if (!btrfs_dev_set_active_zone(device, physical)) {
 			/* Cannot activate the zone */
 			ret = false;
-			spin_unlock(&fs_info->zone_active_bgs_lock);
 			goto out_unlock;
 		}
 		if (!is_data)
 			zinfo->reserved_active_zones--;
 	}
-	spin_unlock(&fs_info->zone_active_bgs_lock);
 
 	/* Successfully activated all the zones */
 	set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
@@ -2108,8 +2115,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 
 	/* For the active block group list */
 	btrfs_get_block_group(block_group);
-
-	spin_lock(&fs_info->zone_active_bgs_lock);
 	list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
 	spin_unlock(&fs_info->zone_active_bgs_lock);
 
@@ -2117,6 +2122,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 
 out_unlock:
 	spin_unlock(&block_group->lock);
+	spin_unlock(&fs_info->zone_active_bgs_lock);
 	return ret;
 }
 
diff --git a/fs/buffer.c b/fs/buffer.c
index d3bcf601d3e5..4f73d23c2c46 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -55,7 +55,7 @@
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
-			  struct writeback_control *wbc);
+			  enum rw_hint hint, struct writeback_control *wbc);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
@@ -464,7 +464,7 @@ EXPORT_SYMBOL(mark_buffer_async_write);
  * a successful fsync().  For example, ext2 indirect blocks need to be
  * written back and waited upon before fsync() returns.
  *
- * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
+ * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
  * management of a list of dependent buffers at ->i_mapping->i_private_list.
  *
@@ -1889,7 +1889,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
+			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
+				      inode->i_write_hint, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -1944,7 +1945,8 @@ recover:
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			clear_buffer_dirty(bh);
-			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
+			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
+				      inode->i_write_hint, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -2756,6 +2758,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
 }
 
 static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+			  enum rw_hint write_hint,
 			  struct writeback_control *wbc)
 {
 	const enum req_op op = opf & REQ_OP_MASK;
@@ -2783,6 +2786,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
 	fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
 
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_write_hint = write_hint;
 
 	__bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
 
@@ -2802,7 +2806,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
 
 void submit_bh(blk_opf_t opf, struct buffer_head *bh)
 {
-	submit_bh_wbc(opf, bh, NULL);
+	submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
 }
 EXPORT_SYMBOL(submit_bh);
 
@@ -3121,12 +3125,8 @@ void __init buffer_init(void)
 	unsigned long nrpages;
 	int ret;
 
-	bh_cachep = kmem_cache_create("buffer_head",
-			sizeof(struct buffer_head), 0,
-				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
-				SLAB_MEM_SPREAD),
-				NULL);
-
+	bh_cachep = KMEM_CACHE(buffer_head,
+				SLAB_RECLAIM_ACCOUNT|SLAB_PANIC);
 	/*
 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
 	 */
diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c
index 7077f72e6f47..f449f7340aad 100644
--- a/fs/cachefiles/cache.c
+++ b/fs/cachefiles/cache.c
@@ -168,6 +168,8 @@ error_unsupported:
 	dput(root);
 error_open_root:
 	cachefiles_end_secure(cache, saved_cred);
+	put_cred(cache->cache_cred);
+	cache->cache_cred = NULL;
 error_getsec:
 	fscache_relinquish_cache(cache_cookie);
 	cache->cache = NULL;
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 3f24905f4066..6465e2574230 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -816,6 +816,7 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
 	cachefiles_put_directory(cache->graveyard);
 	cachefiles_put_directory(cache->store);
 	mntput(cache->mnt);
+	put_cred(cache->cache_cred);
 
 	kfree(cache->rootdirname);
 	kfree(cache->secctx);
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index 5fd74ec60bef..4ba42f1fa3b4 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -539,6 +539,9 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object)
 	struct fscache_volume *volume = object->volume->vcookie;
 	size_t volume_key_size, cookie_key_size, data_len;
 
+	if (!object->ondemand)
+		return 0;
+
 	/*
 	 * CacheFiles will firstly check the cache file under the root cache
 	 * directory. If the coherency check failed, it will fallback to
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 9c02f328c966..7fb4aae97412 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1452,7 +1452,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
 	if (flushing & CEPH_CAP_XATTR_EXCL) {
 		arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
 		arg->xattr_version = ci->i_xattrs.version;
-		arg->xattr_buf = ci->i_xattrs.blob;
+		arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob);
 	} else {
 		arg->xattr_buf = NULL;
 		arg->old_xattr_buf = NULL;
@@ -1553,6 +1553,7 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
 	encode_cap_msg(msg, arg);
 	ceph_con_send(&arg->session->s_con, msg);
 	ceph_buffer_put(arg->old_xattr_buf);
+	ceph_buffer_put(arg->xattr_buf);
 	if (arg->wake)
 		wake_up_all(&ci->i_cap_wq);
 }
@@ -2155,6 +2156,30 @@ retry:
 		      ceph_cap_string(cap->implemented),
 		      ceph_cap_string(revoking));
 
+		/* completed revocation? going down and there are no caps? */
+		if (revoking) {
+			if ((revoking & cap_used) == 0) {
+				doutc(cl, "completed revocation of %s\n",
+				      ceph_cap_string(cap->implemented & ~cap->issued));
+				goto ack;
+			}
+
+			/*
+			 * If the "i_wrbuffer_ref" was increased by mmap or generic
+			 * cache write just before the ceph_check_caps() is called,
+			 * the Fb capability revoking will fail this time. Then we
+			 * must wait for the BDI's delayed work to flush the dirty
+			 * pages and to release the "i_wrbuffer_ref", which will cost
+			 * at most 5 seconds. That means the MDS needs to wait at
+			 * most 5 seconds to finish the Fb capability's revocation.
+			 *
+			 * Let's queue a writeback for it.
+			 */
+			if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
+			    (revoking & CEPH_CAP_FILE_BUFFER))
+				queue_writeback = true;
+		}
+
 		if (cap == ci->i_auth_cap &&
 		    (cap->issued & CEPH_CAP_FILE_WR)) {
 			/* request larger max_size from MDS? */
@@ -2182,30 +2207,6 @@ retry:
 			}
 		}
 
-		/* completed revocation? going down and there are no caps? */
-		if (revoking) {
-			if ((revoking & cap_used) == 0) {
-				doutc(cl, "completed revocation of %s\n",
-				      ceph_cap_string(cap->implemented & ~cap->issued));
-				goto ack;
-			}
-
-			/*
-			 * If the "i_wrbuffer_ref" was increased by mmap or generic
-			 * cache write just before the ceph_check_caps() is called,
-			 * the Fb capability revoking will fail this time. Then we
-			 * must wait for the BDI's delayed work to flush the dirty
-			 * pages and to release the "i_wrbuffer_ref", which will cost
-			 * at most 5 seconds. That means the MDS needs to wait at
-			 * most 5 seconds to finished the Fb capability's revocation.
-			 *
-			 * Let's queue a writeback for it.
-			 */
-			if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
-			    (revoking & CEPH_CAP_FILE_BUFFER))
-				queue_writeback = true;
-		}
-
 		/* want more caps from mds? */
 		if (want & ~cap->mds_wanted) {
 			if (want & ~(cap->mds_wanted | cap->issued))
@@ -3215,7 +3216,6 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
 
 enum put_cap_refs_mode {
 	PUT_CAP_REFS_SYNC = 0,
-	PUT_CAP_REFS_NO_CHECK,
 	PUT_CAP_REFS_ASYNC,
 };
 
@@ -3331,11 +3331,6 @@ void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had)
 	__ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC);
 }
 
-void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
-{
-	__ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK);
-}
-
 /*
  * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
  * context.  Adjust per-snap dirty page accounting as appropriate.
@@ -4777,7 +4772,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
 		if (__ceph_caps_dirty(ci)) {
 			struct ceph_mds_client *mdsc =
 				ceph_inode_to_fs_client(inode)->mdsc;
-			__cap_delay_requeue_front(mdsc, ci);
+
+			doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
+			      ceph_vinop(inode));
+			spin_lock(&mdsc->cap_unlink_delay_lock);
+			ci->i_ceph_flags |= CEPH_I_FLUSH;
+			if (!list_empty(&ci->i_cap_delay_list))
+				list_del_init(&ci->i_cap_delay_list);
+			list_add_tail(&ci->i_cap_delay_list,
+				      &mdsc->cap_unlink_delay_list);
+			spin_unlock(&mdsc->cap_unlink_delay_lock);
+
+			/*
+			 * Fire the work immediately, because the MDS may be
+			 * waiting for caps release.
+			 */
+			ceph_queue_cap_unlink_work(mdsc);
 		}
 	}
 	spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 0c25d326afc4..7b2e77517f23 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -78,6 +78,8 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
+
 	if (!S_ISLNK(*mode)) {
 		err = ceph_pre_init_acls(dir, mode, as_ctx);
 		if (err < 0)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 548d1de379f3..3ab9c268a8bb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1089,7 +1089,7 @@ void ceph_mdsc_release_request(struct kref *kref)
 	struct ceph_mds_request *req = container_of(kref,
 						    struct ceph_mds_request,
 						    r_kref);
-	ceph_mdsc_release_dir_caps_no_check(req);
+	ceph_mdsc_release_dir_caps_async(req);
 	destroy_reply_info(&req->r_reply_info);
 	if (req->r_request)
 		ceph_msg_put(req->r_request);
@@ -2484,6 +2484,50 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
 	}
 }
 
+void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc)
+{
+	struct ceph_client *cl = mdsc->fsc->client;
+	if (mdsc->stopping)
+		return;
+
+        if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_unlink_work)) {
+                doutc(cl, "caps unlink work queued\n");
+        } else {
+                doutc(cl, "failed to queue caps unlink work\n");
+        }
+}
+
+static void ceph_cap_unlink_work(struct work_struct *work)
+{
+	struct ceph_mds_client *mdsc =
+		container_of(work, struct ceph_mds_client, cap_unlink_work);
+	struct ceph_client *cl = mdsc->fsc->client;
+
+	doutc(cl, "begin\n");
+	spin_lock(&mdsc->cap_unlink_delay_lock);
+	while (!list_empty(&mdsc->cap_unlink_delay_list)) {
+		struct ceph_inode_info *ci;
+		struct inode *inode;
+
+		ci = list_first_entry(&mdsc->cap_unlink_delay_list,
+				      struct ceph_inode_info,
+				      i_cap_delay_list);
+		list_del_init(&ci->i_cap_delay_list);
+
+		inode = igrab(&ci->netfs.inode);
+		if (inode) {
+			spin_unlock(&mdsc->cap_unlink_delay_lock);
+			doutc(cl, "on %p %llx.%llx\n", inode,
+			      ceph_vinop(inode));
+			ceph_check_caps(ci, CHECK_CAPS_FLUSH);
+			iput(inode);
+			spin_lock(&mdsc->cap_unlink_delay_lock);
+		}
+	}
+	spin_unlock(&mdsc->cap_unlink_delay_lock);
+	doutc(cl, "done\n");
+}
+
 /*
  * requests
  */
@@ -4261,7 +4305,7 @@ void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
 	}
 }
 
-void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
+void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req)
 {
 	struct ceph_client *cl = req->r_mdsc->fsc->client;
 	int dcaps;
@@ -4269,8 +4313,7 @@ void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
 	dcaps = xchg(&req->r_dir_caps, 0);
 	if (dcaps) {
 		doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
-		ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent),
-						dcaps);
+		ceph_put_cap_refs_async(ceph_inode(req->r_parent), dcaps);
 	}
 }
 
@@ -4306,7 +4349,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
 		if (req->r_session->s_mds != session->s_mds)
 			continue;
 
-		ceph_mdsc_release_dir_caps_no_check(req);
+		ceph_mdsc_release_dir_caps_async(req);
 
 		__send_request(session, req, true);
 	}
@@ -5360,6 +5403,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	INIT_LIST_HEAD(&mdsc->cap_delay_list);
 	INIT_LIST_HEAD(&mdsc->cap_wait_list);
 	spin_lock_init(&mdsc->cap_delay_lock);
+	INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list);
+	spin_lock_init(&mdsc->cap_unlink_delay_lock);
 	INIT_LIST_HEAD(&mdsc->snap_flush_list);
 	spin_lock_init(&mdsc->snap_flush_lock);
 	mdsc->last_cap_flush_tid = 1;
@@ -5368,6 +5413,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	spin_lock_init(&mdsc->cap_dirty_lock);
 	init_waitqueue_head(&mdsc->cap_flushing_wq);
 	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
+	INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work);
 	err = ceph_metric_init(&mdsc->metric);
 	if (err)
 		goto err_mdsmap;
@@ -5641,6 +5687,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	ceph_cleanup_global_and_empty_realms(mdsc);
 
 	cancel_work_sync(&mdsc->cap_reclaim_work);
+	cancel_work_sync(&mdsc->cap_unlink_work);
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
 
 	doutc(cl, "done\n");
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 2e6ddaa13d72..03f8ff00874f 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -462,6 +462,8 @@ struct ceph_mds_client {
 	unsigned long    last_renew_caps;  /* last time we renewed our caps */
 	struct list_head cap_delay_list;   /* caps with delayed release */
 	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
+	struct list_head cap_unlink_delay_list;  /* caps with delayed release for unlink */
+	spinlock_t       cap_unlink_delay_lock;  /* protects cap_unlink_delay_list */
 	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
 	spinlock_t       snap_flush_lock;
 
@@ -475,6 +477,8 @@ struct ceph_mds_client {
 	struct work_struct cap_reclaim_work;
 	atomic_t	   cap_reclaim_pending;
 
+	struct work_struct cap_unlink_work;
+
 	/*
 	 * Cap reservations
 	 *
@@ -552,7 +556,7 @@ extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 				struct inode *dir,
 				struct ceph_mds_request *req);
 extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req);
-extern void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req);
+extern void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req);
 static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
 {
 	kref_get(&req->r_kref);
@@ -574,6 +578,7 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
 				    struct ceph_mds_session *session);
 extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
 extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
+extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc);
 extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
 				     int (*cb)(struct inode *, int mds, void *),
 				     void *arg);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index fae97c25ce58..8109aba66e02 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -380,10 +380,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
 		ceph_decode_skip_8(p, end, bad_ext);
 		/* required_client_features */
 		ceph_decode_skip_set(p, end, 64, bad_ext);
+		/* bal_rank_mask */
+		ceph_decode_skip_string(p, end, bad_ext);
+	}
+	if (mdsmap_ev >= 18) {
 		ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext);
-	} else {
-		/* This forces the usage of the (sync) SETXATTR Op */
-		m->m_max_xattr_size = 0;
 	}
 bad_ext:
 	doutc(cl, "m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
index 89f1931f1ba6..1f2171dd01bf 100644
--- a/fs/ceph/mdsmap.h
+++ b/fs/ceph/mdsmap.h
@@ -27,7 +27,11 @@ struct ceph_mdsmap {
 	u32 m_session_timeout;          /* seconds */
 	u32 m_session_autoclose;        /* seconds */
 	u64 m_max_file_size;
-	u64 m_max_xattr_size;		/* maximum size for xattrs blob */
+	/*
+	 * maximum size for xattrs blob.
+	 * Zeroed by default to force the usage of the (sync) SETXATTR Op.
+	 */
+	u64 m_max_xattr_size;
 	u32 m_max_mds;			/* expected up:active mds number */
 	u32 m_num_active_mds;		/* actual up:active mds number */
 	u32 possible_max_rank;		/* possible max rank index */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b06e2bc86221..b63b4cd9b5b6 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1255,8 +1255,6 @@ extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
 extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
 extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
 extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had);
-extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci,
-					    int had);
 extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 				       struct ceph_snap_context *snapc);
 extern void __ceph_remove_capsnap(struct inode *inode,
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 0c7c2528791e..a50356c541f6 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -24,6 +24,8 @@
 #include <linux/pid_namespace.h>
 #include <linux/uaccess.h>
 #include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 #include <linux/vmalloc.h>
 
 #include <linux/coda.h>
@@ -87,10 +89,10 @@ void coda_destroy_inodecache(void)
 	kmem_cache_destroy(coda_inode_cachep);
 }
 
-static int coda_remount(struct super_block *sb, int *flags, char *data)
+static int coda_reconfigure(struct fs_context *fc)
 {
-	sync_filesystem(sb);
-	*flags |= SB_NOATIME;
+	sync_filesystem(fc->root->d_sb);
+	fc->sb_flags |= SB_NOATIME;
 	return 0;
 }
 
@@ -102,78 +104,102 @@ static const struct super_operations coda_super_operations =
 	.evict_inode	= coda_evict_inode,
 	.put_super	= coda_put_super,
 	.statfs		= coda_statfs,
-	.remount_fs	= coda_remount,
 };
 
-static int get_device_index(struct coda_mount_data *data)
+struct coda_fs_context {
+	int	idx;
+};
+
+enum {
+	Opt_fd,
+};
+
+static const struct fs_parameter_spec coda_param_specs[] = {
+	fsparam_fd	("fd",	Opt_fd),
+	{}
+};
+
+static int coda_parse_fd(struct fs_context *fc, int fd)
 {
+	struct coda_fs_context *ctx = fc->fs_private;
 	struct fd f;
 	struct inode *inode;
 	int idx;
 
-	if (data == NULL) {
-		pr_warn("%s: Bad mount data\n", __func__);
-		return -1;
-	}
-
-	if (data->version != CODA_MOUNT_VERSION) {
-		pr_warn("%s: Bad mount version\n", __func__);
-		return -1;
-	}
-
-	f = fdget(data->fd);
+	f = fdget(fd);
 	if (!f.file)
-		goto Ebadf;
+		return -EBADF;
 	inode = file_inode(f.file);
 	if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
 		fdput(f);
-		goto Ebadf;
+		return invalf(fc, "coda: Not coda psdev");
 	}
 
 	idx = iminor(inode);
 	fdput(f);
 
-	if (idx < 0 || idx >= MAX_CODADEVS) {
-		pr_warn("%s: Bad minor number\n", __func__);
-		return -1;
+	if (idx < 0 || idx >= MAX_CODADEVS)
+		return invalf(fc, "coda: Bad minor number");
+	ctx->idx = idx;
+	return 0;
+}
+
+static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct fs_parse_result result;
+	int opt;
+
+	opt = fs_parse(fc, coda_param_specs, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_fd:
+		return coda_parse_fd(fc, result.uint_32);
 	}
 
-	return idx;
-Ebadf:
-	pr_warn("%s: Bad file\n", __func__);
-	return -1;
+	return 0;
+}
+
+/*
+ * Parse coda's binary mount data form.  Missing data or a bad version fails;
+ * fd errors are ignored (index stays 0) for backward compatibility.
+ */
+static int coda_parse_monolithic(struct fs_context *fc, void *_data)
+{
+	struct coda_mount_data *data = _data;
+
+	if (!data)
+		return invalf(fc, "coda: Bad mount data");
+
+	if (data->version != CODA_MOUNT_VERSION)
+		return invalf(fc, "coda: Bad mount version");
+
+	coda_parse_fd(fc, data->fd);
+	return 0;
 }
 
-static int coda_fill_super(struct super_block *sb, void *data, int silent)
+static int coda_fill_super(struct super_block *sb, struct fs_context *fc)
 {
+	struct coda_fs_context *ctx = fc->fs_private;
 	struct inode *root = NULL;
 	struct venus_comm *vc;
 	struct CodaFid fid;
 	int error;
-	int idx;
-
-	if (task_active_pid_ns(current) != &init_pid_ns)
-		return -EINVAL;
-
-	idx = get_device_index((struct coda_mount_data *) data);
 
-	/* Ignore errors in data, for backward compatibility */
-	if(idx == -1)
-		idx = 0;
-	
-	pr_info("%s: device index: %i\n", __func__,  idx);
+	infof(fc, "coda: device index: %i\n", ctx->idx);
 
-	vc = &coda_comms[idx];
+	vc = &coda_comms[ctx->idx];
 	mutex_lock(&vc->vc_mutex);
 
 	if (!vc->vc_inuse) {
-		pr_warn("%s: No pseudo device\n", __func__);
+		errorf(fc, "coda: No pseudo device");
 		error = -EINVAL;
 		goto unlock_out;
 	}
 
 	if (vc->vc_sb) {
-		pr_warn("%s: Device already mounted\n", __func__);
+		errorf(fc, "coda: Device already mounted");
 		error = -EBUSY;
 		goto unlock_out;
 	}
@@ -313,18 +339,45 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0; 
 }
 
-/* init_coda: used by filesystems.c to register coda */
+static int coda_get_tree(struct fs_context *fc)
+{
+	if (task_active_pid_ns(current) != &init_pid_ns)
+		return -EINVAL;
 
-static struct dentry *coda_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+	return get_tree_nodev(fc, coda_fill_super);
+}
+
+static void coda_free_fc(struct fs_context *fc)
 {
-	return mount_nodev(fs_type, flags, data, coda_fill_super);
+	kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations coda_context_ops = {
+	.free		= coda_free_fc,
+	.parse_param	= coda_parse_param,
+	.parse_monolithic = coda_parse_monolithic,
+	.get_tree	= coda_get_tree,
+	.reconfigure	= coda_reconfigure,
+};
+
+static int coda_init_fs_context(struct fs_context *fc)
+{
+	struct coda_fs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct coda_fs_context), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	fc->fs_private = ctx;
+	fc->ops = &coda_context_ops;
+	return 0;
 }
 
 struct file_system_type coda_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "coda",
-	.mount		= coda_mount,
+	.init_fs_context = coda_init_fs_context,
+	.parameters	= coda_param_specs,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_BINARY_MOUNTDATA,
 };
diff --git a/fs/coredump.c b/fs/coredump.c
index f258c17c1841..be6403b4b14b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -872,6 +872,9 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
 	loff_t pos;
 	ssize_t n;
 
+	if (!page)
+		return 0;
+
 	if (cprm->to_skip) {
 		if (!__dump_skip(cprm, cprm->to_skip))
 			return 0;
@@ -884,7 +887,6 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
 	pos = file->f_pos;
 	bvec_set_page(&bvec, page, PAGE_SIZE, 0);
 	iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
-	iov_iter_set_copy_mc(&iter);
 	n = __kernel_write_iter(cprm->file, &iter, &pos);
 	if (n != PAGE_SIZE)
 		return 0;
@@ -895,10 +897,44 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
 	return 1;
 }
 
+/*
+ * If we might get machine checks from kernel accesses during the
+ * core dump, let's get those errors early rather than during the
+ * IO. This is not performance-critical enough to warrant having
+ * all the machine check logic in the iovec paths.
+ */
+#ifdef copy_mc_to_kernel
+
+#define dump_page_alloc() alloc_page(GFP_KERNEL)
+#define dump_page_free(x) __free_page(x)
+static struct page *dump_page_copy(struct page *src, struct page *dst)
+{
+	void *buf = kmap_local_page(src);
+	size_t left = copy_mc_to_kernel(page_address(dst), buf, PAGE_SIZE);
+	kunmap_local(buf);
+	return left ? NULL : dst;
+}
+
+#else
+
+/* We just want to return non-NULL; it's never used. */
+#define dump_page_alloc() ERR_PTR(-EINVAL)
+#define dump_page_free(x) ((void)(x))
+static inline struct page *dump_page_copy(struct page *src, struct page *dst)
+{
+	return src;
+}
+#endif
+
 int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		    unsigned long len)
 {
 	unsigned long addr;
+	struct page *dump_page;
+
+	dump_page = dump_page_alloc();
+	if (!dump_page)
+		return 0;
 
 	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
 		struct page *page;
@@ -912,14 +948,17 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		 */
 		page = get_dump_page(addr);
 		if (page) {
-			int stop = !dump_emit_page(cprm, page);
+			int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page));
 			put_page(page);
-			if (stop)
+			if (stop) {
+				dump_page_free(dump_page);
 				return 0;
+			}
 		} else {
 			dump_skip(cprm, PAGE_SIZE);
 		}
 	}
+	dump_page_free(dump_page);
 	return 1;
 }
 #endif
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 7b3fc189593a..0ad52fbe51c9 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -74,13 +74,7 @@ struct fscrypt_nokey_name {
 
 static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
 {
-	if (str->len == 1 && str->name[0] == '.')
-		return true;
-
-	if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
-		return true;
-
-	return false;
+	return is_dot_dotdot(str->name, str->len);
 }
 
 /**
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 52504dd478d3..104771c3d3f6 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -102,11 +102,8 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
 	if (err && err != -ENOENT)
 		return err;
 
-	if (fname->is_nokey_name) {
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags |= DCACHE_NOKEY_NAME;
-		spin_unlock(&dentry->d_lock);
-	}
+	fscrypt_prepare_dentry(dentry, fname->is_nokey_name);
+
 	return err;
 }
 EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
@@ -131,12 +128,10 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
 int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry)
 {
 	int err = fscrypt_get_encryption_info(dir, true);
+	bool is_nokey_name = (!err && !fscrypt_has_encryption_key(dir));
+
+	fscrypt_prepare_dentry(dentry, is_nokey_name);
 
-	if (!err && !fscrypt_has_encryption_key(dir)) {
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags |= DCACHE_NOKEY_NAME;
-		spin_unlock(&dentry->d_lock);
-	}
 	return err;
 }
 EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial);
diff --git a/fs/dcache.c b/fs/dcache.c
index b813528fb147..71a8e943a0fa 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3061,7 +3061,10 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
 		if (d_unhashed(dentry) || !dentry->d_inode)
 			return D_WALK_SKIP;
 
-		dentry->d_lockref.count--;
+		if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
+			dentry->d_flags |= DCACHE_GENOCIDE;
+			dentry->d_lockref.count--;
+		}
 	}
 	return D_WALK_CONTINUE;
 }
@@ -3136,7 +3139,7 @@ static void __init dcache_init(void)
 	 * of the dcache.
 	 */
 	dentry_cache = KMEM_CACHE_USERCOPY(dentry,
-		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
 		d_iname);
 
 	/* Hash may have been set up in dcache_init_early */
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 60456263a338..62c97ff9e852 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -410,6 +410,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 		bio->bi_end_io = dio_bio_end_io;
 	if (dio->is_pinned)
 		bio_set_flag(bio, BIO_PAGE_PINNED);
+	bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint;
+
 	sdio->bio = bio;
 	sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
 }
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 03bd55069d86..2fe0f3af1a08 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1949,16 +1949,6 @@ out:
 	return rc;
 }
 
-static bool is_dot_dotdot(const char *name, size_t name_size)
-{
-	if (name_size == 1 && name[0] == '.')
-		return true;
-	else if (name_size == 2 && name[0] == '.' && name[1] == '.')
-		return true;
-
-	return false;
-}
-
 /**
  * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
  * @plaintext_name: The plaintext name
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index 169252e6dc46..f7206158ee81 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -38,7 +38,7 @@ struct efivar_entry {
 
 int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
 			    struct list_head *),
-		void *data, bool duplicates, struct list_head *head);
+		void *data, struct list_head *head);
 
 int efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
 void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 6038dd39367a..bb14462f6d99 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -343,12 +343,7 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	if (err)
 		return err;
 
-	err = efivar_init(efivarfs_callback, (void *)sb, true,
-			  &sfi->efivarfs_list);
-	if (err)
-		efivar_entry_iter(efivarfs_destroy, &sfi->efivarfs_list, NULL);
-
-	return err;
+	return efivar_init(efivarfs_callback, sb, &sfi->efivarfs_list);
 }
 
 static int efivarfs_get_tree(struct fs_context *fc)
diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c
index 114ff0fd4e55..4d722af1014f 100644
--- a/fs/efivarfs/vars.c
+++ b/fs/efivarfs/vars.c
@@ -361,7 +361,6 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid,
  * efivar_init - build the initial list of EFI variables
  * @func: callback function to invoke for every variable
  * @data: function-specific data to pass to @func
- * @duplicates: error if we encounter duplicates on @head?
  * @head: initialised head of variable list
  *
  * Get every EFI variable from the firmware and invoke @func. @func
@@ -371,9 +370,9 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid,
  */
 int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
 			    struct list_head *),
-		void *data, bool duplicates, struct list_head *head)
+		void *data, struct list_head *head)
 {
-	unsigned long variable_name_size = 1024;
+	unsigned long variable_name_size = 512;
 	efi_char16_t *variable_name;
 	efi_status_t status;
 	efi_guid_t vendor_guid;
@@ -390,12 +389,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
 		goto free;
 
 	/*
-	 * Per EFI spec, the maximum storage allocated for both
-	 * the variable name and variable data is 1024 bytes.
+	 * A small set of old UEFI implementations reject sizes
+	 * above a certain threshold; the lowest seen in the wild
+	 * is 512.
 	 */
 
 	do {
-		variable_name_size = 1024;
+		variable_name_size = 512;
 
 		status = efivar_get_next_variable(&variable_name_size,
 						  variable_name,
@@ -413,8 +413,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
 			 * we'll ever see a different variable name,
 			 * and may end up looping here forever.
 			 */
-			if (duplicates &&
-			    variable_is_present(variable_name, &vendor_guid,
+			if (variable_is_present(variable_name, &vendor_guid,
 						head)) {
 				dup_variable_bug(variable_name, &vendor_guid,
 						 variable_name_size);
@@ -432,9 +431,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
 			break;
 		case EFI_NOT_FOUND:
 			break;
+		case EFI_BUFFER_TOO_SMALL:
+			pr_warn("efivars: Variable name size exceeds maximum (%lu > 512)\n",
+				variable_name_size);
+			status = EFI_NOT_FOUND;
+			break;
 		default:
-			printk(KERN_WARNING "efivars: get_next_variable: status=%lx\n",
-				status);
+			pr_warn("efivars: get_next_variable: status=%lx\n", status);
 			status = EFI_NOT_FOUND;
 			break;
 		}
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f17fdac76b2e..e4421c10caeb 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -14,19 +14,14 @@
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/blkdev.h>
-
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 #include "efs.h"
 #include <linux/efs_vh.h>
 #include <linux/efs_fs_sb.h>
 
 static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
-static int efs_fill_super(struct super_block *s, void *d, int silent);
-
-static struct dentry *efs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
-{
-	return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
-}
+static int efs_init_fs_context(struct fs_context *fc);
 
 static void efs_kill_sb(struct super_block *s)
 {
@@ -35,15 +30,6 @@ static void efs_kill_sb(struct super_block *s)
 	kfree(sbi);
 }
 
-static struct file_system_type efs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "efs",
-	.mount		= efs_mount,
-	.kill_sb	= efs_kill_sb,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("efs");
-
 static struct pt_types sgi_pt_types[] = {
 	{0x00,		"SGI vh"},
 	{0x01,		"SGI trkrepl"},
@@ -63,6 +49,27 @@ static struct pt_types sgi_pt_types[] = {
 	{0,		NULL}
 };
 
+enum {
+	Opt_explicit_open,
+};
+
+static const struct fs_parameter_spec efs_param_spec[] = {
+	fsparam_flag    ("explicit-open",       Opt_explicit_open),
+	{}
+};
+
+/*
+ * File system definition and registration.
+ */
+static struct file_system_type efs_fs_type = {
+	.owner			= THIS_MODULE,
+	.name			= "efs",
+	.kill_sb		= efs_kill_sb,
+	.fs_flags		= FS_REQUIRES_DEV,
+	.init_fs_context	= efs_init_fs_context,
+	.parameters		= efs_param_spec,
+};
+MODULE_ALIAS_FS("efs");
 
 static struct kmem_cache * efs_inode_cachep;
 
@@ -91,8 +98,8 @@ static int __init init_inodecache(void)
 {
 	efs_inode_cachep = kmem_cache_create("efs_inode_cache",
 				sizeof(struct efs_inode_info), 0,
-				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
-				SLAB_ACCOUNT, init_once);
+				SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+				init_once);
 	if (efs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
@@ -108,18 +115,10 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(efs_inode_cachep);
 }
 
-static int efs_remount(struct super_block *sb, int *flags, char *data)
-{
-	sync_filesystem(sb);
-	*flags |= SB_RDONLY;
-	return 0;
-}
-
 static const struct super_operations efs_superblock_operations = {
 	.alloc_inode	= efs_alloc_inode,
 	.free_inode	= efs_free_inode,
 	.statfs		= efs_statfs,
-	.remount_fs	= efs_remount,
 };
 
 static const struct export_operations efs_export_ops = {
@@ -249,26 +248,26 @@ static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) {
 	return 0;    
 }
 
-static int efs_fill_super(struct super_block *s, void *d, int silent)
+static int efs_fill_super(struct super_block *s, struct fs_context *fc)
 {
 	struct efs_sb_info *sb;
 	struct buffer_head *bh;
 	struct inode *root;
 
- 	sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
+	sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
 	if (!sb)
 		return -ENOMEM;
 	s->s_fs_info = sb;
 	s->s_time_min = 0;
 	s->s_time_max = U32_MAX;
- 
+
 	s->s_magic		= EFS_SUPER_MAGIC;
 	if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
 		pr_err("device does not support %d byte blocks\n",
 			EFS_BLOCKSIZE);
 		return -EINVAL;
 	}
-  
+
 	/* read the vh (volume header) block */
 	bh = sb_bread(s, 0);
 
@@ -294,7 +293,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
 		pr_err("cannot read superblock\n");
 		return -EIO;
 	}
-		
+
 	if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) {
 #ifdef DEBUG
 		pr_warn("invalid superblock at block %u\n",
@@ -328,6 +327,61 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
 	return 0;
 }
 
+static void efs_free_fc(struct fs_context *fc)
+{
+	kfree(fc->fs_private);
+}
+
+static int efs_get_tree(struct fs_context *fc)
+{
+	return get_tree_bdev(fc, efs_fill_super);
+}
+
+static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct fs_parse_result result;
+	int opt = fs_parse(fc, efs_param_spec, param, &result);
+
+	/*
+	 * Matched options need no handling here; only errors are returned.
+	 */
+	return opt < 0 ? opt : 0;
+}
+
+static int efs_reconfigure(struct fs_context *fc)
+{
+	sync_filesystem(fc->root->d_sb);
+	fc->sb_flags |= SB_RDONLY;	/* remount must never drop read-only */
+	return 0;
+}
+
+struct efs_context {
+	unsigned long s_mount_opts;
+};
+
+static const struct fs_context_operations efs_context_opts = {
+	.parse_param	= efs_parse_param,
+	.get_tree	= efs_get_tree,
+	.reconfigure	= efs_reconfigure,
+	.free		= efs_free_fc,
+};
+
+/*
+ * Set up the filesystem mount context: allocate a zeroed efs_context,
+ * store it in fc->fs_private and install efs_context_opts.  Returns 0
+ * on success or -ENOMEM if the allocation fails.
+ */
+static int efs_init_fs_context(struct fs_context *fc)
+{
+	struct efs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+
+	if (!ctx)
+		return -ENOMEM;
+	fc->fs_private = ctx;
+	fc->ops = &efs_context_opts;
+	return 0;
+}
+
 static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
 	struct super_block *sb = dentry->d_sb;
 	struct efs_sb_info *sbi = SUPER_INFO(sb);
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 279933e007d2..7cc5841577b2 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -11,13 +11,12 @@
 struct z_erofs_decompress_req {
 	struct super_block *sb;
 	struct page **in, **out;
-
 	unsigned short pageofs_in, pageofs_out;
 	unsigned int inputsize, outputsize;
 
-	/* indicate the algorithm will be used for decompression */
-	unsigned int alg;
+	unsigned int alg;       /* the algorithm for decompression */
 	bool inplace_io, partial_decoding, fillgaps;
+	gfp_t gfp;      /* allocation flags for extra temporary buffers */
 };
 
 struct z_erofs_decompressor {
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index c98aeda8abb2..3d9721b3faa8 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -447,5 +447,6 @@ const struct file_operations erofs_file_fops = {
 	.llseek		= generic_file_llseek,
 	.read_iter	= erofs_file_read_iter,
 	.mmap		= erofs_file_mmap,
+	.get_unmapped_area = thp_get_unmapped_area,
 	.splice_read	= filemap_splice_read,
 };
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 072ef6a66823..2ec9b2bb628d 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -111,8 +111,9 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
 			victim = availables[--top];
 			get_page(victim);
 		} else {
-			victim = erofs_allocpage(pagepool,
-						 GFP_KERNEL | __GFP_NOFAIL);
+			victim = erofs_allocpage(pagepool, rq->gfp);
+			if (!victim)
+				return -ENOMEM;
 			set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
 		}
 		rq->out[i] = victim;
@@ -322,7 +323,8 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
 	unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt;
 	u8 *kin;
 
-	DBG_BUGON(rq->outputsize > rq->inputsize);
+	if (rq->outputsize > rq->inputsize)
+		return -EOPNOTSUPP;
 	if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) {
 		cur = bs - (rq->pageofs_out & (bs - 1));
 		pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK;
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index 4a64a9c91dd3..b98872058abe 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -95,7 +95,7 @@ int z_erofs_load_deflate_config(struct super_block *sb,
 }
 
 int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
-			       struct page **pagepool)
+			       struct page **pgpl)
 {
 	const unsigned int nrpages_out =
 		PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -158,8 +158,12 @@ again:
 			strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs);
 			outsz -= strm->z.avail_out;
 			if (!rq->out[no]) {
-				rq->out[no] = erofs_allocpage(pagepool,
-						GFP_KERNEL | __GFP_NOFAIL);
+				rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
+				if (!rq->out[no]) {
+					kout = NULL;
+					err = -ENOMEM;
+					break;
+				}
 				set_page_private(rq->out[no],
 						 Z_EROFS_SHORTLIVED_PAGE);
 			}
@@ -211,8 +215,11 @@ again:
 
 			DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb),
 							rq->in[j]));
-			tmppage = erofs_allocpage(pagepool,
-						  GFP_KERNEL | __GFP_NOFAIL);
+			tmppage = erofs_allocpage(pgpl, rq->gfp);
+			if (!tmppage) {
+				err = -ENOMEM;
+				goto failed;
+			}
 			set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
 			copy_highpage(tmppage, rq->in[j]);
 			rq->in[j] = tmppage;
@@ -230,7 +237,7 @@ again:
 			break;
 		}
 	}
-
+failed:
 	if (zlib_inflateEnd(&strm->z) != Z_OK && !err)
 		err = -EIO;
 	if (kout)
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 2dd14f99c1dc..6ca357d83cfa 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -148,7 +148,7 @@ again:
 }
 
 int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
-			    struct page **pagepool)
+			    struct page **pgpl)
 {
 	const unsigned int nrpages_out =
 		PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -215,8 +215,11 @@ again:
 						   PAGE_SIZE - pageofs);
 			outlen -= strm->buf.out_size;
 			if (!rq->out[no] && rq->fillgaps) {	/* deduped */
-				rq->out[no] = erofs_allocpage(pagepool,
-						GFP_KERNEL | __GFP_NOFAIL);
+				rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
+				if (!rq->out[no]) {
+					err = -ENOMEM;
+					break;
+				}
 				set_page_private(rq->out[no],
 						 Z_EROFS_SHORTLIVED_PAGE);
 			}
@@ -258,8 +261,11 @@ again:
 
 			DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
 							rq->in[j]));
-			tmppage = erofs_allocpage(pagepool,
-						  GFP_KERNEL | __GFP_NOFAIL);
+			tmppage = erofs_allocpage(pgpl, rq->gfp);
+			if (!tmppage) {
+				err = -ENOMEM;
+				goto failed;
+			}
 			set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
 			copy_highpage(tmppage, rq->in[j]);
 			rq->in[j] = tmppage;
@@ -277,6 +283,7 @@ again:
 			break;
 		}
 	}
+failed:
 	if (no < nrpages_out && strm->buf.out)
 		kunmap(rq->out[no]);
 	if (ni < nrpages_in)
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index bc12030393b2..89a7c2453aae 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -381,11 +381,12 @@ static int erofs_fscache_init_domain(struct super_block *sb)
 		goto out;
 
 	if (!erofs_pseudo_mnt) {
-		erofs_pseudo_mnt = kern_mount(&erofs_fs_type);
-		if (IS_ERR(erofs_pseudo_mnt)) {
-			err = PTR_ERR(erofs_pseudo_mnt);
+		struct vfsmount *mnt = kern_mount(&erofs_fs_type);
+		if (IS_ERR(mnt)) {
+			err = PTR_ERR(mnt);
 			goto out;
 		}
+		erofs_pseudo_mnt = mnt;
 	}
 
 	domain->volume = sbi->volume;
@@ -459,7 +460,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb
 
 	inode->i_size = OFFSET_MAX;
 	inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
-	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+	mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
 	inode->i_blkbits = EROFS_SB(sb)->blkszbits;
 	inode->i_private = ctx;
 
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 3d616dea55dc..36e638e8b53a 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -60,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
 		} else {
 			const unsigned int gotten = sb->s_blocksize - *ofs;
 
-			copied = kmalloc(vi->inode_isize, GFP_NOFS);
+			copied = kmalloc(vi->inode_isize, GFP_KERNEL);
 			if (!copied) {
 				err = -ENOMEM;
 				goto err_out;
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index d4f631d39f0f..f0110a78acb2 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -130,24 +130,24 @@ static void *erofs_find_target_block(struct erofs_buf *target,
 			/* string comparison without already matched prefix */
 			diff = erofs_dirnamecmp(name, &dname, &matched);
 
-			if (!diff) {
-				*_ndirents = 0;
-				goto out;
-			} else if (diff > 0) {
-				head = mid + 1;
-				startprfx = matched;
-
-				if (!IS_ERR(candidate))
-					erofs_put_metabuf(target);
-				*target = buf;
-				candidate = de;
-				*_ndirents = ndirents;
-			} else {
+			if (diff < 0) {
 				erofs_put_metabuf(&buf);
-
 				back = mid - 1;
 				endprfx = matched;
+				continue;
+			}
+
+			if (!IS_ERR(candidate))
+				erofs_put_metabuf(target);
+			*target = buf;
+			if (!diff) {
+				*_ndirents = 0;
+				return de;
 			}
+			head = mid + 1;
+			startprfx = matched;
+			candidate = de;
+			*_ndirents = ndirents;
 			continue;
 		}
 out:		/* free if the candidate is valid */
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 5dea308764b4..e146d09151af 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -81,7 +81,7 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
 repeat:
 	xa_lock(&sbi->managed_pslots);
 	pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
-			   NULL, grp, GFP_NOFS);
+			   NULL, grp, GFP_KERNEL);
 	if (pre) {
 		if (xa_is_err(pre)) {
 			pre = ERR_PTR(xa_err(pre));
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 692c0c39be63..ff0aa72b0db3 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -82,6 +82,9 @@ struct z_erofs_pcluster {
 	/* L: indicate several pageofs_outs or not */
 	bool multibases;
 
+	/* L: whether extra buffer allocations are best-effort */
+	bool besteffort;
+
 	/* A: compressed bvecs (can be cached or inplaced pages) */
 	struct z_erofs_bvec compressed_bvecs[];
 };
@@ -230,7 +233,7 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
 		struct page *nextpage = *candidate_bvpage;
 
 		if (!nextpage) {
-			nextpage = erofs_allocpage(pagepool, GFP_NOFS);
+			nextpage = erofs_allocpage(pagepool, GFP_KERNEL);
 			if (!nextpage)
 				return -ENOMEM;
 			set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
@@ -302,7 +305,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
 		if (nrpages > pcs->maxpages)
 			continue;
 
-		pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
+		pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
 		if (!pcl)
 			return ERR_PTR(-ENOMEM);
 		pcl->pclustersize = size;
@@ -563,21 +566,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
 			__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
 	unsigned int i;
 
-	if (i_blocksize(fe->inode) != PAGE_SIZE)
-		return;
-	if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
+	if (i_blocksize(fe->inode) != PAGE_SIZE ||
+	    fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
 		return;
 
 	for (i = 0; i < pclusterpages; ++i) {
 		struct page *page, *newpage;
 		void *t;	/* mark pages just found for debugging */
 
-		/* the compressed page was loaded before */
+		/* Inaccurate check w/o locking to avoid unneeded lookups */
 		if (READ_ONCE(pcl->compressed_bvecs[i].page))
 			continue;
 
 		page = find_get_page(mc, pcl->obj.index + i);
-
 		if (page) {
 			t = (void *)((unsigned long)page | 1);
 			newpage = NULL;
@@ -597,9 +598,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
 			set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
 			t = (void *)((unsigned long)newpage | 1);
 		}
-
-		if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
+		spin_lock(&pcl->obj.lockref.lock);
+		if (!pcl->compressed_bvecs[i].page) {
+			pcl->compressed_bvecs[i].page = t;
+			spin_unlock(&pcl->obj.lockref.lock);
 			continue;
+		}
+		spin_unlock(&pcl->obj.lockref.lock);
 
 		if (page)
 			put_page(page);
@@ -694,7 +699,7 @@ static void z_erofs_cache_invalidate_folio(struct folio *folio,
 	DBG_BUGON(stop > folio_size(folio) || stop < length);
 
 	if (offset == 0 && stop == folio_size(folio))
-		while (!z_erofs_cache_release_folio(folio, GFP_NOFS))
+		while (!z_erofs_cache_release_folio(folio, 0))
 			cond_resched();
 }
 
@@ -713,36 +718,30 @@ int erofs_init_managed_cache(struct super_block *sb)
 	set_nlink(inode, 1);
 	inode->i_size = OFFSET_MAX;
 	inode->i_mapping->a_ops = &z_erofs_cache_aops;
-	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+	mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
 	EROFS_SB(sb)->managed_cache = inode;
 	return 0;
 }
 
-static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
-				   struct z_erofs_bvec *bvec)
-{
-	struct z_erofs_pcluster *const pcl = fe->pcl;
-
-	while (fe->icur > 0) {
-		if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
-			     NULL, bvec->page)) {
-			pcl->compressed_bvecs[fe->icur] = *bvec;
-			return true;
-		}
-	}
-	return false;
-}
-
 /* callers must be with pcluster lock held */
 static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
 			       struct z_erofs_bvec *bvec, bool exclusive)
 {
+	struct z_erofs_pcluster *pcl = fe->pcl;
 	int ret;
 
 	if (exclusive) {
 		/* give priority for inplaceio to use file pages first */
-		if (z_erofs_try_inplace_io(fe, bvec))
+		spin_lock(&pcl->obj.lockref.lock);
+		while (fe->icur > 0) {
+			if (pcl->compressed_bvecs[--fe->icur].page)
+				continue;
+			pcl->compressed_bvecs[fe->icur] = *bvec;
+			spin_unlock(&pcl->obj.lockref.lock);
 			return 0;
+		}
+		spin_unlock(&pcl->obj.lockref.lock);
+
 		/* otherwise, check if it can be used as a bvpage */
 		if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
 		    !fe->candidate_bvpage)
@@ -964,7 +963,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
 }
 
 static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
-				struct page *page)
+				struct page *page, bool ra)
 {
 	struct inode *const inode = fe->inode;
 	struct erofs_map_blocks *const map = &fe->map;
@@ -1014,6 +1013,7 @@ repeat:
 		err = z_erofs_pcluster_begin(fe);
 		if (err)
 			goto out;
+		fe->pcl->besteffort |= !ra;
 	}
 
 	/*
@@ -1280,6 +1280,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
 					.inplace_io = overlapped,
 					.partial_decoding = pcl->partial,
 					.fillgaps = pcl->multibases,
+					.gfp = pcl->besteffort ?
+						GFP_KERNEL | __GFP_NOFAIL :
+						GFP_NOWAIT | __GFP_NORETRY
 				 }, be->pagepool);
 
 	/* must handle all compressed pages before actual file pages */
@@ -1322,6 +1325,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
 	pcl->length = 0;
 	pcl->partial = true;
 	pcl->multibases = false;
+	pcl->besteffort = false;
 	pcl->bvset.nextpage = NULL;
 	pcl->vcnt = 0;
 
@@ -1423,23 +1427,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
 {
 	gfp_t gfp = mapping_gfp_mask(mc);
 	bool tocache = false;
-	struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
+	struct z_erofs_bvec zbv;
 	struct address_space *mapping;
-	struct page *page, *oldpage;
+	struct page *page;
 	int justfound, bs = i_blocksize(f->inode);
 
 	/* Except for inplace pages, the entire page can be used for I/Os */
 	bvec->bv_offset = 0;
 	bvec->bv_len = PAGE_SIZE;
 repeat:
-	oldpage = READ_ONCE(zbv->page);
-	if (!oldpage)
+	spin_lock(&pcl->obj.lockref.lock);
+	zbv = pcl->compressed_bvecs[nr];
+	page = zbv.page;
+	justfound = (unsigned long)page & 1UL;
+	page = (struct page *)((unsigned long)page & ~1UL);
+	pcl->compressed_bvecs[nr].page = page;
+	spin_unlock(&pcl->obj.lockref.lock);
+	if (!page)
 		goto out_allocpage;
 
-	justfound = (unsigned long)oldpage & 1UL;
-	page = (struct page *)((unsigned long)oldpage & ~1UL);
 	bvec->bv_page = page;
-
 	DBG_BUGON(z_erofs_is_shortlived_page(page));
 	/*
 	 * Handle preallocated cached pages.  We tried to allocate such pages
@@ -1448,7 +1455,6 @@ repeat:
 	 */
 	if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
 		set_page_private(page, 0);
-		WRITE_ONCE(zbv->page, page);
 		tocache = true;
 		goto out_tocache;
 	}
@@ -1459,9 +1465,9 @@ repeat:
 	 * therefore it is impossible for `mapping` to be NULL.
 	 */
 	if (mapping && mapping != mc) {
-		if (zbv->offset < 0)
-			bvec->bv_offset = round_up(-zbv->offset, bs);
-		bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
+		if (zbv.offset < 0)
+			bvec->bv_offset = round_up(-zbv.offset, bs);
+		bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
 		return;
 	}
 
@@ -1471,7 +1477,6 @@ repeat:
 
 	/* the cached page is still in managed cache */
 	if (page->mapping == mc) {
-		WRITE_ONCE(zbv->page, page);
 		/*
 		 * The cached page is still available but without a valid
 		 * `->private` pcluster hint.  Let's reconnect them.
@@ -1503,11 +1508,15 @@ repeat:
 	put_page(page);
 out_allocpage:
 	page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
-	if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
+	spin_lock(&pcl->obj.lockref.lock);
+	if (pcl->compressed_bvecs[nr].page) {
 		erofs_pagepool_add(&f->pagepool, page);
+		spin_unlock(&pcl->obj.lockref.lock);
 		cond_resched();
 		goto repeat;
 	}
+	pcl->compressed_bvecs[nr].page = page;
+	spin_unlock(&pcl->obj.lockref.lock);
 	bvec->bv_page = page;
 out_tocache:
 	if (!tocache || bs != PAGE_SIZE ||
@@ -1685,6 +1694,7 @@ submit_bio_retry:
 
 			if (cur + bvec.bv_len > end)
 				bvec.bv_len = end - cur;
+			DBG_BUGON(bvec.bv_len < sb->s_blocksize);
 			if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
 					  bvec.bv_offset))
 				goto submit_bio_retry;
@@ -1785,7 +1795,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
 			if (PageUptodate(page))
 				unlock_page(page);
 			else
-				(void)z_erofs_do_read_page(f, page);
+				(void)z_erofs_do_read_page(f, page, !!rac);
 			put_page(page);
 		}
 
@@ -1806,7 +1816,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
 	f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
 
 	z_erofs_pcluster_readmore(&f, NULL, true);
-	err = z_erofs_do_read_page(&f, &folio->page);
+	err = z_erofs_do_read_page(&f, &folio->page, false);
 	z_erofs_pcluster_readmore(&f, NULL, false);
 	z_erofs_pcluster_end(&f);
 
@@ -1847,7 +1857,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
 		folio = head;
 		head = folio_get_private(folio);
 
-		err = z_erofs_do_read_page(&f, &folio->page);
+		err = z_erofs_do_read_page(&f, &folio->page, true);
 		if (err && err != -EINTR)
 			erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
 				  folio->index, EROFS_I(inode)->nid);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index ad8186d47ba7..9afdb722fa92 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -251,7 +251,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 	ssize_t res;
 	__u64 ucnt;
 
-	if (count < sizeof(ucnt))
+	if (count != sizeof(ucnt))
 		return -EINVAL;
 	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
 		return -EFAULT;
@@ -283,13 +283,18 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
 {
 	struct eventfd_ctx *ctx = f->private_data;
+	__u64 cnt;
 
 	spin_lock_irq(&ctx->wqh.lock);
-	seq_printf(m, "eventfd-count: %16llx\n",
-		   (unsigned long long)ctx->count);
+	cnt = ctx->count;
 	spin_unlock_irq(&ctx->wqh.lock);
-	seq_printf(m, "eventfd-id: %d\n", ctx->id);
-	seq_printf(m, "eventfd-semaphore: %d\n",
+
+	seq_printf(m,
+		   "eventfd-count: %16llx\n"
+		   "eventfd-id: %d\n"
+		   "eventfd-semaphore: %d\n",
+		   cnt,
+		   ctx->id,
 		   !!(ctx->flags & EFD_SEMAPHORE));
 }
 #endif
@@ -383,6 +388,7 @@ static int do_eventfd(unsigned int count, int flags)
 	/* Check the EFD_* constants for consistency.  */
 	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
+	BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0));
 
 	if (flags & ~EFD_FLAGS_SET)
 		return -EINVAL;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3534d36a1474..39ac6fdf8bca 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -206,7 +206,7 @@ struct eventpoll {
 	 */
 	struct epitem *ovflist;
 
-	/* wakeup_source used when ep_scan_ready_list is running */
+	/* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
 	struct wakeup_source *ws;
 
 	/* The user that created the eventpoll descriptor */
@@ -678,12 +678,6 @@ static void ep_done_scan(struct eventpoll *ep,
 	write_unlock_irq(&ep->lock);
 }
 
-static void epi_rcu_free(struct rcu_head *head)
-{
-	struct epitem *epi = container_of(head, struct epitem, rcu);
-	kmem_cache_free(epi_cache, epi);
-}
-
 static void ep_get(struct eventpoll *ep)
 {
 	refcount_inc(&ep->refcount);
@@ -767,7 +761,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
 	 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
 	 * use of the rbn field.
 	 */
-	call_rcu(&epi->rcu, epi_rcu_free);
+	kfree_rcu(epi, rcu);
 
 	percpu_counter_dec(&ep->user->epoll_watches);
 	return ep_refcount_dec_and_test(ep);
@@ -1153,7 +1147,7 @@ static inline bool chain_epi_lockless(struct epitem *epi)
  * This callback takes a read lock in order not to contend with concurrent
  * events from another file descriptor, thus all modifications to ->rdllist
  * or ->ovflist are lockless.  Read lock is paired with the write lock from
- * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * ep_start/done_scan(), which stops all list modifications and guarantees
  * that lists state is seen correctly.
  *
  * Another thing worth to mention is that ep_poll_callback() can be called
@@ -1751,7 +1745,7 @@ static int ep_send_events(struct eventpoll *ep,
 			 * availability. At this point, no one can insert
 			 * into ep->rdllist besides us. The epoll_ctl()
 			 * callers are locked out by
-			 * ep_scan_ready_list() holding "mtx" and the
+			 * ep_send_events() holding "mtx" and the
 			 * poll callback will queue them in ep->ovflist.
 			 */
 			list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1904,7 +1898,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		__set_current_state(TASK_INTERRUPTIBLE);
 
 		/*
-		 * Do the final check under the lock. ep_scan_ready_list()
+		 * Do the final check under the lock. ep_start/done_scan()
 		 * plays with two lists (->rdllist and ->ovflist) and there
 		 * is always a race when both lists are empty for short
 		 * period of time although events are pending, so lock is
diff --git a/fs/exec.c b/fs/exec.c
index 8cdd5b2dd09c..ece3ab0998e1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -128,7 +128,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 	struct filename *tmp = getname(library);
 	int error = PTR_ERR(tmp);
 	static const struct open_flags uselib_flags = {
-		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+		.open_flag = O_LARGEFILE | O_RDONLY,
 		.acc_mode = MAY_READ | MAY_EXEC,
 		.intent = LOOKUP_OPEN,
 		.lookup_flags = LOOKUP_FOLLOW,
@@ -904,6 +904,10 @@ EXPORT_SYMBOL(transfer_args_to_stack);
 
 #endif /* CONFIG_MMU */
 
+/*
+ * On success, caller must call do_close_execat() on the returned
+ * struct file to close it.
+ */
 static struct file *do_open_execat(int fd, struct filename *name, int flags)
 {
 	struct file *file;
@@ -948,6 +952,17 @@ exit:
 	return ERR_PTR(err);
 }
 
+/**
+ * open_exec - Open a path name for execution
+ *
+ * @name: path name to open with the intent of executing it.
+ *
+ * Returns ERR_PTR on failure or allocated struct file on success.
+ *
+ * As this is a wrapper for the internal do_open_execat(), callers
+ * must call allow_write_access() before fput() on release. Also see
+ * do_close_execat().
+ */
 struct file *open_exec(const char *name)
 {
 	struct filename *filename = getname_kernel(name);
@@ -1143,7 +1158,6 @@ static int de_thread(struct task_struct *tsk)
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
 		leader->exit_state = EXIT_DEAD;
-
 		/*
 		 * We are going to release_task()->ptrace_unlink() silently,
 		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
@@ -1409,6 +1423,9 @@ int begin_new_exec(struct linux_binprm * bprm)
 
 out_unlock:
 	up_write(&me->signal->exec_update_lock);
+	if (!bprm->cred)
+		mutex_unlock(&me->signal->cred_guard_mutex);
+
 out:
 	return retval;
 }
@@ -1484,6 +1501,15 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
 	return -ENOMEM;
 }
 
+/* Counterpart of do_open_execat(); a NULL @file is silently ignored. */
+static void do_close_execat(struct file *file)
+{
+	if (file) {
+		allow_write_access(file);
+		fput(file);
+	}
+}
+
 static void free_bprm(struct linux_binprm *bprm)
 {
 	if (bprm->mm) {
@@ -1495,10 +1521,7 @@ static void free_bprm(struct linux_binprm *bprm)
 		mutex_unlock(&current->signal->cred_guard_mutex);
 		abort_creds(bprm->cred);
 	}
-	if (bprm->file) {
-		allow_write_access(bprm->file);
-		fput(bprm->file);
-	}
+	do_close_execat(bprm->file);
 	if (bprm->executable)
 		fput(bprm->executable);
 	/* If a binfmt changed the interp, free it. */
@@ -1520,8 +1543,7 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl
 
 	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
 	if (!bprm) {
-		allow_write_access(file);
-		fput(file);
+		do_close_execat(file);
 		return ERR_PTR(-ENOMEM);
 	}
 
@@ -1610,6 +1632,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	}
 	rcu_read_unlock();
 
+	/* "users" and "in_exec" locked for copy_fs() */
 	if (p->fs->users > n_fs)
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
 	else
@@ -1826,9 +1849,6 @@ static int exec_binprm(struct linux_binprm *bprm)
 	return 0;
 }
 
-/*
- * sys_execve() executes a new program.
- */
 static int bprm_execve(struct linux_binprm *bprm)
 {
 	int retval;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 9474cd50da6d..361595433480 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -275,6 +275,7 @@ struct exfat_sb_info {
 
 	spinlock_t inode_hash_lock;
 	struct hlist_head inode_hashtable[EXFAT_HASH_SIZE];
+	struct rcu_head rcu;
 };
 
 #define EXFAT_CACHE_VALID	0
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index d25a96a148af..cc00f1a7a1e1 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -35,13 +35,18 @@ static int exfat_cont_expand(struct inode *inode, loff_t size)
 	if (new_num_clusters == num_clusters)
 		goto out;
 
-	exfat_chain_set(&clu, ei->start_clu, num_clusters, ei->flags);
-	ret = exfat_find_last_cluster(sb, &clu, &last_clu);
-	if (ret)
-		return ret;
+	if (num_clusters) {
+		exfat_chain_set(&clu, ei->start_clu, num_clusters, ei->flags);
+		ret = exfat_find_last_cluster(sb, &clu, &last_clu);
+		if (ret)
+			return ret;
+
+		clu.dir = last_clu + 1;
+	} else {
+		last_clu = EXFAT_EOF_CLUSTER;
+		clu.dir = EXFAT_EOF_CLUSTER;
+	}
 
-	clu.dir = (last_clu == EXFAT_EOF_CLUSTER) ?
-			EXFAT_EOF_CLUSTER : last_clu + 1;
 	clu.size = 0;
 	clu.flags = ei->flags;
 
@@ -51,17 +56,19 @@ static int exfat_cont_expand(struct inode *inode, loff_t size)
 		return ret;
 
 	/* Append new clusters to chain */
-	if (clu.flags != ei->flags) {
-		exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters);
-		ei->flags = ALLOC_FAT_CHAIN;
-	}
-	if (clu.flags == ALLOC_FAT_CHAIN)
-		if (exfat_ent_set(sb, last_clu, clu.dir))
-			goto free_clu;
-
-	if (num_clusters == 0)
+	if (num_clusters) {
+		if (clu.flags != ei->flags)
+			if (exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters))
+				goto free_clu;
+
+		if (clu.flags == ALLOC_FAT_CHAIN)
+			if (exfat_ent_set(sb, last_clu, clu.dir))
+				goto free_clu;
+	} else
 		ei->start_clu = clu.dir;
 
+	ei->flags = clu.flags;
+
 out:
 	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	/* Expanded range not zeroed, do not update valid_size */
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 522edcbb2ce4..0687f952956c 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -501,7 +501,7 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	struct inode *inode = mapping->host;
 	struct exfat_inode_info *ei = EXFAT_I(inode);
 	loff_t pos = iocb->ki_pos;
-	loff_t size = iocb->ki_pos + iov_iter_count(iter);
+	loff_t size = pos + iov_iter_count(iter);
 	int rw = iov_iter_rw(iter);
 	ssize_t ret;
 
@@ -525,11 +525,10 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	 */
 	ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block);
 	if (ret < 0) {
-		if (rw == WRITE)
+		if (rw == WRITE && ret != -EIOCBQUEUED)
 			exfat_write_failed(mapping, size);
 
-		if (ret != -EIOCBQUEUED)
-			return ret;
+		return ret;
 	} else
 		size = pos + ret;
 
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index 705710f93e2d..afdf13c34ff5 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -655,7 +655,6 @@ static int exfat_load_upcase_table(struct super_block *sb,
 	unsigned int sect_size = sb->s_blocksize;
 	unsigned int i, index = 0;
 	u32 chksum = 0;
-	int ret;
 	unsigned char skip = false;
 	unsigned short *upcase_table;
 
@@ -673,8 +672,7 @@ static int exfat_load_upcase_table(struct super_block *sb,
 		if (!bh) {
 			exfat_err(sb, "failed to read sector(0x%llx)",
 				  (unsigned long long)sector);
-			ret = -EIO;
-			goto free_table;
+			return -EIO;
 		}
 		sector++;
 		for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) {
@@ -701,15 +699,12 @@ static int exfat_load_upcase_table(struct super_block *sb,
 
 	exfat_err(sb, "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)",
 		  index, chksum, utbl_checksum);
-	ret = -EINVAL;
-free_table:
-	exfat_free_upcase_table(sbi);
-	return ret;
+	return -EINVAL;
 }
 
 static int exfat_load_default_upcase_table(struct super_block *sb)
 {
-	int i, ret = -EIO;
+	int i;
 	struct exfat_sb_info *sbi = EXFAT_SB(sb);
 	unsigned char skip = false;
 	unsigned short uni = 0, *upcase_table;
@@ -740,8 +735,7 @@ static int exfat_load_default_upcase_table(struct super_block *sb)
 		return 0;
 
 	/* FATAL error: default upcase table has error */
-	exfat_free_upcase_table(sbi);
-	return ret;
+	return -EIO;
 }
 
 int exfat_create_upcase_table(struct super_block *sb)
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index d9d4fa91010b..fcb658267765 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -39,9 +39,6 @@ static void exfat_put_super(struct super_block *sb)
 	exfat_free_bitmap(sbi);
 	brelse(sbi->boot_bh);
 	mutex_unlock(&sbi->s_lock);
-
-	unload_nls(sbi->nls_io);
-	exfat_free_upcase_table(sbi);
 }
 
 static int exfat_sync_fs(struct super_block *sb, int wait)
@@ -600,7 +597,7 @@ static int __exfat_fill_super(struct super_block *sb)
 	ret = exfat_load_bitmap(sb);
 	if (ret) {
 		exfat_err(sb, "failed to load alloc-bitmap");
-		goto free_upcase_table;
+		goto free_bh;
 	}
 
 	ret = exfat_count_used_clusters(sb, &sbi->used_clusters);
@@ -613,8 +610,6 @@ static int __exfat_fill_super(struct super_block *sb)
 
 free_alloc_bitmap:
 	exfat_free_bitmap(sbi);
-free_upcase_table:
-	exfat_free_upcase_table(sbi);
 free_bh:
 	brelse(sbi->boot_bh);
 	return ret;
@@ -701,12 +696,10 @@ put_inode:
 	sb->s_root = NULL;
 
 free_table:
-	exfat_free_upcase_table(sbi);
 	exfat_free_bitmap(sbi);
 	brelse(sbi->boot_bh);
 
 check_nls_io:
-	unload_nls(sbi->nls_io);
 	return err;
 }
 
@@ -771,13 +764,22 @@ static int exfat_init_fs_context(struct fs_context *fc)
 	return 0;
 }
 
+static void delayed_free(struct rcu_head *p)
+{
+	struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu);
+
+	unload_nls(sbi->nls_io);
+	exfat_free_upcase_table(sbi);
+	exfat_free_sbi(sbi);
+}
+
 static void exfat_kill_sb(struct super_block *sb)
 {
 	struct exfat_sb_info *sbi = sb->s_fs_info;
 
 	kill_block_super(sb);
 	if (sbi)
-		exfat_free_sbi(sbi);
+		call_rcu(&sbi->rcu, delayed_free);
 }
 
 static struct file_system_type exfat_fs_type = {
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 3ae0154c5680..07ea3d62b298 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -255,7 +255,7 @@ static bool filldir_one(struct dir_context *ctx, const char *name, int len,
 		container_of(ctx, struct getdents_callback, ctx);
 
 	buf->sequence++;
-	if (buf->ino == ino && len <= NAME_MAX) {
+	if (buf->ino == ino && len <= NAME_MAX && !is_dot_dotdot(name, len)) {
 		memcpy(buf->name, name, len);
 		buf->name[len] = '\0';
 		buf->found = 1;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a5d784872303..023571f8dd1b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -252,8 +252,10 @@ struct ext4_allocation_request {
 #define EXT4_MAP_MAPPED		BIT(BH_Mapped)
 #define EXT4_MAP_UNWRITTEN	BIT(BH_Unwritten)
 #define EXT4_MAP_BOUNDARY	BIT(BH_Boundary)
+#define EXT4_MAP_DELAYED	BIT(BH_Delay)
 #define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
-				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
+				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+				 EXT4_MAP_DELAYED)
 
 struct ext4_map_blocks {
 	ext4_fsblk_t m_pblk;
@@ -2912,10 +2914,10 @@ extern const struct seq_operations ext4_mb_seq_groups_ops;
 extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
 extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
 extern int ext4_mb_init(struct super_block *);
-extern int ext4_mb_release(struct super_block *);
+extern void ext4_mb_release(struct super_block *);
 extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
 				struct ext4_allocation_request *, int *);
-extern void ext4_discard_preallocations(struct inode *, unsigned int);
+extern void ext4_discard_preallocations(struct inode *);
 extern int __init ext4_init_mballoc(void);
 extern void ext4_exit_mballoc(void);
 extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 01299b55a567..7669d154c05e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
 	 * i_rwsem. So we can safely drop the i_data_sem here.
 	 */
 	BUG_ON(EXT4_JOURNAL(inode) == NULL);
-	ext4_discard_preallocations(inode, 0);
+	ext4_discard_preallocations(inode);
 	up_write(&EXT4_I(inode)->i_data_sem);
 	*dropped = 1;
 	return 0;
@@ -2229,7 +2229,7 @@ static int ext4_fill_es_cache_info(struct inode *inode,
 
 
 /*
- * ext4_ext_determine_hole - determine hole around given block
+ * ext4_ext_find_hole - find hole around given block according to the given path
  * @inode:	inode we lookup in
  * @path:	path in extent tree to @lblk
  * @lblk:	pointer to logical block around which we want to determine hole
@@ -2241,9 +2241,9 @@ static int ext4_fill_es_cache_info(struct inode *inode,
  * The function returns the length of a hole starting at @lblk. We update @lblk
  * to the beginning of the hole if we managed to find it.
  */
-static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
-					   struct ext4_ext_path *path,
-					   ext4_lblk_t *lblk)
+static ext4_lblk_t ext4_ext_find_hole(struct inode *inode,
+				      struct ext4_ext_path *path,
+				      ext4_lblk_t *lblk)
 {
 	int depth = ext_depth(inode);
 	struct ext4_extent *ex;
@@ -2271,30 +2271,6 @@ static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
 }
 
 /*
- * ext4_ext_put_gap_in_cache:
- * calculate boundaries of the gap that the requested block fits into
- * and cache this gap
- */
-static void
-ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
-			  ext4_lblk_t hole_len)
-{
-	struct extent_status es;
-
-	ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
-				  hole_start + hole_len - 1, &es);
-	if (es.es_len) {
-		/* There's delayed extent containing lblock? */
-		if (es.es_lblk <= hole_start)
-			return;
-		hole_len = min(es.es_lblk - hole_start, hole_len);
-	}
-	ext_debug(inode, " -> %u:%u\n", hole_start, hole_len);
-	ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
-			      EXTENT_STATUS_HOLE);
-}
-
-/*
  * ext4_ext_rm_idx:
  * removes index from the index block.
  */
@@ -4062,6 +4038,72 @@ static int get_implied_cluster_alloc(struct super_block *sb,
 	return 0;
 }
 
+/*
+ * Determine the length of the hole around logical block @lblk. First
+ * locate the hole from the given @path, then shrink it if part of it
+ * is covered by delayed extents. If what remains is really a hole,
+ * insert it into the extent status tree. Return the length of the
+ * determined extent (hole or delalloc) measured from @lblk.
+ */
+static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode,
+						  struct ext4_ext_path *path,
+						  ext4_lblk_t lblk)
+{
+	ext4_lblk_t hole_start, len;
+	struct extent_status es;
+
+	hole_start = lblk;
+	len = ext4_ext_find_hole(inode, path, &hole_start);
+again:
+	ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
+				  hole_start + len - 1, &es);
+	if (!es.es_len)
+		goto insert_hole;
+
+	/*
+	 * There's a delalloc extent in the hole. Handle the cases where it
+	 * is in front of, straddles, or is behind the queried block (lblk).
+	 */
+	if (lblk >= es.es_lblk + es.es_len) {
+		/*
+		 * The delalloc extent ends before the queried block, so
+		 * restart the search from the queried start block.
+		 */
+		len -= lblk - hole_start;
+		hole_start = lblk;
+		goto again;
+	} else if (in_range(lblk, es.es_lblk, es.es_len)) {
+		/*
+		 * The delalloc extent contains lblk. It must have been
+		 * added after ext4_map_blocks() checked the extent status
+		 * tree, so we are not holding i_rwsem and the delalloc info
+		 * is only stabilized by i_data_sem, which we are going to
+		 * release soon. Don't modify the extent status tree; report
+		 * the extent as a hole, with the length adjusted to the part
+		 * of the delalloc extent after lblk.
+		 */
+		len = es.es_lblk + es.es_len - lblk;
+		return len;
+	} else {
+		/*
+		 * The delalloc extent starts after the queried block.
+		 * Trim the hole so that it ends no later than where the
+		 * delalloc extent begins.
+		 */
+		len = min(es.es_lblk - hole_start, len);
+	}
+
+insert_hole:
+	/* Cache the gap just found to speed up subsequent requests */
+	ext_debug(inode, " -> %u:%u\n", hole_start, len);
+	ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE);
+
+	/* Update len to reflect the hole size after lblk */
+	if (hole_start != lblk)
+		len -= lblk - hole_start;
+
+	return len;
+}
 
 /*
  * Block allocation/map/preallocation routine for extents based files
@@ -4179,22 +4221,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 * we couldn't try to create block if create flag is zero
 	 */
 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
-		ext4_lblk_t hole_start, hole_len;
+		ext4_lblk_t len;
 
-		hole_start = map->m_lblk;
-		hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
-		/*
-		 * put just found gap into cache to speed up
-		 * subsequent requests
-		 */
-		ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
+		len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk);
 
-		/* Update hole_len to reflect hole size after map->m_lblk */
-		if (hole_start != map->m_lblk)
-			hole_len -= map->m_lblk - hole_start;
 		map->m_pblk = 0;
-		map->m_len = min_t(unsigned int, map->m_len, hole_len);
-
+		map->m_len = min_t(unsigned int, map->m_len, len);
 		goto out;
 	}
 
@@ -4313,7 +4345,7 @@ got_allocated_blocks:
 			 * not a good idea to call discard here directly,
 			 * but otherwise we'd need to call it every free().
 			 */
-			ext4_discard_preallocations(inode, 0);
+			ext4_discard_preallocations(inode);
 			if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 				fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
 			ext4_free_blocks(handle, inode, NULL, newblock,
@@ -5357,7 +5389,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
 	ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
 
 	down_write(&EXT4_I(inode)->i_data_sem);
-	ext4_discard_preallocations(inode, 0);
+	ext4_discard_preallocations(inode);
 	ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);
 
 	ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
@@ -5365,7 +5397,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
 		up_write(&EXT4_I(inode)->i_data_sem);
 		goto out_stop;
 	}
-	ext4_discard_preallocations(inode, 0);
+	ext4_discard_preallocations(inode);
 
 	ret = ext4_ext_shift_extents(inode, handle, punch_stop,
 				     punch_stop - punch_start, SHIFT_LEFT);
@@ -5497,7 +5529,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
 		goto out_stop;
 
 	down_write(&EXT4_I(inode)->i_data_sem);
-	ext4_discard_preallocations(inode, 0);
+	ext4_discard_preallocations(inode);
 
 	path = ext4_find_extent(inode, offset_lblk, NULL, 0);
 	if (IS_ERR(path)) {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6aa15dafc677..54d6ff22585c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -174,7 +174,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
 			(atomic_read(&inode->i_writecount) == 1) &&
 			!EXT4_I(inode)->i_reserved_data_blocks) {
 		down_write(&EXT4_I(inode)->i_data_sem);
-		ext4_discard_preallocations(inode, 0);
+		ext4_discard_preallocations(inode);
 		up_write(&EXT4_I(inode)->i_data_sem);
 	}
 	if (is_dx(inode) && filp->private_data)
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index a9f3716119d3..d8ca7f64f952 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -714,7 +714,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
 	 * i_rwsem. So we can safely drop the i_data_sem here.
 	 */
 	BUG_ON(EXT4_JOURNAL(inode) == NULL);
-	ext4_discard_preallocations(inode, 0);
+	ext4_discard_preallocations(inode);
 	up_write(&EXT4_I(inode)->i_data_sem);
 	*dropped = 1;
 	return 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5af1b0b8680e..2ccf3b5e3a7c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -371,7 +371,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
 	 */
 	if ((ei->i_reserved_data_blocks == 0) &&
 	    !inode_is_open_for_write(inode))
-		ext4_discard_preallocations(inode, 0);
+		ext4_discard_preallocations(inode);
 }
 
 static int __check_block_validity(struct inode *inode, const char *func,
@@ -515,6 +515,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			map->m_len = retval;
 		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
 			map->m_pblk = 0;
+			map->m_flags |= ext4_es_is_delayed(&es) ?
+					EXT4_MAP_DELAYED : 0;
 			retval = es.es_len - (map->m_lblk - es.es_lblk);
 			if (retval > map->m_len)
 				retval = map->m_len;
@@ -1703,11 +1705,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
-		if (ext4_es_is_hole(&es)) {
-			retval = 0;
-			down_read(&EXT4_I(inode)->i_data_sem);
+		if (ext4_es_is_hole(&es))
 			goto add_delayed;
-		}
 
 		/*
 		 * Delayed extent could be allocated by fallocate.
@@ -1749,26 +1748,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
 	else
 		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
-
-add_delayed:
-	if (retval == 0) {
-		int ret;
-
-		/*
-		 * XXX: __block_prepare_write() unmaps passed block,
-		 * is it OK?
-		 */
-
-		ret = ext4_insert_delayed_block(inode, map->m_lblk);
-		if (ret != 0) {
-			retval = ret;
-			goto out_unlock;
-		}
-
-		map_bh(bh, inode->i_sb, invalid_block);
-		set_buffer_new(bh);
-		set_buffer_delay(bh);
-	} else if (retval > 0) {
+	if (retval < 0) {
+		up_read(&EXT4_I(inode)->i_data_sem);
+		return retval;
+	}
+	if (retval > 0) {
 		unsigned int status;
 
 		if (unlikely(retval != map->m_len)) {
@@ -1783,11 +1767,21 @@ add_delayed:
 				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
 		ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
 				      map->m_pblk, status);
+		up_read(&EXT4_I(inode)->i_data_sem);
+		return retval;
 	}
+	up_read(&EXT4_I(inode)->i_data_sem);
 
-out_unlock:
-	up_read((&EXT4_I(inode)->i_data_sem));
+add_delayed:
+	down_write(&EXT4_I(inode)->i_data_sem);
+	retval = ext4_insert_delayed_block(inode, map->m_lblk);
+	up_write(&EXT4_I(inode)->i_data_sem);
+	if (retval)
+		return retval;
 
+	map_bh(bh, inode->i_sb, invalid_block);
+	set_buffer_new(bh);
+	set_buffer_delay(bh);
 	return retval;
 }
 
@@ -3268,6 +3262,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
 		iomap->addr = (u64) map->m_pblk << blkbits;
 		if (flags & IOMAP_DAX)
 			iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
+	} else if (map->m_flags & EXT4_MAP_DELAYED) {
+		iomap->type = IOMAP_DELALLOC;
+		iomap->addr = IOMAP_NULL_ADDR;
 	} else {
 		iomap->type = IOMAP_HOLE;
 		iomap->addr = IOMAP_NULL_ADDR;
@@ -3430,35 +3427,11 @@ const struct iomap_ops ext4_iomap_overwrite_ops = {
 	.iomap_end		= ext4_iomap_end,
 };
 
-static bool ext4_iomap_is_delalloc(struct inode *inode,
-				   struct ext4_map_blocks *map)
-{
-	struct extent_status es;
-	ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
-
-	ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
-				  map->m_lblk, end, &es);
-
-	if (!es.es_len || es.es_lblk > end)
-		return false;
-
-	if (es.es_lblk > map->m_lblk) {
-		map->m_len = es.es_lblk - map->m_lblk;
-		return false;
-	}
-
-	offset = map->m_lblk - es.es_lblk;
-	map->m_len = es.es_len - offset;
-
-	return true;
-}
-
 static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
 				   loff_t length, unsigned int flags,
 				   struct iomap *iomap, struct iomap *srcmap)
 {
 	int ret;
-	bool delalloc = false;
 	struct ext4_map_blocks map;
 	u8 blkbits = inode->i_blkbits;
 
@@ -3499,13 +3472,8 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
 	ret = ext4_map_blocks(NULL, inode, &map, 0);
 	if (ret < 0)
 		return ret;
-	if (ret == 0)
-		delalloc = ext4_iomap_is_delalloc(inode, &map);
-
 set_iomap:
 	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
-	if (delalloc && iomap->type == IOMAP_HOLE)
-		iomap->type = IOMAP_DELALLOC;
 
 	return 0;
 }
@@ -4015,12 +3983,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 
 	/* If there are blocks to remove, do it */
 	if (stop_block > first_block) {
+		ext4_lblk_t hole_len = stop_block - first_block;
 
 		down_write(&EXT4_I(inode)->i_data_sem);
-		ext4_discard_preallocations(inode, 0);
+		ext4_discard_preallocations(inode);
 
-		ext4_es_remove_extent(inode, first_block,
-				      stop_block - first_block);
+		ext4_es_remove_extent(inode, first_block, hole_len);
 
 		if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 			ret = ext4_ext_remove_space(inode, first_block,
@@ -4029,6 +3997,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 			ret = ext4_ind_remove_space(handle, inode, first_block,
 						    stop_block);
 
+		ext4_es_insert_extent(inode, first_block, hole_len, ~0,
+				      EXTENT_STATUS_HOLE);
 		up_write(&EXT4_I(inode)->i_data_sem);
 	}
 	ext4_fc_track_range(handle, inode, first_block, stop_block);
@@ -4170,7 +4140,7 @@ int ext4_truncate(struct inode *inode)
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 
-	ext4_discard_preallocations(inode, 0);
+	ext4_discard_preallocations(inode);
 
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		err = ext4_ext_truncate(handle, inode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index aa6be510eb8f..7160a71044c8 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -467,7 +467,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	ext4_reset_inode_seed(inode);
 	ext4_reset_inode_seed(inode_bl);
 
-	ext4_discard_preallocations(inode, 0);
+	ext4_discard_preallocations(inode);
 
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err < 0) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f44f668e407f..e4f7cf9d89c4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -564,14 +564,14 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 
 			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 			blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
+			ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 			ext4_grp_locked_error(sb, e4b->bd_group,
 					      inode ? inode->i_ino : 0,
 					      blocknr,
 					      "freeing block already freed "
 					      "(bit %u)",
 					      first + i);
-			ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
-					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 		}
 		mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
 	}
@@ -677,7 +677,7 @@ do {									\
 	}								\
 } while (0)
 
-static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
+static void __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 				const char *function, int line)
 {
 	struct super_block *sb = e4b->bd_sb;
@@ -696,7 +696,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 	void *buddy2;
 
 	if (e4b->bd_info->bb_check_counter++ % 10)
-		return 0;
+		return;
 
 	while (order > 1) {
 		buddy = mb_find_buddy(e4b, order, &max);
@@ -758,7 +758,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 
 	grp = ext4_get_group_info(sb, e4b->bd_group);
 	if (!grp)
-		return NULL;
+		return;
 	list_for_each(cur, &grp->bb_prealloc_list) {
 		ext4_group_t groupnr;
 		struct ext4_prealloc_space *pa;
@@ -768,7 +768,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 		for (i = 0; i < pa->pa_len; i++)
 			MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
 	}
-	return 0;
 }
 #undef MB_CHECK_ASSERT
 #define mb_check_buddy(e4b) __mb_check_buddy(e4b,	\
@@ -842,7 +841,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int new_order;
 
-	if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
+	if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0)
 		return;
 
 	new_order = mb_avg_fragment_size_order(sb,
@@ -871,7 +870,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
  * cr level needs an update.
  */
 static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
-			enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+			enum criteria *new_cr, ext4_group_t *group)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	struct ext4_group_info *iter;
@@ -945,7 +944,7 @@ ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int o
  * order. Updates *new_cr if cr level needs an update.
  */
 static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
-		enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+		enum criteria *new_cr, ext4_group_t *group)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	struct ext4_group_info *grp = NULL;
@@ -990,7 +989,7 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *
  * much and fall to CR_GOAL_LEN_SLOW in that case.
  */
 static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
-		enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+		enum criteria *new_cr, ext4_group_t *group)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	struct ext4_group_info *grp = NULL;
@@ -1125,11 +1124,11 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
 	}
 
 	if (*new_cr == CR_POWER2_ALIGNED) {
-		ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups);
+		ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group);
 	} else if (*new_cr == CR_GOAL_LEN_FAST) {
-		ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups);
+		ext4_mb_choose_next_group_goal_fast(ac, new_cr, group);
 	} else if (*new_cr == CR_BEST_AVAIL_LEN) {
-		ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups);
+		ext4_mb_choose_next_group_best_avail(ac, new_cr, group);
 	} else {
 		/*
 		 * TODO: For CR=2, we can arrange groups in an rb tree sorted by
@@ -1233,6 +1232,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 	atomic64_add(period, &sbi->s_mb_generation_time);
 }
 
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+	int count;
+	int order = 1;
+	void *buddy;
+
+	while ((buddy = mb_find_buddy(e4b, order++, &count)))
+		mb_set_bits(buddy, 0, count);
+
+	e4b->bd_info->bb_fragments = 0;
+	memset(e4b->bd_info->bb_counters, 0,
+		sizeof(*e4b->bd_info->bb_counters) *
+		(e4b->bd_sb->s_blocksize_bits + 2));
+
+	ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+		e4b->bd_bitmap, e4b->bd_group, e4b->bd_info);
+}
+
 /* The buddy information is attached the buddy cache inode
  * for convenience. The information regarding each group
  * is loaded via ext4_mb_load_buddy. The information involve
@@ -1891,11 +1908,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 	mb_check_buddy(e4b);
 	mb_free_blocks_double(inode, e4b, first, count);
 
-	this_cpu_inc(discard_pa_seq);
-	e4b->bd_info->bb_free += count;
-	if (first < e4b->bd_info->bb_first_free)
-		e4b->bd_info->bb_first_free = first;
-
 	/* access memory sequentially: check left neighbour,
 	 * clear range and then check right neighbour
 	 */
@@ -1909,21 +1921,31 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 		struct ext4_sb_info *sbi = EXT4_SB(sb);
 		ext4_fsblk_t blocknr;
 
+		/*
+		 * Fastcommit replay can free already freed blocks which
+		 * corrupts allocation info. Regenerate it.
+		 */
+		if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+			mb_regenerate_buddy(e4b);
+			goto check;
+		}
+
 		blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 		blocknr += EXT4_C2B(sbi, block);
-		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
-			ext4_grp_locked_error(sb, e4b->bd_group,
-					      inode ? inode->i_ino : 0,
-					      blocknr,
-					      "freeing already freed block (bit %u); block bitmap corrupt.",
-					      block);
-			ext4_mark_group_bitmap_corrupted(
-				sb, e4b->bd_group,
+		ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
 				EXT4_GROUP_INFO_BBITMAP_CORRUPT);
-		}
-		goto done;
+		ext4_grp_locked_error(sb, e4b->bd_group,
+				      inode ? inode->i_ino : 0, blocknr,
+				      "freeing already freed block (bit %u); block bitmap corrupt.",
+				      block);
+		return;
 	}
 
+	this_cpu_inc(discard_pa_seq);
+	e4b->bd_info->bb_free += count;
+	if (first < e4b->bd_info->bb_first_free)
+		e4b->bd_info->bb_first_free = first;
+
 	/* let's maintain fragments counter */
 	if (left_is_free && right_is_free)
 		e4b->bd_info->bb_fragments--;
@@ -1948,9 +1970,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 	if (first <= last)
 		mb_buddy_mark_free(e4b, first >> 1, last >> 1);
 
-done:
 	mb_set_largest_free_order(sb, e4b->bd_info);
 	mb_update_avg_fragment_size(sb, e4b->bd_info);
+check:
 	mb_check_buddy(e4b);
 }
 
@@ -2276,6 +2298,9 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
 		return;
 
 	ext4_lock_group(ac->ac_sb, group);
+	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+		goto out;
+
 	max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
 
 	if (max > 0) {
@@ -2283,6 +2308,7 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
 		ext4_mb_use_best_found(ac, e4b);
 	}
 
+out:
 	ext4_unlock_group(ac->ac_sb, group);
 	ext4_mb_unload_buddy(e4b);
 }
@@ -2309,12 +2335,10 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 	if (err)
 		return err;
 
-	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
-		ext4_mb_unload_buddy(e4b);
-		return 0;
-	}
-
 	ext4_lock_group(ac->ac_sb, group);
+	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+		goto out;
+
 	max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
 			     ac->ac_g_ex.fe_len, &ex);
 	ex.fe_logical = 0xDEADFA11; /* debug value */
@@ -2347,6 +2371,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 		ac->ac_b_ex = ex;
 		ext4_mb_use_best_found(ac, e4b);
 	}
+out:
 	ext4_unlock_group(ac->ac_sb, group);
 	ext4_mb_unload_buddy(e4b);
 
@@ -2380,12 +2405,12 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
 
 		k = mb_find_next_zero_bit(buddy, max, 0);
 		if (k >= max) {
+			ext4_mark_group_bitmap_corrupted(ac->ac_sb,
+					e4b->bd_group,
+					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 			ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
 				"%d free clusters of order %d. But found 0",
 				grp->bb_counters[i], i);
-			ext4_mark_group_bitmap_corrupted(ac->ac_sb,
-					 e4b->bd_group,
-					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 			break;
 		}
 		ac->ac_found++;
@@ -2436,12 +2461,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 			 * free blocks even though group info says we
 			 * have free blocks
 			 */
+			ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
 					"%d free clusters as per "
 					"group info. But bitmap says 0",
 					free);
-			ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
-					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 			break;
 		}
 
@@ -2467,12 +2492,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 		if (WARN_ON(ex.fe_len <= 0))
 			break;
 		if (free < ex.fe_len) {
+			ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
 					"%d free clusters as per "
 					"group info. But got %d blocks",
 					free, ex.fe_len);
-			ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
-					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 			/*
 			 * The number of free blocks differs. This mostly
 			 * indicate that the bitmap is corrupt. So exit
@@ -3725,7 +3750,7 @@ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
 	return count;
 }
 
-int ext4_mb_release(struct super_block *sb)
+void ext4_mb_release(struct super_block *sb)
 {
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	ext4_group_t i;
@@ -3801,8 +3826,6 @@ int ext4_mb_release(struct super_block *sb)
 	}
 
 	free_percpu(sbi->s_locality_groups);
-
-	return 0;
 }
 
 static inline int ext4_issue_discard(struct super_block *sb,
@@ -5284,7 +5307,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
  * the caller MUST hold group/inode locks.
  * TODO: optimize the case when there are no in-core structures yet
  */
-static noinline_for_stack int
+static noinline_for_stack void
 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 			struct ext4_prealloc_space *pa)
 {
@@ -5334,11 +5357,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 		 */
 	}
 	atomic_add(free, &sbi->s_mb_discarded);
-
-	return 0;
 }
 
-static noinline_for_stack int
+static noinline_for_stack void
 ext4_mb_release_group_pa(struct ext4_buddy *e4b,
 				struct ext4_prealloc_space *pa)
 {
@@ -5352,13 +5373,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
 	if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
 		ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
 			     e4b->bd_group, group, pa->pa_pstart);
-		return 0;
+		return;
 	}
 	mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
 	atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
 	trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
-
-	return 0;
 }
 
 /*
@@ -5479,7 +5498,7 @@ out_dbg:
  *
  * FIXME!! Make sure it is valid at all the call sites
  */
-void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
+void ext4_discard_preallocations(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -5491,9 +5510,8 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
 	struct rb_node *iter;
 	int err;
 
-	if (!S_ISREG(inode->i_mode)) {
+	if (!S_ISREG(inode->i_mode))
 		return;
-	}
 
 	if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
 		return;
@@ -5501,15 +5519,12 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
 	mb_debug(sb, "discard preallocation for inode %lu\n",
 		 inode->i_ino);
 	trace_ext4_discard_preallocations(inode,
-			atomic_read(&ei->i_prealloc_active), needed);
-
-	if (needed == 0)
-		needed = UINT_MAX;
+			atomic_read(&ei->i_prealloc_active));
 
 repeat:
 	/* first, collect all pa's in the inode */
 	write_lock(&ei->i_prealloc_lock);
-	for (iter = rb_first(&ei->i_prealloc_node); iter && needed;
+	for (iter = rb_first(&ei->i_prealloc_node); iter;
 	     iter = rb_next(iter)) {
 		pa = rb_entry(iter, struct ext4_prealloc_space,
 			      pa_node.inode_node);
@@ -5533,7 +5548,6 @@ repeat:
 			spin_unlock(&pa->pa_lock);
 			rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
 			list_add(&pa->u.pa_tmp_list, &list);
-			needed--;
 			continue;
 		}
 
@@ -5943,7 +5957,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
 /*
  * release all resource we used in allocation
  */
-static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+static void ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	struct ext4_prealloc_space *pa = ac->ac_pa;
@@ -5980,7 +5994,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
 		mutex_unlock(&ac->ac_lg->lg_mutex);
 	ext4_mb_collect_stats(ac);
-	return 0;
 }
 
 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
@@ -6761,6 +6774,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
 	bool set_trimmed = false;
 	void *bitmap;
 
+	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+		return 0;
+
 	last = ext4_last_grp_cluster(sb, e4b->bd_group);
 	bitmap = e4b->bd_bitmap;
 	if (start == 0 && max >= last)
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index d7aeb5da7d86..56938532b4ce 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -192,7 +192,6 @@ struct ext4_allocation_context {
 	 */
 	ext4_grpblk_t	ac_orig_goal_len;
 
-	__u32 ac_groups_considered;
 	__u32 ac_flags;		/* allocation hints */
 	__u16 ac_groups_scanned;
 	__u16 ac_groups_linear_remaining;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3aa57376d9c2..7cd4afa4de1d 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -618,6 +618,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
 		goto out;
 	o_end = o_start + len;
 
+	*moved_len = 0;
 	while (o_start < o_end) {
 		struct ext4_extent *ex;
 		ext4_lblk_t cur_blk, next_blk;
@@ -672,7 +673,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
 		 */
 		ext4_double_up_write_data_sem(orig_inode, donor_inode);
 		/* Swap original branches with new branches */
-		move_extent_per_page(o_filp, donor_inode,
+		*moved_len += move_extent_per_page(o_filp, donor_inode,
 				     orig_page_index, donor_page_index,
 				     offset_in_page, cur_len,
 				     unwritten, &ret);
@@ -682,14 +683,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
 		o_start += cur_len;
 		d_start += cur_len;
 	}
-	*moved_len = o_start - orig_blk;
-	if (*moved_len > len)
-		*moved_len = len;
 
 out:
 	if (*moved_len) {
-		ext4_discard_preallocations(orig_inode, 0);
-		ext4_discard_preallocations(donor_inode, 0);
+		ext4_discard_preallocations(orig_inode);
+		ext4_discard_preallocations(donor_inode);
 	}
 
 	ext4_free_ext_path(path);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 05b647e6bc19..5e4f65c14dfb 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1762,7 +1762,6 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
 	struct buffer_head *bh;
 
 	err = ext4_fname_prepare_lookup(dir, dentry, &fname);
-	generic_set_encrypted_ci_d_ops(dentry);
 	if (err == -ENOENT)
 		return NULL;
 	if (err)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dcba0f85dfe2..d068d550d271 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1525,7 +1525,7 @@ void ext4_clear_inode(struct inode *inode)
 	ext4_fc_del(inode);
 	invalidate_inode_buffers(inode);
 	clear_inode(inode);
-	ext4_discard_preallocations(inode, 0);
+	ext4_discard_preallocations(inode);
 	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
 	dquot_drop(inode);
 	if (EXT4_I(inode)->jinode) {
@@ -5484,6 +5484,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		goto failed_mount4;
 	}
 
+	generic_set_sb_d_ops(sb);
 	sb->s_root = d_make_root(root);
 	if (!sb->s_root) {
 		ext4_msg(sb, KERN_ERR, "get root dentry failed");
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 75bf1f88843c..645240cc0229 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -92,10 +92,12 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode,
 
 	if (!dentry) {
 		bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT);
-		if (IS_ERR(bh))
-			return ERR_CAST(bh);
-		if (!bh || !ext4_buffer_uptodate(bh))
+		if (IS_ERR(bh) || !bh)
 			return ERR_PTR(-ECHILD);
+		if (!ext4_buffer_uptodate(bh)) {
+			brelse(bh);
+			return ERR_PTR(-ECHILD);
+		}
 	} else {
 		bh = ext4_bread(NULL, inode, 0, 0);
 		if (IS_ERR(bh))
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 65294e3b0bef..4a4e60cdac4e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -24,6 +24,7 @@
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/part_stat.h>
+#include <linux/rw_hint.h>
 #include <crypto/hash.h>
 
 #include <linux/fscrypt.h>
@@ -3364,17 +3365,6 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
 	return is_set_ckpt_flags(sbi, CP_ERROR_FLAG);
 }
 
-static inline bool is_dot_dotdot(const u8 *name, size_t len)
-{
-	if (len == 1 && name[0] == '.')
-		return true;
-
-	if (len == 2 && name[0] == '.' && name[1] == '.')
-		return true;
-
-	return false;
-}
-
 static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
 					size_t size, gfp_t flags)
 {
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index b3bb815fc6aa..f7f63a567d86 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -531,7 +531,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 	}
 
 	err = f2fs_prepare_lookup(dir, dentry, &fname);
-	generic_set_encrypted_ci_d_ops(dentry);
 	if (err == -ENOENT)
 		goto out_splice;
 	if (err)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index d45ab0992ae5..8d9bd880ffd8 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -4660,6 +4660,7 @@ try_onemore:
 		goto free_node_inode;
 	}
 
+	generic_set_sb_d_ops(sb);
 	sb->s_root = d_make_root(root); /* allocate root dentry */
 	if (!sb->s_root) {
 		err = -ENOMEM;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index c80a6acad742..54cc85d3338e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -27,6 +27,7 @@
 #include <linux/memfd.h>
 #include <linux/compat.h>
 #include <linux/mount.h>
+#include <linux/rw_hint.h>
 
 #include <linux/poll.h>
 #include <asm/siginfo.h>
@@ -268,8 +269,15 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
 }
 #endif
 
-static bool rw_hint_valid(enum rw_hint hint)
+static bool rw_hint_valid(u64 hint)
 {
+	BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET);
+	BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE);
+	BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT);
+	BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM);
+	BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG);
+	BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME);
+
 	switch (hint) {
 	case RWH_WRITE_LIFE_NOT_SET:
 	case RWH_WRITE_LIFE_NONE:
@@ -283,34 +291,40 @@ static bool rw_hint_valid(enum rw_hint hint)
 	}
 }
 
-static long fcntl_rw_hint(struct file *file, unsigned int cmd,
-			  unsigned long arg)
+static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
+			      unsigned long arg)
 {
 	struct inode *inode = file_inode(file);
 	u64 __user *argp = (u64 __user *)arg;
-	enum rw_hint hint;
-	u64 h;
+	u64 hint = READ_ONCE(inode->i_write_hint);
 
-	switch (cmd) {
-	case F_GET_RW_HINT:
-		h = inode->i_write_hint;
-		if (copy_to_user(argp, &h, sizeof(*argp)))
-			return -EFAULT;
-		return 0;
-	case F_SET_RW_HINT:
-		if (copy_from_user(&h, argp, sizeof(h)))
-			return -EFAULT;
-		hint = (enum rw_hint) h;
-		if (!rw_hint_valid(hint))
-			return -EINVAL;
+	if (copy_to_user(argp, &hint, sizeof(*argp)))
+		return -EFAULT;
+	return 0;
+}
 
-		inode_lock(inode);
-		inode->i_write_hint = hint;
-		inode_unlock(inode);
-		return 0;
-	default:
+static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
+			      unsigned long arg)
+{
+	struct inode *inode = file_inode(file);
+	u64 __user *argp = (u64 __user *)arg;
+	u64 hint;
+
+	if (copy_from_user(&hint, argp, sizeof(hint)))
+		return -EFAULT;
+	if (!rw_hint_valid(hint))
 		return -EINVAL;
-	}
+
+	WRITE_ONCE(inode->i_write_hint, hint);
+
+	/*
+	 * file->f_mapping->host may differ from inode. As an example,
+	 * blkdev_open() modifies file->f_mapping.
+	 */
+	if (file->f_mapping->host != inode)
+		WRITE_ONCE(file->f_mapping->host->i_write_hint, hint);
+
+	return 0;
 }
 
 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
@@ -416,8 +430,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 		err = memfd_fcntl(filp, cmd, argi);
 		break;
 	case F_GET_RW_HINT:
+		err = fcntl_get_rw_hint(filp, cmd, arg);
+		break;
 	case F_SET_RW_HINT:
-		err = fcntl_rw_hint(filp, cmd, arg);
+		err = fcntl_set_rw_hint(filp, cmd, arg);
 		break;
 	default:
 		break;
@@ -846,12 +862,6 @@ int send_sigurg(struct fown_struct *fown)
 static DEFINE_SPINLOCK(fasync_lock);
 static struct kmem_cache *fasync_cache __ro_after_init;
 
-static void fasync_free_rcu(struct rcu_head *head)
-{
-	kmem_cache_free(fasync_cache,
-			container_of(head, struct fasync_struct, fa_rcu));
-}
-
 /*
  * Remove a fasync entry. If successfully removed, return
  * positive and clear the FASYNC flag. If no entry exists,
@@ -877,7 +887,7 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 		write_unlock_irq(&fa->fa_lock);
 
 		*fp = fa->fa_next;
-		call_rcu(&fa->fa_rcu, fasync_free_rcu);
+		kfree_rcu(fa, fa_rcu);
 		filp->f_flags &= ~FASYNC;
 		result = 1;
 		break;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 18b3ba8dc8ea..57a12614addf 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path,
 	if (f_handle.handle_bytes > MAX_HANDLE_SZ)
 		return -EINVAL;
 
-	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+	handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
 			 GFP_KERNEL);
 	if (!handle)
 		return -ENOMEM;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d84fcc471c6..e4f17c53ddfc 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -141,6 +141,31 @@ static void wb_wakeup(struct bdi_writeback *wb)
 	spin_unlock_irq(&wb->work_lock);
 }
 
+/*
+ * This function is used when the first inode for this wb is marked dirty. It
+ * wakes up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * start only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches
+ * by delaying the wake-up.
+ *
+ * We have to be careful not to postpone flush work if it is scheduled for
+ * earlier. Thus we use queue_delayed_work().
+ */
+static void wb_wakeup_delayed(struct bdi_writeback *wb)
+{
+	unsigned long timeout;
+
+	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+	spin_lock_irq(&wb->work_lock);
+	if (test_bit(WB_registered, &wb->state))
+		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+	spin_unlock_irq(&wb->work_lock);
+}
+
 static void finish_writeback_work(struct bdi_writeback *wb,
 				  struct wb_writeback_work *work)
 {
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index edb3712dcfa5..a4d6ca0b8971 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -83,8 +83,8 @@ static const struct fs_parameter_spec *fs_lookup_key(
 }
 
 /*
- * fs_parse - Parse a filesystem configuration parameter
- * @fc: The filesystem context to log errors through.
+ * __fs_parse - Parse a filesystem configuration parameter
+ * @log: The filesystem context to log errors through.
  * @desc: The parameter description to use.
  * @param: The parameter.
  * @result: Where to place the result of the parse
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 91e89e68177e..b6cad106c37e 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -474,8 +474,7 @@ err:
 
 static void cuse_fc_release(struct fuse_conn *fc)
 {
-	struct cuse_conn *cc = fc_to_cc(fc);
-	kfree_rcu(cc, fc.rcu);
+	kfree(fc_to_cc(fc));
 }
 
 /**
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1df83eebda92..bcbe34488862 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -888,6 +888,7 @@ struct fuse_mount {
 
 	/* Entry on fc->mounts */
 	struct list_head fc_entry;
+	struct rcu_head rcu;
 };
 
 static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2a6d44f91729..516ea2979a90 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -930,6 +930,14 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
 }
 EXPORT_SYMBOL_GPL(fuse_conn_init);
 
+static void delayed_release(struct rcu_head *p)
+{
+	struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu);
+
+	put_user_ns(fc->user_ns);
+	fc->release(fc);
+}
+
 void fuse_conn_put(struct fuse_conn *fc)
 {
 	if (refcount_dec_and_test(&fc->count)) {
@@ -941,13 +949,12 @@ void fuse_conn_put(struct fuse_conn *fc)
 		if (fiq->ops->release)
 			fiq->ops->release(fiq);
 		put_pid_ns(fc->pid_ns);
-		put_user_ns(fc->user_ns);
 		bucket = rcu_dereference_protected(fc->curr_bucket, 1);
 		if (bucket) {
 			WARN_ON(atomic_read(&bucket->count) != 1);
 			kfree(bucket);
 		}
-		fc->release(fc);
+		call_rcu(&fc->rcu, delayed_release);
 	}
 }
 EXPORT_SYMBOL_GPL(fuse_conn_put);
@@ -1366,7 +1373,7 @@ EXPORT_SYMBOL_GPL(fuse_send_init);
 void fuse_free_conn(struct fuse_conn *fc)
 {
 	WARN_ON(!list_empty(&fc->devices));
-	kfree_rcu(fc, rcu);
+	kfree(fc);
 }
 EXPORT_SYMBOL_GPL(fuse_free_conn);
 
@@ -1902,7 +1909,7 @@ static void fuse_sb_destroy(struct super_block *sb)
 void fuse_mount_destroy(struct fuse_mount *fm)
 {
 	fuse_conn_put(fm->fc);
-	kfree(fm);
+	kfree_rcu(fm, rcu);
 }
 EXPORT_SYMBOL(fuse_mount_destroy);
 
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index d9ccfd27e4f1..789af5c8fade 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -2465,7 +2465,7 @@ out:
 }
 
 static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
-		loff_t offset)
+		loff_t offset, unsigned int len)
 {
 	int ret;
 
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 177f1f41f225..2e215e8c3c88 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -32,25 +32,21 @@
 
 static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
 {
-	struct dentry *parent = NULL;
+	struct dentry *parent;
 	struct gfs2_sbd *sdp;
 	struct gfs2_inode *dip;
-	struct inode *dinode, *inode;
+	struct inode *inode;
 	struct gfs2_holder d_gh;
 	struct gfs2_inode *ip = NULL;
 	int error, valid = 0;
 	int had_lock = 0;
 
-	if (flags & LOOKUP_RCU) {
-		dinode = d_inode_rcu(READ_ONCE(dentry->d_parent));
-		if (!dinode)
-			return -ECHILD;
-	} else {
-		parent = dget_parent(dentry);
-		dinode = d_inode(parent);
-	}
-	sdp = GFS2_SB(dinode);
-	dip = GFS2_I(dinode);
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	parent = dget_parent(dentry);
+	sdp = GFS2_SB(d_inode(parent));
+	dip = GFS2_I(d_inode(parent));
 	inode = d_inode(dentry);
 
 	if (inode) {
@@ -66,8 +62,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
 
 	had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
 	if (!had_lock) {
-		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
-					   flags & LOOKUP_RCU ? GL_NOBLOCK : 0, &d_gh);
+		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
 		if (error)
 			goto out;
 	}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6bfc9383b7b8..1b95db2c3aac 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1882,10 +1882,10 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
 		WARN_ON_ONCE(!may_not_block);
 		return -ECHILD;
         }
-	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
-		int noblock = may_not_block ? GL_NOBLOCK : 0;
-		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
-					   LM_FLAG_ANY | noblock, &i_gh);
+	if (gfs2_glock_is_locked_by_me(gl) == NULL) {
+		if (may_not_block)
+			return -ECHILD;
+		error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
 		if (error)
 			return error;
 	}
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 7ededcb720c1..012a3d003fbe 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -190,6 +190,7 @@ struct hfsplus_sb_info {
 	int work_queued;               /* non-zero delayed work is queued */
 	struct delayed_work sync_work; /* FS sync delayed work */
 	spinlock_t work_lock;          /* protects sync_work and work_queued */
+	struct rcu_head rcu;
 };
 
 #define HFSPLUS_SB_WRITEBACKUP	0
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 1986b4f18a90..97920202790f 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -277,6 +277,14 @@ void hfsplus_mark_mdb_dirty(struct super_block *sb)
 	spin_unlock(&sbi->work_lock);
 }
 
+static void delayed_free(struct rcu_head *p)
+{
+	struct hfsplus_sb_info *sbi = container_of(p, struct hfsplus_sb_info, rcu);
+
+	unload_nls(sbi->nls);
+	kfree(sbi);
+}
+
 static void hfsplus_put_super(struct super_block *sb)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
@@ -302,9 +310,7 @@ static void hfsplus_put_super(struct super_block *sb)
 	hfs_btree_close(sbi->ext_tree);
 	kfree(sbi->s_vhdr_buf);
 	kfree(sbi->s_backup_vhdr_buf);
-	unload_nls(sbi->nls);
-	kfree(sb->s_fs_info);
-	sb->s_fs_info = NULL;
+	call_rcu(&sbi->rcu, delayed_free);
 }
 
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index b0cb70400996..ce9346099c72 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -30,7 +30,7 @@ struct hfsplus_wd {
  * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes
  * @buf: buffer for I/O
  * @data: output pointer for location of requested data
- * @opf: request op flags
+ * @opf: I/O operation type and flags
  *
  * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than
  * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ea5b8e57d904..6502c7e776d1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -100,6 +100,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	loff_t len, vma_len;
 	int ret;
 	struct hstate *h = hstate_file(file);
+	vm_flags_t vm_flags;
 
 	/*
 	 * vma address alignment (but not the pgoff alignment) has
@@ -141,10 +142,20 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	file_accessed(file);
 
 	ret = -ENOMEM;
+
+	vm_flags = vma->vm_flags;
+	/*
+	 * For SHM_HUGETLB, the pages are reserved in the shmget() call, so skip
+	 * reserving here. Note: the inode flag S_PRIVATE is set only for SHM
+	 * hugetlbfs files.
+	 */
+	if (inode->i_flags & S_PRIVATE)
+		vm_flags |= VM_NORESERVE;
+
 	if (!hugetlb_reserve_pages(inode,
 				vma->vm_pgoff >> huge_page_order(h),
 				len >> huge_page_shift(h), vma,
-				vma->vm_flags))
+				vm_flags))
 		goto out;
 
 	ret = 0;
@@ -340,7 +351,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		} else {
 			folio_unlock(folio);
 
-			if (!folio_test_has_hwpoisoned(folio))
+			if (!folio_test_hwpoison(folio))
 				want = nr;
 			else {
 				/*
@@ -922,7 +933,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
 	unsigned int ia_valid = attr->ia_valid;
 	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 
-	error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
+	error = setattr_prepare(idmap, dentry, attr);
 	if (error)
 		return error;
 
@@ -939,7 +950,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
 		hugetlb_vmtruncate(inode, newsize);
 	}
 
-	setattr_copy(&nop_mnt_idmap, inode, attr);
+	setattr_copy(idmap, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
@@ -974,6 +985,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
 static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
 
 static struct inode *hugetlbfs_get_inode(struct super_block *sb,
+					struct mnt_idmap *idmap,
 					struct inode *dir,
 					umode_t mode, dev_t dev)
 {
@@ -995,7 +1007,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 
 		inode->i_ino = get_next_ino();
-		inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
+		inode_init_owner(idmap, inode, dir, mode);
 		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
 				&hugetlbfs_i_mmap_rwsem_key);
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
@@ -1039,7 +1051,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 {
 	struct inode *inode;
 
-	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
+	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev);
 	if (!inode)
 		return -ENOSPC;
 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
@@ -1051,7 +1063,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 			   struct dentry *dentry, umode_t mode)
 {
-	int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry,
+	int retval = hugetlbfs_mknod(idmap, dir, dentry,
 				     mode | S_IFDIR, 0);
 	if (!retval)
 		inc_nlink(dir);
@@ -1062,7 +1074,7 @@ static int hugetlbfs_create(struct mnt_idmap *idmap,
 			    struct inode *dir, struct dentry *dentry,
 			    umode_t mode, bool excl)
 {
-	return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0);
+	return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
 }
 
 static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
@@ -1071,7 +1083,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
 {
 	struct inode *inode;
 
-	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
+	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0);
 	if (!inode)
 		return -ENOSPC;
 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
@@ -1083,10 +1095,11 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap,
 			     struct inode *dir, struct dentry *dentry,
 			     const char *symname)
 {
+	const umode_t mode = S_IFLNK|S_IRWXUGO;
 	struct inode *inode;
 	int error = -ENOSPC;
 
-	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
+	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0);
 	if (inode) {
 		int l = strlen(symname)+1;
 		error = page_symlink(inode, symname, l);
@@ -1354,6 +1367,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
 {
 	struct hugetlbfs_fs_context *ctx = fc->fs_private;
 	struct fs_parse_result result;
+	struct hstate *h;
 	char *rest;
 	unsigned long ps;
 	int opt;
@@ -1398,11 +1412,12 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
 
 	case Opt_pagesize:
 		ps = memparse(param->string, &rest);
-		ctx->hstate = size_to_hstate(ps);
-		if (!ctx->hstate) {
+		h = size_to_hstate(ps);
+		if (!h) {
 			pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
 			return -EINVAL;
 		}
+		ctx->hstate = h;
 		return 0;
 
 	case Opt_min_size:
@@ -1553,6 +1568,7 @@ static struct file_system_type hugetlbfs_fs_type = {
 	.init_fs_context	= hugetlbfs_init_fs_context,
 	.parameters		= hugetlb_fs_parameters,
 	.kill_sb		= kill_litter_super,
+	.fs_flags               = FS_ALLOW_IDMAP,
 };
 
 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@ -1606,7 +1622,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
 	}
 
 	file = ERR_PTR(-ENOSPC);
-	inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0);
+	/* hugetlbfs_vfsmount[] mounts do not use idmapped mounts.  */
+	inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL,
+				    S_IFREG | S_IRWXUGO, 0);
 	if (!inode)
 		goto out;
 	if (creat_flags == HUGETLB_SHMFS_INODE)
diff --git a/fs/inode.c b/fs/inode.c
index 91048c4c9c9e..d290f007b3d1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,6 +20,7 @@
 #include <linux/ratelimit.h>
 #include <linux/list_lru.h>
 #include <linux/iversion.h>
+#include <linux/rw_hint.h>
 #include <trace/events/writeback.h>
 #include "internal.h"
 
@@ -588,7 +589,8 @@ void dump_mapping(const struct address_space *mapping)
 	}
 
 	dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
-	if (get_kernel_nofault(dentry, dentry_ptr)) {
+	if (get_kernel_nofault(dentry, dentry_ptr) ||
+	    !dentry.d_parent || !dentry.d_name.name) {
 		pr_warn("aops:%ps ino:%lx invalid dentry:%px\n",
 				a_ops, ino, dentry_ptr);
 		return;
@@ -2285,7 +2287,7 @@ void __init inode_init(void)
 					 sizeof(struct inode),
 					 0,
 					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
-					 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+					 SLAB_ACCOUNT),
 					 init_once);
 
 	/* Hash may have been set up in inode_init_early */
@@ -2509,7 +2511,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode)
 {
 	struct timespec64 now = current_time(inode);
 
-	inode_set_ctime(inode, now.tv_sec, now.tv_nsec);
+	inode_set_ctime_to_ts(inode, now);
 	return now;
 }
 EXPORT_SYMBOL(inode_set_ctime_current);
diff --git a/fs/internal.h b/fs/internal.h
index b67406435fc0..7d3edcdf59cc 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -310,3 +310,10 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po
 struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
 struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
 void mnt_idmap_put(struct mnt_idmap *idmap);
+struct stashed_operations {
+	void (*put_data)(void *data);
+	void (*init_inode)(struct inode *inode, void *data);
+};
+int path_from_stashed(struct dentry **stashed, unsigned long ino,
+		      struct vfsmount *mnt, void *data, struct path *path);
+void stashed_dentry_prune(struct dentry *dentry);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 093c4515b22a..4e8e41c8b3c0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (C) 2010 Red Hat, Inc.
- * Copyright (C) 2016-2019 Christoph Hellwig.
+ * Copyright (C) 2016-2023 Christoph Hellwig.
  */
 #include <linux/module.h>
 #include <linux/compiler.h>
@@ -95,6 +95,44 @@ static inline bool ifs_block_is_dirty(struct folio *folio,
 	return test_bit(block + blks_per_folio, ifs->state);
 }
 
+static unsigned ifs_find_dirty_range(struct folio *folio,
+		struct iomap_folio_state *ifs, u64 *range_start, u64 range_end)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned start_blk =
+		offset_in_folio(folio, *range_start) >> inode->i_blkbits;
+	unsigned end_blk = min_not_zero(
+		offset_in_folio(folio, range_end) >> inode->i_blkbits,
+		i_blocks_per_folio(inode, folio));
+	unsigned nblks = 1;
+
+	while (!ifs_block_is_dirty(folio, ifs, start_blk))
+		if (++start_blk == end_blk)
+			return 0;
+
+	while (start_blk + nblks < end_blk) {
+		if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
+			break;
+		nblks++;
+	}
+
+	*range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
+	return nblks << inode->i_blkbits;
+}
+
+static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start,
+		u64 range_end)
+{
+	struct iomap_folio_state *ifs = folio->private;
+
+	if (*range_start >= range_end)
+		return 0;
+
+	if (ifs)
+		return ifs_find_dirty_range(folio, ifs, range_start, range_end);
+	return range_end - *range_start;
+}
+
 static void ifs_clear_range_dirty(struct folio *folio,
 		struct iomap_folio_state *ifs, size_t off, size_t len)
 {
@@ -1454,15 +1492,10 @@ out_unlock:
 EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
 
 static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
-		size_t len, int error)
+		size_t len)
 {
 	struct iomap_folio_state *ifs = folio->private;
 
-	if (error) {
-		folio_set_error(folio);
-		mapping_set_error(inode->i_mapping, error);
-	}
-
 	WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
 	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
 
@@ -1479,40 +1512,29 @@ static u32
 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
 {
 	struct inode *inode = ioend->io_inode;
-	struct bio *bio = &ioend->io_inline_bio;
-	struct bio *last = ioend->io_bio, *next;
-	u64 start = bio->bi_iter.bi_sector;
-	loff_t offset = ioend->io_offset;
-	bool quiet = bio_flagged(bio, BIO_QUIET);
+	struct bio *bio = &ioend->io_bio;
+	struct folio_iter fi;
 	u32 folio_count = 0;
 
-	for (bio = &ioend->io_inline_bio; bio; bio = next) {
-		struct folio_iter fi;
-
-		/*
-		 * For the last bio, bi_private points to the ioend, so we
-		 * need to explicitly end the iteration here.
-		 */
-		if (bio == last)
-			next = NULL;
-		else
-			next = bio->bi_private;
-
-		/* walk all folios in bio, ending page IO on them */
-		bio_for_each_folio_all(fi, bio) {
-			iomap_finish_folio_write(inode, fi.folio, fi.length,
-					error);
-			folio_count++;
+	if (error) {
+		mapping_set_error(inode->i_mapping, error);
+		if (!bio_flagged(bio, BIO_QUIET)) {
+			pr_err_ratelimited(
+"%s: writeback error on inode %lu, offset %lld, sector %llu",
+				inode->i_sb->s_id, inode->i_ino,
+				ioend->io_offset, ioend->io_sector);
 		}
-		bio_put(bio);
 	}
-	/* The ioend has been freed by bio_put() */
 
-	if (unlikely(error && !quiet)) {
-		printk_ratelimited(KERN_ERR
-"%s: writeback error on inode %lu, offset %lld, sector %llu",
-			inode->i_sb->s_id, inode->i_ino, offset, start);
+	/* walk all folios in bio, ending page IO on them */
+	bio_for_each_folio_all(fi, bio) {
+		if (error)
+			folio_set_error(fi.folio);
+		iomap_finish_folio_write(inode, fi.folio, fi.length);
+		folio_count++;
 	}
+
+	bio_put(bio);	/* frees the ioend */
 	return folio_count;
 }
 
@@ -1553,7 +1575,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends);
 static bool
 iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
 {
-	if (ioend->io_bio->bi_status != next->io_bio->bi_status)
+	if (ioend->io_bio.bi_status != next->io_bio.bi_status)
 		return false;
 	if ((ioend->io_flags & IOMAP_F_SHARED) ^
 	    (next->io_flags & IOMAP_F_SHARED))
@@ -1618,47 +1640,46 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends);
 
 static void iomap_writepage_end_bio(struct bio *bio)
 {
-	struct iomap_ioend *ioend = bio->bi_private;
-
-	iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
+	iomap_finish_ioend(iomap_ioend_from_bio(bio),
+			blk_status_to_errno(bio->bi_status));
 }
 
 /*
  * Submit the final bio for an ioend.
  *
  * If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we've marked pages for writeback
- * and unlocked them.  In this situation, we need to fail the bio instead of
- * submitting it.  This typically only happens on a filesystem shutdown.
+ * the submission process has failed after we've marked pages for writeback.
+ * We cannot cancel ioend directly in that case, so call the bio end I/O handler
+ * with the error status here to run the normal I/O completion handler to clear
+ * the writeback bit and let the file system process the errors.
  */
-static int
-iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
-		int error)
+static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
 {
-	ioend->io_bio->bi_private = ioend;
-	ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
+	if (!wpc->ioend)
+		return error;
 
+	/*
+	 * Let the file systems prepare the I/O submission and hook in an I/O
+	 * completion handler.  This also needs to happen after a failure has
+	 * occurred, so that the file system end I/O handler gets called to
+	 * clean up.
+	 */
 	if (wpc->ops->prepare_ioend)
-		error = wpc->ops->prepare_ioend(ioend, error);
+		error = wpc->ops->prepare_ioend(wpc->ioend, error);
+
 	if (error) {
-		/*
-		 * If we're failing the IO now, just mark the ioend with an
-		 * error and finish it.  This will run IO completion immediately
-		 * as there is only one reference to the ioend at this point in
-		 * time.
-		 */
-		ioend->io_bio->bi_status = errno_to_blk_status(error);
-		bio_endio(ioend->io_bio);
-		return error;
+		wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
+		bio_endio(&wpc->ioend->io_bio);
+	} else {
+		submit_bio(&wpc->ioend->io_bio);
 	}
 
-	submit_bio(ioend->io_bio);
-	return 0;
+	wpc->ioend = NULL;
+	return error;
 }
 
-static struct iomap_ioend *
-iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
-		loff_t offset, sector_t sector, struct writeback_control *wbc)
+static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
+		struct writeback_control *wbc, struct inode *inode, loff_t pos)
 {
 	struct iomap_ioend *ioend;
 	struct bio *bio;
@@ -1666,63 +1687,42 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
 	bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
 			       REQ_OP_WRITE | wbc_to_write_flags(wbc),
 			       GFP_NOFS, &iomap_ioend_bioset);
-	bio->bi_iter.bi_sector = sector;
+	bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
+	bio->bi_end_io = iomap_writepage_end_bio;
 	wbc_init_bio(wbc, bio);
+	bio->bi_write_hint = inode->i_write_hint;
 
-	ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
+	ioend = iomap_ioend_from_bio(bio);
 	INIT_LIST_HEAD(&ioend->io_list);
 	ioend->io_type = wpc->iomap.type;
 	ioend->io_flags = wpc->iomap.flags;
 	ioend->io_inode = inode;
 	ioend->io_size = 0;
-	ioend->io_folios = 0;
-	ioend->io_offset = offset;
-	ioend->io_bio = bio;
-	ioend->io_sector = sector;
-	return ioend;
-}
-
-/*
- * Allocate a new bio, and chain the old bio to the new one.
- *
- * Note that we have to perform the chaining in this unintuitive order
- * so that the bi_private linkage is set up in the right direction for the
- * traversal in iomap_finish_ioend().
- */
-static struct bio *
-iomap_chain_bio(struct bio *prev)
-{
-	struct bio *new;
-
-	new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
-	bio_clone_blkg_association(new, prev);
-	new->bi_iter.bi_sector = bio_end_sector(prev);
+	ioend->io_offset = pos;
+	ioend->io_sector = bio->bi_iter.bi_sector;
 
-	bio_chain(prev, new);
-	bio_get(prev);		/* for iomap_finish_ioend */
-	submit_bio(prev);
-	return new;
+	wpc->nr_folios = 0;
+	return ioend;
 }
 
-static bool
-iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
-		sector_t sector)
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
 {
 	if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
 	    (wpc->ioend->io_flags & IOMAP_F_SHARED))
 		return false;
 	if (wpc->iomap.type != wpc->ioend->io_type)
 		return false;
-	if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
+	if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
 		return false;
-	if (sector != bio_end_sector(wpc->ioend->io_bio))
+	if (iomap_sector(&wpc->iomap, pos) !=
+	    bio_end_sector(&wpc->ioend->io_bio))
 		return false;
 	/*
 	 * Limit ioend bio chain lengths to minimise IO completion latency. This
 	 * also prevents long tight loops ending page writeback on all the
 	 * folios in the ioend.
 	 */
-	if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE)
+	if (wpc->nr_folios >= IOEND_BATCH_SIZE)
 		return false;
 	return true;
 }
@@ -1730,255 +1730,238 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
 /*
  * Test to see if we have an existing ioend structure that we could append to
  * first; otherwise finish off the current ioend and start another.
+ *
+ * If a new ioend is created and cached, the old ioend is submitted to the block
+ * layer instantly.  Batching optimisations are provided by higher level block
+ * plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
  */
-static void
-iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
-		struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc,
-		struct writeback_control *wbc, struct list_head *iolist)
+static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+		struct writeback_control *wbc, struct folio *folio,
+		struct inode *inode, loff_t pos, unsigned len)
 {
-	sector_t sector = iomap_sector(&wpc->iomap, pos);
-	unsigned len = i_blocksize(inode);
+	struct iomap_folio_state *ifs = folio->private;
 	size_t poff = offset_in_folio(folio, pos);
+	int error;
 
-	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) {
-		if (wpc->ioend)
-			list_add(&wpc->ioend->io_list, iolist);
-		wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc);
+	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
+new_ioend:
+		error = iomap_submit_ioend(wpc, 0);
+		if (error)
+			return error;
+		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
 	}
 
-	if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) {
-		wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
-		bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff);
-	}
+	if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+		goto new_ioend;
 
 	if (ifs)
 		atomic_add(len, &ifs->write_bytes_pending);
 	wpc->ioend->io_size += len;
 	wbc_account_cgroup_owner(wbc, &folio->page, len);
+	return 0;
 }
 
-/*
- * We implement an immediate ioend submission policy here to avoid needing to
- * chain multiple ioends and hence nest mempool allocations which can violate
- * the forward progress guarantees we need to provide. The current ioend we're
- * adding blocks to is cached in the writepage context, and if the new block
- * doesn't append to the cached ioend, it will create a new ioend and cache that
- * instead.
- *
- * If a new ioend is created and cached, the old ioend is returned and queued
- * locally for submission once the entire page is processed or an error has been
- * detected.  While ioends are submitted immediately after they are completed,
- * batching optimisations are provided by higher level block plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
-static int
-iomap_writepage_map(struct iomap_writepage_ctx *wpc,
-		struct writeback_control *wbc, struct inode *inode,
-		struct folio *folio, u64 end_pos)
+static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
+		struct writeback_control *wbc, struct folio *folio,
+		struct inode *inode, u64 pos, unsigned dirty_len,
+		unsigned *count)
 {
-	struct iomap_folio_state *ifs = folio->private;
-	struct iomap_ioend *ioend, *next;
-	unsigned len = i_blocksize(inode);
-	unsigned nblocks = i_blocks_per_folio(inode, folio);
-	u64 pos = folio_pos(folio);
-	int error = 0, count = 0, i;
-	LIST_HEAD(submit_list);
-
-	WARN_ON_ONCE(end_pos <= pos);
-
-	if (!ifs && nblocks > 1) {
-		ifs = ifs_alloc(inode, folio, 0);
-		iomap_set_range_dirty(folio, 0, end_pos - pos);
-	}
+	int error;
 
-	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
-
-	/*
-	 * Walk through the folio to find areas to write back. If we
-	 * run off the end of the current map or find the current map
-	 * invalid, grab a new one.
-	 */
-	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
-		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
-			continue;
+	do {
+		unsigned map_len;
 
-		error = wpc->ops->map_blocks(wpc, inode, pos);
+		error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
 		if (error)
 			break;
-		trace_iomap_writepage_map(inode, &wpc->iomap);
-		if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
-			continue;
-		if (wpc->iomap.type == IOMAP_HOLE)
-			continue;
-		iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc,
-				 &submit_list);
-		count++;
-	}
-	if (count)
-		wpc->ioend->io_folios++;
+		trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap);
 
-	WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
-	WARN_ON_ONCE(!folio_test_locked(folio));
-	WARN_ON_ONCE(folio_test_writeback(folio));
-	WARN_ON_ONCE(folio_test_dirty(folio));
+		map_len = min_t(u64, dirty_len,
+			wpc->iomap.offset + wpc->iomap.length - pos);
+		WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+
+		switch (wpc->iomap.type) {
+		case IOMAP_INLINE:
+			WARN_ON_ONCE(1);
+			error = -EIO;
+			break;
+		case IOMAP_HOLE:
+			break;
+		default:
+			error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
+					map_len);
+			if (!error)
+				(*count)++;
+			break;
+		}
+		dirty_len -= map_len;
+		pos += map_len;
+	} while (dirty_len && !error);
 
 	/*
 	 * We cannot cancel the ioend directly here on error.  We may have
 	 * already set other pages under writeback and hence we have to run I/O
 	 * completion to mark the error state of the pages under writeback
 	 * appropriately.
+	 *
+	 * Just let the file system know what portion of the folio failed to
+	 * map.
 	 */
-	if (unlikely(error)) {
-		/*
-		 * Let the filesystem know what portion of the current page
-		 * failed to map. If the page hasn't been added to ioend, it
-		 * won't be affected by I/O completion and we must unlock it
-		 * now.
-		 */
-		if (wpc->ops->discard_folio)
-			wpc->ops->discard_folio(folio, pos);
-		if (!count) {
-			folio_unlock(folio);
-			goto done;
-		}
-	}
-
-	/*
-	 * We can have dirty bits set past end of file in page_mkwrite path
-	 * while mapping the last partial folio. Hence it's better to clear
-	 * all the dirty bits in the folio here.
-	 */
-	iomap_clear_range_dirty(folio, 0, folio_size(folio));
-	folio_start_writeback(folio);
-	folio_unlock(folio);
-
-	/*
-	 * Preserve the original error if there was one; catch
-	 * submission errors here and propagate into subsequent ioend
-	 * submissions.
-	 */
-	list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
-		int error2;
-
-		list_del_init(&ioend->io_list);
-		error2 = iomap_submit_ioend(wpc, ioend, error);
-		if (error2 && !error)
-			error = error2;
-	}
-
-	/*
-	 * We can end up here with no error and nothing to write only if we race
-	 * with a partial page truncate on a sub-page block sized filesystem.
-	 */
-	if (!count)
-		folio_end_writeback(folio);
-done:
-	mapping_set_error(inode->i_mapping, error);
+	if (error && wpc->ops->discard_folio)
+		wpc->ops->discard_folio(folio, pos);
 	return error;
 }
 
 /*
- * Write out a dirty page.
+ * Check interaction of the folio with the file end.
  *
- * For delalloc space on the page, we need to allocate space and flush it.
- * For unwritten space on the page, we need to start the conversion to
- * regular allocated space.
+ * If the folio is entirely beyond i_size, return false.  If it straddles
+ * i_size, adjust end_pos and zero all data beyond i_size.
  */
-static int iomap_do_writepage(struct folio *folio,
-		struct writeback_control *wbc, void *data)
+static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
+		u64 *end_pos)
 {
-	struct iomap_writepage_ctx *wpc = data;
-	struct inode *inode = folio->mapping->host;
-	u64 end_pos, isize;
-
-	trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio));
+	u64 isize = i_size_read(inode);
 
-	/*
-	 * Refuse to write the folio out if we're called from reclaim context.
-	 *
-	 * This avoids stack overflows when called from deeply used stacks in
-	 * random callers for direct reclaim or memcg reclaim.  We explicitly
-	 * allow reclaim from kswapd as the stack usage there is relatively low.
-	 *
-	 * This should never happen except in the case of a VM regression so
-	 * warn about it.
-	 */
-	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
-			PF_MEMALLOC))
-		goto redirty;
-
-	/*
-	 * Is this folio beyond the end of the file?
-	 *
-	 * The folio index is less than the end_index, adjust the end_pos
-	 * to the highest offset that this folio should represent.
-	 * -----------------------------------------------------
-	 * |			file mapping	       | <EOF> |
-	 * -----------------------------------------------------
-	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
-	 * ^--------------------------------^----------|--------
-	 * |     desired writeback range    |      see else    |
-	 * ---------------------------------^------------------|
-	 */
-	isize = i_size_read(inode);
-	end_pos = folio_pos(folio) + folio_size(folio);
-	if (end_pos > isize) {
-		/*
-		 * Check whether the page to write out is beyond or straddles
-		 * i_size or not.
-		 * -------------------------------------------------------
-		 * |		file mapping		        | <EOF>  |
-		 * -------------------------------------------------------
-		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
-		 * ^--------------------------------^-----------|---------
-		 * |				    |      Straddles     |
-		 * ---------------------------------^-----------|--------|
-		 */
+	if (*end_pos > isize) {
 		size_t poff = offset_in_folio(folio, isize);
 		pgoff_t end_index = isize >> PAGE_SHIFT;
 
 		/*
-		 * Skip the page if it's fully outside i_size, e.g.
-		 * due to a truncate operation that's in progress.  We've
-		 * cleaned this page and truncate will finish things off for
-		 * us.
+		 * If the folio is entirely outside of i_size, skip it.
+		 *
+		 * This can happen due to a truncate operation that is in
+		 * progress and in that case truncate will finish it off once
+		 * we've dropped the folio lock.
 		 *
-		 * Note that the end_index is unsigned long.  If the given
-		 * offset is greater than 16TB on a 32-bit system then if we
-		 * checked if the page is fully outside i_size with
-		 * "if (page->index >= end_index + 1)", "end_index + 1" would
-		 * overflow and evaluate to 0.  Hence this page would be
+		 * Note that the pgoff_t used for end_index is an unsigned long.
+		 * If the given offset is greater than 16TB on a 32-bit system,
+		 * then if we checked if the folio is fully outside i_size with
+		 * "if (folio->index >= end_index + 1)", "end_index + 1" would
+		 * overflow and evaluate to 0.  Hence this folio would be
 		 * redirtied and written out repeatedly, which would result in
 		 * an infinite loop; the user program performing this operation
 		 * would hang.  Instead, we can detect this situation by
-		 * checking if the page is totally beyond i_size or if its
+		 * checking if the folio is totally beyond i_size or if its
 		 * offset is just equal to the EOF.
 		 */
 		if (folio->index > end_index ||
 		    (folio->index == end_index && poff == 0))
-			goto unlock;
+			return false;
 
 		/*
-		 * The page straddles i_size.  It must be zeroed out on each
-		 * and every writepage invocation because it may be mmapped.
-		 * "A file is mapped in multiples of the page size.  For a file
-		 * that is not a multiple of the page size, the remaining
-		 * memory is zeroed when mapped, and writes to that region are
-		 * not written out to the file."
+		 * The folio straddles i_size.
+		 *
+		 * It must be zeroed out on each and every writepage invocation
+		 * because it may be mmapped:
+		 *
+		 *    A file is mapped in multiples of the page size.  For a
+		 *    file that is not a multiple of the page size, the
+		 *    remaining memory is zeroed when mapped, and writes to that
+		 *    region are not written out to the file.
+		 *
+		 * Also adjust the writeback range to skip all blocks entirely
+		 * beyond i_size.
 		 */
 		folio_zero_segment(folio, poff, folio_size(folio));
-		end_pos = isize;
+		*end_pos = round_up(isize, i_blocksize(inode));
+	}
+
+	return true;
+}
+
+static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
+		struct writeback_control *wbc, struct folio *folio)
+{
+	struct iomap_folio_state *ifs = folio->private;
+	struct inode *inode = folio->mapping->host;
+	u64 pos = folio_pos(folio);
+	u64 end_pos = pos + folio_size(folio);
+	unsigned count = 0;
+	int error = 0;
+	u32 rlen;
+
+	WARN_ON_ONCE(!folio_test_locked(folio));
+	WARN_ON_ONCE(folio_test_dirty(folio));
+	WARN_ON_ONCE(folio_test_writeback(folio));
+
+	trace_iomap_writepage(inode, pos, folio_size(folio));
+
+	if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) {
+		folio_unlock(folio);
+		return 0;
+	}
+	WARN_ON_ONCE(end_pos <= pos);
+
+	if (i_blocks_per_folio(inode, folio) > 1) {
+		if (!ifs) {
+			ifs = ifs_alloc(inode, folio, 0);
+			iomap_set_range_dirty(folio, 0, end_pos - pos);
+		}
+
+		/*
+		 * Keep the I/O completion handler from clearing the writeback
+		 * bit until we have submitted all blocks by adding a bias to
+		 * ifs->write_bytes_pending, which is dropped after submitting
+		 * all blocks.
+		 */
+		WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
+		atomic_inc(&ifs->write_bytes_pending);
 	}
 
-	return iomap_writepage_map(wpc, wbc, inode, folio, end_pos);
+	/*
+	 * Set the writeback bit ASAP, as the I/O completion for the single
+	 * block per folio case can hit as soon as we're submitting the bio.
+	 */
+	folio_start_writeback(folio);
 
-redirty:
-	folio_redirty_for_writepage(wbc, folio);
-unlock:
+	/*
+	 * Walk through the folio to find dirty areas to write back.
+	 */
+	while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
+		error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
+				pos, rlen, &count);
+		if (error)
+			break;
+		pos += rlen;
+	}
+
+	if (count)
+		wpc->nr_folios++;
+
+	/*
+	 * We can have dirty bits set past end of file in page_mkwrite path
+	 * while mapping the last partial folio. Hence it's better to clear
+	 * all the dirty bits in the folio here.
+	 */
+	iomap_clear_range_dirty(folio, 0, folio_size(folio));
+
+	/*
+	 * Usually the writeback bit is cleared by the I/O completion handler.
+	 * But we may end up either not actually writing any blocks, or (when
+	 * there are multiple blocks in a folio) all I/O might have finished
+	 * already at this point.  In that case we need to clear the writeback
+	 * bit ourselves right after unlocking the page.
+	 */
 	folio_unlock(folio);
-	return 0;
+	if (ifs) {
+		if (atomic_dec_and_test(&ifs->write_bytes_pending))
+			folio_end_writeback(folio);
+	} else {
+		if (!count)
+			folio_end_writeback(folio);
+	}
+	mapping_set_error(inode->i_mapping, error);
+	return error;
+}
+
+static int iomap_do_writepage(struct folio *folio,
+		struct writeback_control *wbc, void *data)
+{
+	return iomap_writepage_map(data, wbc, folio);
 }
 
 int
@@ -1988,18 +1971,24 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
 {
 	int			ret;
 
+	/*
+	 * Writeback from reclaim context should never happen except in the case
+	 * of a VM regression so warn about it and refuse to write the data.
+	 */
+	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) ==
+			PF_MEMALLOC))
+		return -EIO;
+
 	wpc->ops = ops;
 	ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
-	if (!wpc->ioend)
-		return ret;
-	return iomap_submit_ioend(wpc, wpc->ioend, ret);
+	return iomap_submit_ioend(wpc, ret);
 }
 EXPORT_SYMBOL_GPL(iomap_writepages);
 
 static int __init iomap_init(void)
 {
 	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
-			   offsetof(struct iomap_ioend, io_inline_bio),
+			   offsetof(struct iomap_ioend, io_bio),
 			   BIOSET_NEED_BVECS);
 }
 fs_initcall(iomap_init);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bcd3f8cf5ea4..f3b43d223a46 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -380,6 +380,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
 					  GFP_KERNEL);
 		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+		bio->bi_write_hint = inode->i_write_hint;
 		bio->bi_ioprio = dio->iocb->ki_ioprio;
 		bio->bi_private = dio;
 		bio->bi_end_io = iomap_dio_bio_end_io;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index c16fd55f5595..0a991c4ce87d 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -154,7 +154,48 @@ DEFINE_EVENT(iomap_class, name,	\
 	TP_ARGS(inode, iomap))
 DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
 DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
-DEFINE_IOMAP_EVENT(iomap_writepage_map);
+
+TRACE_EVENT(iomap_writepage_map,
+	TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len,
+		 struct iomap *iomap),
+	TP_ARGS(inode, pos, dirty_len, iomap),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(u64, ino)
+		__field(u64, pos)
+		__field(u64, dirty_len)
+		__field(u64, addr)
+		__field(loff_t, offset)
+		__field(u64, length)
+		__field(u16, type)
+		__field(u16, flags)
+		__field(dev_t, bdev)
+	),
+	TP_fast_assign(
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = inode->i_ino;
+		__entry->pos = pos;
+		__entry->dirty_len = dirty_len;
+		__entry->addr = iomap->addr;
+		__entry->offset = iomap->offset;
+		__entry->length = iomap->length;
+		__entry->type = iomap->type;
+		__entry->flags = iomap->flags;
+		__entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
+	),
+	TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx "
+		  "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  MAJOR(__entry->bdev), MINOR(__entry->bdev),
+		  __entry->pos,
+		  __entry->dirty_len,
+		  __entry->addr,
+		  __entry->offset,
+		  __entry->length,
+		  __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
+		  __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
+);
 
 TRACE_EVENT(iomap_iter,
 	TP_PROTO(struct iomap_iter *iter, const void *ops,
@@ -165,6 +206,7 @@ TRACE_EVENT(iomap_iter,
 		__field(u64, ino)
 		__field(loff_t, pos)
 		__field(u64, length)
+		__field(s64, processed)
 		__field(unsigned int, flags)
 		__field(const void *, ops)
 		__field(unsigned long, caller)
@@ -174,15 +216,17 @@ TRACE_EVENT(iomap_iter,
 		__entry->ino = iter->inode->i_ino;
 		__entry->pos = iter->pos;
 		__entry->length = iomap_length(iter);
+		__entry->processed = iter->processed;
 		__entry->flags = iter->flags;
 		__entry->ops = ops;
 		__entry->caller = caller;
 	),
-	TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS",
+	TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		   __entry->ino,
 		   __entry->pos,
 		   __entry->length,
+		   __entry->processed,
 		   __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
 		   __entry->flags,
 		   __entry->ops,
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 8eec84c651bf..cb3cda1390ad 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -2763,9 +2763,7 @@ static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl)
  *	leafno	- the number of the leaf to be updated.
  *	newval	- the new value for the leaf.
  *
- * RETURN VALUES:
- *  0		- success
- *	-EIO	- i/o error
+ * RETURN VALUES: none
  */
 static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl)
 {
@@ -2792,10 +2790,6 @@ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl)
 		 * get the buddy size (number of words covered) of
 		 * the new value.
 		 */
-
-		if ((newval - tp->dmt_budmin) > BUDMIN)
-			return -EIO;
-
 		budsz = BUDSIZE(newval, tp->dmt_budmin);
 
 		/* try to join.
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 8d8e556bd610..73f09a762b79 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -932,7 +932,7 @@ static int __init init_jfs_fs(void)
 
 	jfs_inode_cachep =
 	    kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info),
-			0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+			0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
 			offsetof(struct jfs_inode_info, i_inline_all),
 			sizeof_field(struct jfs_inode_info, i_inline_all),
 			init_once);
diff --git a/fs/libfs.c b/fs/libfs.c
index 8b67cb4655d5..0d14ae808fcf 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -23,6 +23,7 @@
 #include <linux/fsnotify.h>
 #include <linux/unicode.h>
 #include <linux/fscrypt.h>
+#include <linux/pidfs.h>
 
 #include <linux/uaccess.h>
 
@@ -240,17 +241,22 @@ const struct inode_operations simple_dir_inode_operations = {
 };
 EXPORT_SYMBOL(simple_dir_inode_operations);
 
-static void offset_set(struct dentry *dentry, u32 offset)
+/* 0 is '.', 1 is '..', so always start with offset 2 or more */
+enum {
+	DIR_OFFSET_MIN	= 2,
+};
+
+static void offset_set(struct dentry *dentry, long offset)
 {
-	dentry->d_fsdata = (void *)((uintptr_t)(offset));
+	dentry->d_fsdata = (void *)offset;
 }
 
-static u32 dentry2offset(struct dentry *dentry)
+static long dentry2offset(struct dentry *dentry)
 {
-	return (u32)((uintptr_t)(dentry->d_fsdata));
+	return (long)dentry->d_fsdata;
 }
 
-static struct lock_class_key simple_offset_xa_lock;
+static struct lock_class_key simple_offset_lock_class;
 
 /**
  * simple_offset_init - initialize an offset_ctx
@@ -259,11 +265,9 @@ static struct lock_class_key simple_offset_xa_lock;
  */
 void simple_offset_init(struct offset_ctx *octx)
 {
-	xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1);
-	lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock);
-
-	/* 0 is '.', 1 is '..', so always start with offset 2 */
-	octx->next_offset = 2;
+	mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE);
+	lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class);
+	octx->next_offset = DIR_OFFSET_MIN;
 }
 
 /**
@@ -271,20 +275,19 @@ void simple_offset_init(struct offset_ctx *octx)
  * @octx: directory offset ctx to be updated
  * @dentry: new dentry being added
  *
- * Returns zero on success. @so_ctx and the dentry offset are updated.
+ * Returns zero on success. @octx and the dentry's offset are updated.
  * Otherwise, a negative errno value is returned.
  */
 int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
 {
-	static const struct xa_limit limit = XA_LIMIT(2, U32_MAX);
-	u32 offset;
+	unsigned long offset;
 	int ret;
 
 	if (dentry2offset(dentry) != 0)
 		return -EBUSY;
 
-	ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit,
-			      &octx->next_offset, GFP_KERNEL);
+	ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN,
+				 LONG_MAX, &octx->next_offset, GFP_KERNEL);
 	if (ret < 0)
 		return ret;
 
@@ -300,17 +303,49 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
  */
 void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
 {
-	u32 offset;
+	long offset;
 
 	offset = dentry2offset(dentry);
 	if (offset == 0)
 		return;
 
-	xa_erase(&octx->xa, offset);
+	mtree_erase(&octx->mt, offset);
 	offset_set(dentry, 0);
 }
 
 /**
+ * simple_offset_empty - Check if a dentry can be unlinked
+ * @dentry: dentry to be tested
+ *
+ * Returns 0 if @dentry is a non-empty directory; otherwise returns 1.
+ */
+int simple_offset_empty(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	struct offset_ctx *octx;
+	struct dentry *child;
+	unsigned long index;
+	int ret = 1;
+
+	if (!inode || !S_ISDIR(inode->i_mode))
+		return ret;
+
+	index = DIR_OFFSET_MIN;
+	octx = inode->i_op->get_offset_ctx(inode);
+	mt_for_each(&octx->mt, child, index, LONG_MAX) {
+		spin_lock(&child->d_lock);
+		if (simple_positive(child)) {
+			spin_unlock(&child->d_lock);
+			ret = 0;
+			break;
+		}
+		spin_unlock(&child->d_lock);
+	}
+
+	return ret;
+}
+
+/**
  * simple_offset_rename_exchange - exchange rename with directory offsets
  * @old_dir: parent of dentry being moved
  * @old_dentry: dentry being moved
@@ -327,8 +362,8 @@ int simple_offset_rename_exchange(struct inode *old_dir,
 {
 	struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
 	struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
-	u32 old_index = dentry2offset(old_dentry);
-	u32 new_index = dentry2offset(new_dentry);
+	long old_index = dentry2offset(old_dentry);
+	long new_index = dentry2offset(new_dentry);
 	int ret;
 
 	simple_offset_remove(old_ctx, old_dentry);
@@ -354,9 +389,9 @@ int simple_offset_rename_exchange(struct inode *old_dir,
 
 out_restore:
 	offset_set(old_dentry, old_index);
-	xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL);
+	mtree_store(&old_ctx->mt, old_index, old_dentry, GFP_KERNEL);
 	offset_set(new_dentry, new_index);
-	xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL);
+	mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL);
 	return ret;
 }
 
@@ -369,7 +404,7 @@ out_restore:
  */
 void simple_offset_destroy(struct offset_ctx *octx)
 {
-	xa_destroy(&octx->xa);
+	mtree_destroy(&octx->mt);
 }
 
 /**
@@ -399,15 +434,16 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
 
 	/* In this case, ->private_data is protected by f_pos_lock */
 	file->private_data = NULL;
-	return vfs_setpos(file, offset, U32_MAX);
+	return vfs_setpos(file, offset, LONG_MAX);
 }
 
-static struct dentry *offset_find_next(struct xa_state *xas)
+static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
 {
+	MA_STATE(mas, &octx->mt, offset, offset);
 	struct dentry *child, *found = NULL;
 
 	rcu_read_lock();
-	child = xas_next_entry(xas, U32_MAX);
+	child = mas_find(&mas, LONG_MAX);
 	if (!child)
 		goto out;
 	spin_lock(&child->d_lock);
@@ -421,8 +457,8 @@ out:
 
 static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
 {
-	u32 offset = dentry2offset(dentry);
 	struct inode *inode = d_inode(dentry);
+	long offset = dentry2offset(dentry);
 
 	return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
 			  inode->i_ino, fs_umode_to_dtype(inode->i_mode));
@@ -430,12 +466,11 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
 
 static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
 {
-	struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
-	XA_STATE(xas, &so_ctx->xa, ctx->pos);
+	struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
 	struct dentry *dentry;
 
 	while (true) {
-		dentry = offset_find_next(&xas);
+		dentry = offset_find_next(octx, ctx->pos);
 		if (!dentry)
 			return ERR_PTR(-ENOENT);
 
@@ -444,8 +479,8 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
 			break;
 		}
 
+		ctx->pos = dentry2offset(dentry) + 1;
 		dput(dentry);
-		ctx->pos = xas.xa_index + 1;
 	}
 	return NULL;
 }
@@ -481,7 +516,7 @@ static int offset_readdir(struct file *file, struct dir_context *ctx)
 		return 0;
 
 	/* In this case, ->private_data is protected by f_pos_lock */
-	if (ctx->pos == 2)
+	if (ctx->pos == DIR_OFFSET_MIN)
 		file->private_data = NULL;
 	else if (file->private_data == ERR_PTR(-ENOENT))
 		return 0;
@@ -1704,16 +1739,28 @@ bool is_empty_dir_inode(struct inode *inode)
 static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
 				const char *str, const struct qstr *name)
 {
-	const struct dentry *parent = READ_ONCE(dentry->d_parent);
-	const struct inode *dir = READ_ONCE(parent->d_inode);
-	const struct super_block *sb = dentry->d_sb;
-	const struct unicode_map *um = sb->s_encoding;
-	struct qstr qstr = QSTR_INIT(str, len);
+	const struct dentry *parent;
+	const struct inode *dir;
 	char strbuf[DNAME_INLINE_LEN];
-	int ret;
+	struct qstr qstr;
+
+	/*
+	 * Attempt a case-sensitive match first. It is cheaper and
+	 * should cover most lookups, including all the sane
+	 * applications that expect a case-sensitive filesystem.
+	 *
+	 * This comparison is safe under RCU because the caller
+	 * guarantees the consistency between str and len. See
+	 * __d_lookup_rcu_op_compare() for details.
+	 */
+	if (len == name->len && !memcmp(str, name->name, len))
+		return 0;
 
+	parent = READ_ONCE(dentry->d_parent);
+	dir = READ_ONCE(parent->d_inode);
 	if (!dir || !IS_CASEFOLDED(dir))
-		goto fallback;
+		return 1;
+
 	/*
 	 * If the dentry name is stored in-line, then it may be concurrently
 	 * modified by a rename.  If this happens, the VFS will eventually retry
@@ -1724,20 +1771,14 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
 	if (len <= DNAME_INLINE_LEN - 1) {
 		memcpy(strbuf, str, len);
 		strbuf[len] = 0;
-		qstr.name = strbuf;
+		str = strbuf;
 		/* prevent compiler from optimizing out the temporary buffer */
 		barrier();
 	}
-	ret = utf8_strncasecmp(um, name, &qstr);
-	if (ret >= 0)
-		return ret;
+	qstr.len = len;
+	qstr.name = str;
 
-	if (sb_has_strict_encoding(sb))
-		return -EINVAL;
-fallback:
-	if (len != name->len)
-		return 1;
-	return !!memcmp(str, name->name, len);
+	return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
 }
 
 /**
@@ -1752,7 +1793,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
 	const struct inode *dir = READ_ONCE(dentry->d_inode);
 	struct super_block *sb = dentry->d_sb;
 	const struct unicode_map *um = sb->s_encoding;
-	int ret = 0;
+	int ret;
 
 	if (!dir || !IS_CASEFOLDED(dir))
 		return 0;
@@ -1766,73 +1807,45 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
 static const struct dentry_operations generic_ci_dentry_ops = {
 	.d_hash = generic_ci_d_hash,
 	.d_compare = generic_ci_d_compare,
-};
-#endif
-
 #ifdef CONFIG_FS_ENCRYPTION
-static const struct dentry_operations generic_encrypted_dentry_ops = {
 	.d_revalidate = fscrypt_d_revalidate,
+#endif
 };
 #endif
 
-#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
-static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
-	.d_hash = generic_ci_d_hash,
-	.d_compare = generic_ci_d_compare,
+#ifdef CONFIG_FS_ENCRYPTION
+static const struct dentry_operations generic_encrypted_dentry_ops = {
 	.d_revalidate = fscrypt_d_revalidate,
 };
 #endif
 
 /**
- * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry
- * @dentry:	dentry to set ops on
- *
- * Casefolded directories need d_hash and d_compare set, so that the dentries
- * contained in them are handled case-insensitively.  Note that these operations
- * are needed on the parent directory rather than on the dentries in it, and
- * while the casefolding flag can be toggled on and off on an empty directory,
- * dentry_operations can't be changed later.  As a result, if the filesystem has
- * casefolding support enabled at all, we have to give all dentries the
- * casefolding operations even if their inode doesn't have the casefolding flag
- * currently (and thus the casefolding ops would be no-ops for now).
- *
- * Encryption works differently in that the only dentry operation it needs is
- * d_revalidate, which it only needs on dentries that have the no-key name flag.
- * The no-key flag can't be set "later", so we don't have to worry about that.
+ * generic_set_sb_d_ops - helper for choosing the set of
+ * filesystem-wide dentry operations for the enabled features
+ * @sb: superblock to be configured
  *
- * Finally, to maximize compatibility with overlayfs (which isn't compatible
- * with certain dentry operations) and to avoid taking an unnecessary
- * performance hit, we use custom dentry_operations for each possible
- * combination rather than always installing all operations.
+ * Filesystems supporting casefolding and/or fscrypt can call this
+ * helper at mount-time to configure sb->s_d_op to the best set of dentry
+ * operations required for the enabled features. The helper must be
+ * called after these have been configured, but before the root dentry
+ * is created.
  */
-void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
+void generic_set_sb_d_ops(struct super_block *sb)
 {
-#ifdef CONFIG_FS_ENCRYPTION
-	bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
-#endif
 #if IS_ENABLED(CONFIG_UNICODE)
-	bool needs_ci_ops = dentry->d_sb->s_encoding;
-#endif
-#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
-	if (needs_encrypt_ops && needs_ci_ops) {
-		d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
+	if (sb->s_encoding) {
+		sb->s_d_op = &generic_ci_dentry_ops;
 		return;
 	}
 #endif
 #ifdef CONFIG_FS_ENCRYPTION
-	if (needs_encrypt_ops) {
-		d_set_d_op(dentry, &generic_encrypted_dentry_ops);
-		return;
-	}
-#endif
-#if IS_ENABLED(CONFIG_UNICODE)
-	if (needs_ci_ops) {
-		d_set_d_op(dentry, &generic_ci_dentry_ops);
+	if (sb->s_cop) {
+		sb->s_d_op = &generic_encrypted_dentry_ops;
 		return;
 	}
 #endif
 }
-EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
+EXPORT_SYMBOL(generic_set_sb_d_ops);
 
 /**
  * inode_maybe_inc_iversion - increments i_version
@@ -1973,3 +1986,167 @@ struct timespec64 simple_inode_init_ts(struct inode *inode)
 	return ts;
 }
 EXPORT_SYMBOL(simple_inode_init_ts);
+
+/*
+ * Try to take a reference on the dentry currently stored at @stashed.
+ *
+ * The slot itself is loaded inside the RCU read-side section, so the load
+ * and lockref_get_not_dead() are covered by the same critical section; a
+ * pointer loaded by the caller beforehand could already be stale.
+ * Returns the referenced dentry, or NULL if none could be obtained.
+ */
+static inline struct dentry *get_stashed_dentry(struct dentry **stashed)
+{
+	struct dentry *dentry;
+
+	guard(rcu)();
+	dentry = READ_ONCE(*stashed);
+	if (!dentry)
+		return NULL;
+	if (!lockref_get_not_dead(&dentry->d_lockref))
+		return NULL;
+	return dentry;
+}
+
+/*
+ * Allocate an anonymous dentry on @sb backed by a new pseudo inode numbered
+ * @ino. The inode is finished by the filesystem's ->init_inode() with @data,
+ * and the dentry records @stashed in ->d_fsdata for stashed_dentry_prune().
+ * Returns the dentry or ERR_PTR(-ENOMEM).
+ */
+static struct dentry *prepare_anon_dentry(struct dentry **stashed,
+					  unsigned long ino,
+					  struct super_block *sb,
+					  void *data)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+	const struct stashed_operations *sops = sb->s_fs_info;
+
+	dentry = d_alloc_anon(sb);
+	if (!dentry)
+		return ERR_PTR(-ENOMEM);
+
+	inode = new_inode_pseudo(sb);
+	if (!inode) {
+		dput(dentry);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	inode->i_ino = ino;
+	inode->i_flags |= S_IMMUTABLE;
+	inode->i_mode = S_IFREG;
+	simple_inode_init_ts(inode);
+	sops->init_inode(inode, data);
+
+	/* Notice when this is changed. */
+	WARN_ON_ONCE(!S_ISREG(inode->i_mode));
+	WARN_ON_ONCE(!IS_IMMUTABLE(inode));
+
+	/* Store address of location where dentry's supposed to be stashed. */
+	dentry->d_fsdata = stashed;
+
+	/* @data is now owned by the fs */
+	d_instantiate(dentry, inode);
+	return dentry;
+}
+
+/*
+ * Publish @dentry in the slot at @stashed. If another live dentry is already
+ * stashed there, a reference to it is taken and it is returned instead;
+ * a dead one is replaced. Returns the dentry the caller should use.
+ */
+static struct dentry *stash_dentry(struct dentry **stashed,
+				   struct dentry *dentry)
+{
+	guard(rcu)();
+	for (;;) {
+		struct dentry *old;
+
+		/* Assume any old dentry was cleared out. */
+		old = cmpxchg(stashed, NULL, dentry);
+		if (likely(!old))
+			return dentry;
+
+		/* Check if somebody else installed a reusable dentry. */
+		if (lockref_get_not_dead(&old->d_lockref))
+			return old;
+
+		/* There's an old dead dentry there, try to take it over. */
+		if (likely(try_cmpxchg(stashed, &old, dentry)))
+			return dentry;
+	}
+}
+
+/**
+ * path_from_stashed - create path from stashed or new dentry
+ * @stashed:    where to retrieve or stash dentry
+ * @ino:        inode number to use
+ * @mnt:        mnt of the filesystem to use
+ * @data:       data to store in inode->i_private
+ * @path:       path to create
+ *
+ * The function tries to retrieve a stashed dentry from @stashed. If the dentry
+ * is still valid then it will be reused. If the dentry isn't usable the function
+ * will allocate a new dentry and inode. It will then check again whether it
+ * can reuse an existing dentry in case one has been added in the meantime or
+ * update @stashed with the newly added dentry.
+ *
+ * Special-purpose helper for nsfs and pidfs.
+ *
+ * Return: On success zero and on failure a negative error is returned.
+ */
+int path_from_stashed(struct dentry **stashed, unsigned long ino,
+		      struct vfsmount *mnt, void *data, struct path *path)
+{
+	struct dentry *dentry;
+	const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
+
+	/* See if dentry can be reused. */
+	path->dentry = get_stashed_dentry(stashed);
+	if (path->dentry) {
+		sops->put_data(data);
+		goto out_path;
+	}
+
+	/* Allocate a new dentry. */
+	dentry = prepare_anon_dentry(stashed, ino, mnt->mnt_sb, data);
+	if (IS_ERR(dentry)) {
+		sops->put_data(data);
+		return PTR_ERR(dentry);
+	}
+
+	/* Added a new dentry. @data is now owned by the filesystem. */
+	path->dentry = stash_dentry(stashed, dentry);
+	if (path->dentry != dentry)
+		dput(dentry);
+
+out_path:
+	WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
+	WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
+	path->mnt = mntget(mnt);
+	return 0;
+}
+
+/*
+ * Clear the stash slot recorded in @dentry->d_fsdata, but only if it still
+ * points at @dentry. Dentries without an inode are ignored.
+ */
+void stashed_dentry_prune(struct dentry *dentry)
+{
+	struct dentry **stashed = dentry->d_fsdata;
+	struct inode *inode = d_inode(dentry);
+
+	if (WARN_ON_ONCE(!stashed))
+		return;
+
+	if (!inode)
+		return;
+
+	/*
+	 * Only replace our own @dentry as someone else might've
+	 * already cleared out @dentry and stashed their own
+	 * dentry in there.
+	 */
+	cmpxchg(stashed, dentry, NULL);
+}
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 82aa7a35db26..e60a840999aa 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -426,9 +426,7 @@ EXPORT_SYMBOL(mb_cache_destroy);
 
 static int __init mbcache_init(void)
 {
-	mb_entry_cache = kmem_cache_create("mbcache",
-				sizeof(struct mb_cache_entry), 0,
-				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+	mb_entry_cache = KMEM_CACHE(mb_cache_entry, SLAB_RECLAIM_ACCOUNT);
 	if (!mb_entry_cache)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 73f37f298087..7cbd2b9f4d11 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -87,7 +87,7 @@ static int __init init_inodecache(void)
 	minix_inode_cachep = kmem_cache_create("minix_inode_cache",
 					     sizeof(struct minix_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					     init_once);
 	if (minix_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 64c5205e2b5e..3c60f1eaca61 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -214,7 +214,7 @@ static int copy_mnt_idmap(struct uid_gid_map *map_from,
 	 * anything at all.
 	 */
 	if (nr_extents == 0)
-		return 0;
+		return -EINVAL;
 
 	/*
 	 * Here we know that nr_extents is greater than zero which means
diff --git a/fs/mpage.c b/fs/mpage.c
index 738882e0766d..fa8b99a199fa 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -605,6 +605,7 @@ alloc_new:
 				GFP_NOFS);
 		bio->bi_iter.bi_sector = first_block << (blkbits - 9);
 		wbc_init_bio(wbc, bio);
+		bio->bi_write_hint = inode->i_write_hint;
 	}
 
 	/*
diff --git a/fs/namei.c b/fs/namei.c
index 4e0de939fea1..d0c4a3e9278e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1717,7 +1717,11 @@ static inline int may_lookup(struct mnt_idmap *idmap,
 {
 	if (nd->flags & LOOKUP_RCU) {
 		int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
-		if (err != -ECHILD || !try_to_unlazy(nd))
+		if (!err)		// success, keep going
+			return 0;
+		if (!try_to_unlazy(nd))
+			return -ECHILD;	// redo it all non-lazy
+		if (err != -ECHILD)	// hard error
 			return err;
 	}
 	return inode_permission(idmap, nd->inode, MAY_EXEC);
@@ -2676,10 +2680,8 @@ static int lookup_one_common(struct mnt_idmap *idmap,
 	if (!len)
 		return -EACCES;
 
-	if (unlikely(name[0] == '.')) {
-		if (len < 2 || (len == 2 && name[1] == '.'))
-			return -EACCES;
-	}
+	if (is_dot_dotdot(name, len))
+		return -EACCES;
 
 	while (len--) {
 		unsigned int c = *(const unsigned char *)name++;
diff --git a/fs/namespace.c b/fs/namespace.c
index 437f60e96d40..5a51315c6678 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4472,10 +4472,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
 	/*
 	 * If this is an attached mount make sure it's located in the callers
 	 * mount namespace. If it's not don't let the caller interact with it.
-	 * If this is a detached mount make sure it has an anonymous mount
-	 * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE.
+	 *
+	 * If this mount doesn't have a parent it's most often simply a
+	 * detached mount with an anonymous mount namespace. IOW, something
+	 * that's simply not attached yet. But there are apparently also users
+	 * that do change mount properties on the rootfs itself. That obviously
+	 * neither has a parent nor is it a detached mount so we cannot
+	 * unconditionally check for detached mounts.
 	 */
-	if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns)))
+	if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
 		goto out;
 
 	/*
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index a59e7b2edaac..3298c29b5548 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -101,7 +101,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
 		}
 
 		if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
-			if (folio_index(folio) == rreq->no_unlock_folio &&
+			if (folio->index == rreq->no_unlock_folio &&
 			    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
 				_debug("no unlock");
 			else
@@ -246,13 +246,13 @@ EXPORT_SYMBOL(netfs_readahead);
  */
 int netfs_read_folio(struct file *file, struct folio *folio)
 {
-	struct address_space *mapping = folio_file_mapping(folio);
+	struct address_space *mapping = folio->mapping;
 	struct netfs_io_request *rreq;
 	struct netfs_inode *ctx = netfs_inode(mapping->host);
 	struct folio *sink = NULL;
 	int ret;
 
-	_enter("%lx", folio_index(folio));
+	_enter("%lx", folio->index);
 
 	rreq = netfs_alloc_request(mapping, file,
 				   folio_file_pos(folio), folio_size(folio),
@@ -460,7 +460,7 @@ retry:
 		ret = PTR_ERR(rreq);
 		goto error;
 	}
-	rreq->no_unlock_folio	= folio_index(folio);
+	rreq->no_unlock_folio	= folio->index;
 	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
 
 	ret = netfs_begin_cache_read(rreq, ctx);
@@ -518,7 +518,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
 			     size_t offset, size_t len)
 {
 	struct netfs_io_request *rreq;
-	struct address_space *mapping = folio_file_mapping(folio);
+	struct address_space *mapping = folio->mapping;
 	struct netfs_inode *ctx = netfs_inode(mapping->host);
 	unsigned long long start = folio_pos(folio);
 	size_t flen = folio_size(folio);
@@ -535,7 +535,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
 		goto error;
 	}
 
-	rreq->no_unlock_folio = folio_index(folio);
+	rreq->no_unlock_folio = folio->index;
 	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
 	ret = netfs_begin_cache_read(rreq, ctx);
 	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 93dc76f34e39..9a0d32e4b422 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -221,10 +221,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 		if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
 			break;
 
-		ret = -ENOMEM;
 		folio = netfs_grab_folio_for_write(mapping, pos, part);
-		if (!folio)
+		if (IS_ERR(folio)) {
+			ret = PTR_ERR(folio);
 			break;
+		}
 
 		flen = folio_size(folio);
 		offset = pos & (flen - 1);
@@ -343,7 +344,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 			break;
 		default:
 			WARN(true, "Unexpected modify type %u ix=%lx\n",
-			     howto, folio_index(folio));
+			     howto, folio->index);
 			ret = -EIO;
 			goto error_folio_unlock;
 		}
@@ -476,6 +477,9 @@ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
 
+	if (!iov_iter_count(from))
+		return 0;
+
 	if ((iocb->ki_flags & IOCB_DIRECT) ||
 	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
 		return netfs_unbuffered_write_iter(iocb, from);
@@ -648,7 +652,7 @@ static void netfs_pages_written_back(struct netfs_io_request *wreq)
 	xas_for_each(&xas, folio, last) {
 		WARN(!folio_test_writeback(folio),
 		     "bad %zx @%llx page %lx %lx\n",
-		     wreq->len, wreq->start, folio_index(folio), last);
+		     wreq->len, wreq->start, folio->index, last);
 
 		if ((finfo = netfs_folio_info(folio))) {
 			/* Streaming writes cannot be redirtied whilst under
@@ -795,7 +799,7 @@ static void netfs_extend_writeback(struct address_space *mapping,
 				continue;
 			if (xa_is_value(folio))
 				break;
-			if (folio_index(folio) != index) {
+			if (folio->index != index) {
 				xas_reset(xas);
 				break;
 			}
@@ -901,7 +905,7 @@ static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
 	long count = wbc->nr_to_write;
 	int ret;
 
-	_enter(",%lx,%llx-%llx,%u", folio_index(folio), start, end, caching);
+	_enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching);
 
 	wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
 				   NETFS_WRITEBACK);
@@ -1047,7 +1051,7 @@ search_again:
 
 	start = folio_pos(folio); /* May regress with THPs */
 
-	_debug("wback %lx", folio_index(folio));
+	_debug("wback %lx", folio->index);
 
 	/* At this point we hold neither the i_pages lock nor the page lock:
 	 * the page may be truncated or invalidated (changing page->mapping to
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index 60a40d293c87..bee047e20f5d 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -139,6 +139,9 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
 
+	if (!iov_iter_count(from))
+		return 0;
+
 	trace_netfs_write_iter(iocb, from);
 	netfs_stat(&netfs_n_rh_dio_write);
 
@@ -146,7 +149,7 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret < 0)
 		return ret;
 	ret = generic_write_checks(iocb, from);
-	if (ret < 0)
+	if (ret <= 0)
 		goto out;
 	ret = file_remove_privs(file);
 	if (ret < 0)
diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c
index d645f8b302a2..9397ed39b0b4 100644
--- a/fs/netfs/fscache_cache.c
+++ b/fs/netfs/fscache_cache.c
@@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache);
 void fscache_put_cache(struct fscache_cache *cache,
 		       enum fscache_cache_trace where)
 {
-	unsigned int debug_id = cache->debug_id;
+	unsigned int debug_id;
 	bool zero;
 	int ref;
 
 	if (IS_ERR_OR_NULL(cache))
 		return;
 
+	debug_id = cache->debug_id;
 	zero = __refcount_dec_and_test(&cache->ref, &ref);
 	trace_fscache_cache(debug_id, ref - 1, where);
 
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 4309edf33862..4261ad6c55b6 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -124,7 +124,7 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
 			/* We might have multiple writes from the same huge
 			 * folio, but we mustn't unlock a folio more than once.
 			 */
-			if (have_unlocked && folio_index(folio) <= unlocked)
+			if (have_unlocked && folio->index <= unlocked)
 				continue;
 			unlocked = folio_next_index(folio) - 1;
 			trace_netfs_folio(folio, netfs_folio_trace_end_copy);
@@ -748,6 +748,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 
 	if (!rreq->submitted) {
 		netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+		if (rreq->origin == NETFS_DIO_READ)
+			inode_dio_end(rreq->inode);
 		ret = 0;
 		goto out;
 	}
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 0e3af37fc924..90051ced8e2a 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -180,7 +180,7 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
 	struct netfs_folio *finfo = NULL;
 	size_t flen = folio_size(folio);
 
-	_enter("{%lx},%zx,%zx", folio_index(folio), offset, length);
+	_enter("{%lx},%zx,%zx", folio->index, offset, length);
 
 	folio_wait_fscache(folio);
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 44eca51b2808..fbdc9ca80f71 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -246,7 +246,7 @@ void nfs_free_client(struct nfs_client *clp)
 	put_nfs_version(clp->cl_nfs_mod);
 	kfree(clp->cl_hostname);
 	kfree(clp->cl_acceptor);
-	kfree(clp);
+	kfree_rcu(clp, rcu);
 }
 EXPORT_SYMBOL_GPL(nfs_free_client);
 
@@ -1006,6 +1006,14 @@ struct nfs_server *nfs_alloc_server(void)
 }
 EXPORT_SYMBOL_GPL(nfs_alloc_server);
 
+static void delayed_free(struct rcu_head *p)
+{
+	struct nfs_server *server = container_of(p, struct nfs_server, rcu);
+
+	nfs_free_iostats(server->io_stats);
+	kfree(server);
+}
+
 /*
  * Free up a server record
  */
@@ -1031,10 +1039,9 @@ void nfs_free_server(struct nfs_server *server)
 
 	ida_destroy(&server->lockowner_id);
 	ida_destroy(&server->openowner_id);
-	nfs_free_iostats(server->io_stats);
 	put_cred(server->cred);
-	kfree(server);
 	nfs_release_automount_timer();
+	call_rcu(&server->rcu, delayed_free);
 }
 EXPORT_SYMBOL_GPL(nfs_free_server);
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c8ecbe999059..ac505671efbd 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1431,9 +1431,9 @@ static bool nfs_verifier_is_delegated(struct dentry *dentry)
 static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf)
 {
 	struct inode *inode = d_inode(dentry);
-	struct inode *dir = d_inode(dentry->d_parent);
+	struct inode *dir = d_inode_rcu(dentry->d_parent);
 
-	if (!nfs_verify_change_attribute(dir, verf))
+	if (!dir || !nfs_verify_change_attribute(dir, verf))
 		return;
 	if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
 		nfs_set_verifier_delegated(&verf);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6d52ecba8e9c..9257425cbd1a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4945,10 +4945,8 @@ nfsd_break_deleg_cb(struct file_lease *fl)
 	 */
 	fl->fl_break_time = 0;
 
-	spin_lock(&fp->fi_lock);
 	fp->fi_had_conflict = true;
 	nfsd_break_one_deleg(dp);
-	spin_unlock(&fp->fi_lock);
 	return false;
 }
 
@@ -5557,12 +5555,13 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 	if (status)
 		goto out_unlock;
 
+	status = -EAGAIN;
+	if (fp->fi_had_conflict)
+		goto out_unlock;
+
 	spin_lock(&state_lock);
 	spin_lock(&fp->fi_lock);
-	if (fp->fi_had_conflict)
-		status = -EAGAIN;
-	else
-		status = hash_delegation_locked(dp, fp);
+	status = hash_delegation_locked(dp, fp);
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&state_lock);
 
@@ -7911,14 +7910,16 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
 {
 	struct file_lock *fl;
 	int status = false;
-	struct nfsd_file *nf = find_any_file(fp);
+	struct nfsd_file *nf;
 	struct inode *inode;
 	struct file_lock_context *flctx;
 
+	spin_lock(&fp->fi_lock);
+	nf = find_any_file_locked(fp);
 	if (!nf) {
 		/* Any valid lock stateid should have some sort of access */
 		WARN_ON_ONCE(1);
-		return status;
+		goto out;
 	}
 
 	inode = file_inode(nf->nf_file);
@@ -7934,7 +7935,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
 		}
 		spin_unlock(&flctx->flc_lock);
 	}
-	nfsd_file_put(nf);
+out:
+	spin_unlock(&fp->fi_lock);
 	return status;
 }
 
@@ -7944,10 +7946,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
  * @cstate: NFSv4 COMPOUND state
  * @u: RELEASE_LOCKOWNER arguments
  *
- * The lockowner's so_count is bumped when a lock record is added
- * or when copying a conflicting lock. The latter case is brief,
- * but can lead to fleeting false positives when looking for
- * locks-in-use.
+ * Check if there are any locks still held and if not - free the lockowner
+ * and any lock state that is owned.
  *
  * Return values:
  *   %nfs_ok: lockowner released or not found
@@ -7983,10 +7983,13 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 		spin_unlock(&clp->cl_lock);
 		return nfs_ok;
 	}
-	if (atomic_read(&lo->lo_owner.so_count) != 2) {
-		spin_unlock(&clp->cl_lock);
-		nfs4_put_stateowner(&lo->lo_owner);
-		return nfserr_locks_held;
+
+	list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) {
+		if (check_for_locks(stp->st_stid.sc_file, lo)) {
+			spin_unlock(&clp->cl_lock);
+			nfs4_put_stateowner(&lo->lo_owner);
+			return nfserr_locks_held;
+		}
 	}
 	unhash_lockowner_locked(lo);
 	while (!list_empty(&lo->lo_owner.so_stateids)) {
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index bec33b89a075..0e3fc5ba33c7 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -107,7 +107,13 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
 	nilfs_transaction_commit(inode->i_sb);
 
  mapped:
-	folio_wait_stable(folio);
+	/*
+	 * Since checksumming including data blocks is performed to determine
+	 * the validity of the log to be written and used for recovery, it is
+	 * necessary to wait for writeback to finish here, regardless of the
+	 * stable write requirement of the backing device.
+	 */
+	folio_wait_writeback(folio);
  out:
 	sb_end_pagefault(inode->i_sb);
 	return vmf_fs_error(ret);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 0955b657938f..a9b8d77c8c1d 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -472,9 +472,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 
 static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
 				     struct nilfs_recovery_block *rb,
-				     struct page *page)
+				     loff_t pos, struct page *page)
 {
 	struct buffer_head *bh_org;
+	size_t from = pos & ~PAGE_MASK;
 	void *kaddr;
 
 	bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize);
@@ -482,7 +483,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
 		return -EIO;
 
 	kaddr = kmap_atomic(page);
-	memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size);
+	memcpy(kaddr + from, bh_org->b_data, bh_org->b_size);
 	kunmap_atomic(kaddr);
 	brelse(bh_org);
 	return 0;
@@ -521,7 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
 			goto failed_inode;
 		}
 
-		err = nilfs_recovery_copy_block(nilfs, rb, page);
+		err = nilfs_recovery_copy_block(nilfs, rb, pos, page);
 		if (unlikely(err))
 			goto failed_page;
 
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2590a0860eab..2bfb08052d39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			set_buffer_async_write(bh);
 			if (bh == segbuf->sb_super_root) {
 				if (bh->b_folio != bd_folio) {
 					folio_lock(bd_folio);
@@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
 				}
 				break;
 			}
+			set_buffer_async_write(bh);
 			if (bh->b_folio != fs_folio) {
 				nilfs_begin_folio_io(fs_folio);
 				fs_folio = bh->b_folio;
@@ -1800,7 +1800,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			clear_buffer_async_write(bh);
 			if (bh == segbuf->sb_super_root) {
 				clear_buffer_uptodate(bh);
 				if (bh->b_folio != bd_folio) {
@@ -1809,6 +1808,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
 				}
 				break;
 			}
+			clear_buffer_async_write(bh);
 			if (bh->b_folio != fs_folio) {
 				nilfs_end_folio_io(fs_folio, err);
 				fs_folio = bh->b_folio;
@@ -1896,8 +1896,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 				 BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
 				 BIT(BH_NILFS_Redirected));
 
-			set_mask_bits(&bh->b_state, clear_bits, set_bits);
 			if (bh == segbuf->sb_super_root) {
+				set_buffer_uptodate(bh);
+				clear_buffer_dirty(bh);
 				if (bh->b_folio != bd_folio) {
 					folio_end_writeback(bd_folio);
 					bd_folio = bh->b_folio;
@@ -1905,6 +1906,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 				update_sr = true;
 				break;
 			}
+			set_mask_bits(&bh->b_state, clear_bits, set_bits);
 			if (bh->b_folio != fs_folio) {
 				nilfs_end_folio_io(fs_folio, 0);
 				fs_folio = bh->b_folio;
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 34e1e3e36733..7aaafb5cb9fc 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -27,26 +27,17 @@ static const struct file_operations ns_file_operations = {
 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
 {
 	struct inode *inode = d_inode(dentry);
-	const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+	struct ns_common *ns = inode->i_private;
+	const struct proc_ns_operations *ns_ops = ns->ops;
 
 	return dynamic_dname(buffer, buflen, "%s:[%lu]",
 		ns_ops->name, inode->i_ino);
 }
 
-static void ns_prune_dentry(struct dentry *dentry)
-{
-	struct inode *inode = d_inode(dentry);
-	if (inode) {
-		struct ns_common *ns = inode->i_private;
-		atomic_long_set(&ns->stashed, 0);
-	}
-}
-
-const struct dentry_operations ns_dentry_operations =
-{
-	.d_prune	= ns_prune_dentry,
+const struct dentry_operations ns_dentry_operations = {
 	.d_delete	= always_delete_dentry,
 	.d_dname	= ns_dname,
+	.d_prune	= stashed_dentry_prune,
 };
 
 static void nsfs_evict(struct inode *inode)
@@ -56,67 +47,16 @@ static void nsfs_evict(struct inode *inode)
 	ns->ops->put(ns);
 }
 
-static int __ns_get_path(struct path *path, struct ns_common *ns)
-{
-	struct vfsmount *mnt = nsfs_mnt;
-	struct dentry *dentry;
-	struct inode *inode;
-	unsigned long d;
-
-	rcu_read_lock();
-	d = atomic_long_read(&ns->stashed);
-	if (!d)
-		goto slow;
-	dentry = (struct dentry *)d;
-	if (!lockref_get_not_dead(&dentry->d_lockref))
-		goto slow;
-	rcu_read_unlock();
-	ns->ops->put(ns);
-got_it:
-	path->mnt = mntget(mnt);
-	path->dentry = dentry;
-	return 0;
-slow:
-	rcu_read_unlock();
-	inode = new_inode_pseudo(mnt->mnt_sb);
-	if (!inode) {
-		ns->ops->put(ns);
-		return -ENOMEM;
-	}
-	inode->i_ino = ns->inum;
-	simple_inode_init_ts(inode);
-	inode->i_flags |= S_IMMUTABLE;
-	inode->i_mode = S_IFREG | S_IRUGO;
-	inode->i_fop = &ns_file_operations;
-	inode->i_private = ns;
-
-	dentry = d_make_root(inode);	/* not the normal use, but... */
-	if (!dentry)
-		return -ENOMEM;
-	dentry->d_fsdata = (void *)ns->ops;
-	d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
-	if (d) {
-		d_delete(dentry);	/* make sure ->d_prune() does nothing */
-		dput(dentry);
-		cpu_relax();
-		return -EAGAIN;
-	}
-	goto got_it;
-}
-
 int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
 		     void *private_data)
 {
-	int ret;
+	struct ns_common *ns;
 
-	do {
-		struct ns_common *ns = ns_get_cb(private_data);
-		if (!ns)
-			return -ENOENT;
-		ret = __ns_get_path(path, ns);
-	} while (ret == -EAGAIN);
+	ns = ns_get_cb(private_data);
+	if (!ns)
+		return -ENOENT;
 
-	return ret;
+	return path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt, ns, path);
 }
 
 struct ns_get_path_task_args {
@@ -146,6 +86,7 @@ int open_related_ns(struct ns_common *ns,
 		   struct ns_common *(*get_ns)(struct ns_common *ns))
 {
 	struct path path = {};
+	struct ns_common *relative;
 	struct file *f;
 	int err;
 	int fd;
@@ -154,19 +95,15 @@ int open_related_ns(struct ns_common *ns,
 	if (fd < 0)
 		return fd;
 
-	do {
-		struct ns_common *relative;
-
-		relative = get_ns(ns);
-		if (IS_ERR(relative)) {
-			put_unused_fd(fd);
-			return PTR_ERR(relative);
-		}
-
-		err = __ns_get_path(&path, relative);
-	} while (err == -EAGAIN);
+	relative = get_ns(ns);
+	if (IS_ERR(relative)) {
+		put_unused_fd(fd);
+		return PTR_ERR(relative);
+	}
 
-	if (err) {
+	err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt,
+				relative, &path);
+	if (err < 0) {
 		put_unused_fd(fd);
 		return err;
 	}
@@ -249,7 +186,8 @@ bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
 static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
 {
 	struct inode *inode = d_inode(dentry);
-	const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+	const struct ns_common *ns = inode->i_private;
+	const struct proc_ns_operations *ns_ops = ns->ops;
 
 	seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
 	return 0;
@@ -261,6 +199,24 @@ static const struct super_operations nsfs_ops = {
 	.show_path = nsfs_show_path,
 };
 
+static void nsfs_init_inode(struct inode *inode, void *data)
+{
+	inode->i_private = data;
+	inode->i_mode |= S_IRUGO;
+	inode->i_fop = &ns_file_operations;
+}
+
+static void nsfs_put_data(void *data)
+{
+	struct ns_common *ns = data;
+	ns->ops->put(ns);
+}
+
+static const struct stashed_operations nsfs_stashed_ops = {
+	.init_inode = nsfs_init_inode,
+	.put_data = nsfs_put_data,
+};
+
 static int nsfs_init_fs_context(struct fs_context *fc)
 {
 	struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
@@ -268,6 +224,7 @@ static int nsfs_init_fs_context(struct fs_context *fc)
 		return -ENOMEM;
 	ctx->ops = &nsfs_ops;
 	ctx->dops = &ns_dentry_operations;
+	fc->s_fs_info = (void *)&nsfs_stashed_ops;
 	return 0;
 }
 
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
deleted file mode 100644
index 7b2509741735..000000000000
--- a/fs/ntfs/Kconfig
+++ /dev/null
@@ -1,81 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config NTFS_FS
-	tristate "NTFS file system support"
-	select BUFFER_HEAD
-	select NLS
-	help
-	  NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
-
-	  Saying Y or M here enables read support.  There is partial, but
-	  safe, write support available.  For write support you must also
-	  say Y to "NTFS write support" below.
-
-	  There are also a number of user-space tools available, called
-	  ntfsprogs.  These include ntfsundelete and ntfsresize, that work
-	  without NTFS support enabled in the kernel.
-
-	  This is a rewrite from scratch of Linux NTFS support and replaced
-	  the old NTFS code starting with Linux 2.5.11.  A backport to
-	  the Linux 2.4 kernel series is separately available as a patch
-	  from the project web site.
-
-	  For more information see <file:Documentation/filesystems/ntfs.rst>
-	  and <http://www.linux-ntfs.org/>.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called ntfs.
-
-	  If you are not using Windows NT, 2000, XP or 2003 in addition to
-	  Linux on your computer it is safe to say N.
-
-config NTFS_DEBUG
-	bool "NTFS debugging support"
-	depends on NTFS_FS
-	help
-	  If you are experiencing any problems with the NTFS file system, say
-	  Y here.  This will result in additional consistency checks to be
-	  performed by the driver as well as additional debugging messages to
-	  be written to the system log.  Note that debugging messages are
-	  disabled by default.  To enable them, supply the option debug_msgs=1
-	  at the kernel command line when booting the kernel or as an option
-	  to insmod when loading the ntfs module.  Once the driver is active,
-	  you can enable debugging messages by doing (as root):
-	  echo 1 > /proc/sys/fs/ntfs-debug
-	  Replacing the "1" with "0" would disable debug messages.
-
-	  If you leave debugging messages disabled, this results in little
-	  overhead, but enabling debug messages results in very significant
-	  slowdown of the system.
-
-	  When reporting bugs, please try to have available a full dump of
-	  debugging messages while the misbehaviour was occurring.
-
-config NTFS_RW
-	bool "NTFS write support"
-	depends on NTFS_FS
-	depends on PAGE_SIZE_LESS_THAN_64KB
-	help
-	  This enables the partial, but safe, write support in the NTFS driver.
-
-	  The only supported operation is overwriting existing files, without
-	  changing the file length.  No file or directory creation, deletion or
-	  renaming is possible.  Note only non-resident files can be written to
-	  so you may find that some very small files (<500 bytes or so) cannot
-	  be written to.
-
-	  While we cannot guarantee that it will not damage any data, we have
-	  so far not received a single report where the driver would have
-	  damaged someones data so we assume it is perfectly safe to use.
-
-	  Note:  While write support is safe in this version (a rewrite from
-	  scratch of the NTFS support), it should be noted that the old NTFS
-	  write support, included in Linux 2.5.10 and before (since 1997),
-	  is not safe.
-
-	  This is currently useful with TopologiLinux.  TopologiLinux is run
-	  on top of any DOS/Microsoft Windows system without partitioning your
-	  hard disk.  Unlike other Linux distributions TopologiLinux does not
-	  need its own partition.  For more information see
-	  <http://topologi-linux.sourceforge.net/>
-
-	  It is perfectly safe to say N here.
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
deleted file mode 100644
index 3e736572ed00..000000000000
--- a/fs/ntfs/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# Rules for making the NTFS driver.
-
-obj-$(CONFIG_NTFS_FS) += ntfs.o
-
-ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
-	  index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
-	  unistr.o upcase.o
-
-ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
-
-ccflags-y := -DNTFS_VERSION=\"2.1.32\"
-ccflags-$(CONFIG_NTFS_DEBUG)	+= -DDEBUG
-ccflags-$(CONFIG_NTFS_RW)	+= -DNTFS_RW
-
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
deleted file mode 100644
index 2d01517a2d59..000000000000
--- a/fs/ntfs/aops.c
+++ /dev/null
@@ -1,1744 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * aops.c - NTFS kernel address space operations and page cache handling.
- *
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-#include <linux/buffer_head.h>
-#include <linux/writeback.h>
-#include <linux/bit_spinlock.h>
-#include <linux/bio.h>
-
-#include "aops.h"
-#include "attrib.h"
-#include "debug.h"
-#include "inode.h"
-#include "mft.h"
-#include "runlist.h"
-#include "types.h"
-#include "ntfs.h"
-
-/**
- * ntfs_end_buffer_async_read - async io completion for reading attributes
- * @bh:		buffer head on which io is completed
- * @uptodate:	whether @bh is now uptodate or not
- *
- * Asynchronous I/O completion handler for reading pages belonging to the
- * attribute address space of an inode.  The inodes can either be files or
- * directories or they can be fake inodes describing some attribute.
- *
- * If NInoMstProtected(), perform the post read mst fixups when all IO on the
- * page has been completed and mark the page uptodate or set the error bit on
- * the page.  To determine the size of the records that need fixing up, we
- * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
- * record size, and index_block_size_bits, to the log(base 2) of the ntfs
- * record size.
- */
-static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
-{
-	unsigned long flags;
-	struct buffer_head *first, *tmp;
-	struct page *page;
-	struct inode *vi;
-	ntfs_inode *ni;
-	int page_uptodate = 1;
-
-	page = bh->b_page;
-	vi = page->mapping->host;
-	ni = NTFS_I(vi);
-
-	if (likely(uptodate)) {
-		loff_t i_size;
-		s64 file_ofs, init_size;
-
-		set_buffer_uptodate(bh);
-
-		file_ofs = ((s64)page->index << PAGE_SHIFT) +
-				bh_offset(bh);
-		read_lock_irqsave(&ni->size_lock, flags);
-		init_size = ni->initialized_size;
-		i_size = i_size_read(vi);
-		read_unlock_irqrestore(&ni->size_lock, flags);
-		if (unlikely(init_size > i_size)) {
-			/* Race with shrinking truncate. */
-			init_size = i_size;
-		}
-		/* Check for the current buffer head overflowing. */
-		if (unlikely(file_ofs + bh->b_size > init_size)) {
-			int ofs;
-			void *kaddr;
-
-			ofs = 0;
-			if (file_ofs < init_size)
-				ofs = init_size - file_ofs;
-			kaddr = kmap_atomic(page);
-			memset(kaddr + bh_offset(bh) + ofs, 0,
-					bh->b_size - ofs);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr);
-		}
-	} else {
-		clear_buffer_uptodate(bh);
-		SetPageError(page);
-		ntfs_error(ni->vol->sb, "Buffer I/O error, logical block "
-				"0x%llx.", (unsigned long long)bh->b_blocknr);
-	}
-	first = page_buffers(page);
-	spin_lock_irqsave(&first->b_uptodate_lock, flags);
-	clear_buffer_async_read(bh);
-	unlock_buffer(bh);
-	tmp = bh;
-	do {
-		if (!buffer_uptodate(tmp))
-			page_uptodate = 0;
-		if (buffer_async_read(tmp)) {
-			if (likely(buffer_locked(tmp)))
-				goto still_busy;
-			/* Async buffers must be locked. */
-			BUG();
-		}
-		tmp = tmp->b_this_page;
-	} while (tmp != bh);
-	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
-	/*
-	 * If none of the buffers had errors then we can set the page uptodate,
-	 * but we first have to perform the post read mst fixups, if the
-	 * attribute is mst protected, i.e. if NInoMstProteced(ni) is true.
-	 * Note we ignore fixup errors as those are detected when
-	 * map_mft_record() is called which gives us per record granularity
-	 * rather than per page granularity.
-	 */
-	if (!NInoMstProtected(ni)) {
-		if (likely(page_uptodate && !PageError(page)))
-			SetPageUptodate(page);
-	} else {
-		u8 *kaddr;
-		unsigned int i, recs;
-		u32 rec_size;
-
-		rec_size = ni->itype.index.block_size;
-		recs = PAGE_SIZE / rec_size;
-		/* Should have been verified before we got here... */
-		BUG_ON(!recs);
-		kaddr = kmap_atomic(page);
-		for (i = 0; i < recs; i++)
-			post_read_mst_fixup((NTFS_RECORD*)(kaddr +
-					i * rec_size), rec_size);
-		kunmap_atomic(kaddr);
-		flush_dcache_page(page);
-		if (likely(page_uptodate && !PageError(page)))
-			SetPageUptodate(page);
-	}
-	unlock_page(page);
-	return;
-still_busy:
-	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
-	return;
-}
-
-/**
- * ntfs_read_block - fill a @folio of an address space with data
- * @folio:	page cache folio to fill with data
- *
- * We read each buffer asynchronously and when all buffers are read in, our io
- * completion handler ntfs_end_buffer_read_async(), if required, automatically
- * applies the mst fixups to the folio before finally marking it uptodate and
- * unlocking it.
- *
- * We only enforce allocated_size limit because i_size is checked for in
- * generic_file_read().
- *
- * Return 0 on success and -errno on error.
- *
- * Contains an adapted version of fs/buffer.c::block_read_full_folio().
- */
-static int ntfs_read_block(struct folio *folio)
-{
-	loff_t i_size;
-	VCN vcn;
-	LCN lcn;
-	s64 init_size;
-	struct inode *vi;
-	ntfs_inode *ni;
-	ntfs_volume *vol;
-	runlist_element *rl;
-	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
-	sector_t iblock, lblock, zblock;
-	unsigned long flags;
-	unsigned int blocksize, vcn_ofs;
-	int i, nr;
-	unsigned char blocksize_bits;
-
-	vi = folio->mapping->host;
-	ni = NTFS_I(vi);
-	vol = ni->vol;
-
-	/* $MFT/$DATA must have its complete runlist in memory at all times. */
-	BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
-
-	blocksize = vol->sb->s_blocksize;
-	blocksize_bits = vol->sb->s_blocksize_bits;
-
-	head = folio_buffers(folio);
-	if (!head)
-		head = create_empty_buffers(folio, blocksize, 0);
-	bh = head;
-
-	/*
-	 * We may be racing with truncate.  To avoid some of the problems we
-	 * now take a snapshot of the various sizes and use those for the whole
-	 * of the function.  In case of an extending truncate it just means we
-	 * may leave some buffers unmapped which are now allocated.  This is
-	 * not a problem since these buffers will just get mapped when a write
-	 * occurs.  In case of a shrinking truncate, we will detect this later
-	 * on due to the runlist being incomplete and if the folio is being
-	 * fully truncated, truncate will throw it away as soon as we unlock
-	 * it so no need to worry what we do with it.
-	 */
-	iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
-	read_lock_irqsave(&ni->size_lock, flags);
-	lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
-	init_size = ni->initialized_size;
-	i_size = i_size_read(vi);
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	if (unlikely(init_size > i_size)) {
-		/* Race with shrinking truncate. */
-		init_size = i_size;
-	}
-	zblock = (init_size + blocksize - 1) >> blocksize_bits;
-
-	/* Loop through all the buffers in the folio. */
-	rl = NULL;
-	nr = i = 0;
-	do {
-		int err = 0;
-
-		if (unlikely(buffer_uptodate(bh)))
-			continue;
-		if (unlikely(buffer_mapped(bh))) {
-			arr[nr++] = bh;
-			continue;
-		}
-		bh->b_bdev = vol->sb->s_bdev;
-		/* Is the block within the allowed limits? */
-		if (iblock < lblock) {
-			bool is_retry = false;
-
-			/* Convert iblock into corresponding vcn and offset. */
-			vcn = (VCN)iblock << blocksize_bits >>
-					vol->cluster_size_bits;
-			vcn_ofs = ((VCN)iblock << blocksize_bits) &
-					vol->cluster_size_mask;
-			if (!rl) {
-lock_retry_remap:
-				down_read(&ni->runlist.lock);
-				rl = ni->runlist.rl;
-			}
-			if (likely(rl != NULL)) {
-				/* Seek to element containing target vcn. */
-				while (rl->length && rl[1].vcn <= vcn)
-					rl++;
-				lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-			} else
-				lcn = LCN_RL_NOT_MAPPED;
-			/* Successful remap. */
-			if (lcn >= 0) {
-				/* Setup buffer head to correct block. */
-				bh->b_blocknr = ((lcn << vol->cluster_size_bits)
-						+ vcn_ofs) >> blocksize_bits;
-				set_buffer_mapped(bh);
-				/* Only read initialized data blocks. */
-				if (iblock < zblock) {
-					arr[nr++] = bh;
-					continue;
-				}
-				/* Fully non-initialized data block, zero it. */
-				goto handle_zblock;
-			}
-			/* It is a hole, need to zero it. */
-			if (lcn == LCN_HOLE)
-				goto handle_hole;
-			/* If first try and runlist unmapped, map and retry. */
-			if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
-				is_retry = true;
-				/*
-				 * Attempt to map runlist, dropping lock for
-				 * the duration.
-				 */
-				up_read(&ni->runlist.lock);
-				err = ntfs_map_runlist(ni, vcn);
-				if (likely(!err))
-					goto lock_retry_remap;
-				rl = NULL;
-			} else if (!rl)
-				up_read(&ni->runlist.lock);
-			/*
-			 * If buffer is outside the runlist, treat it as a
-			 * hole.  This can happen due to concurrent truncate
-			 * for example.
-			 */
-			if (err == -ENOENT || lcn == LCN_ENOENT) {
-				err = 0;
-				goto handle_hole;
-			}
-			/* Hard error, zero out region. */
-			if (!err)
-				err = -EIO;
-			bh->b_blocknr = -1;
-			folio_set_error(folio);
-			ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
-					"attribute type 0x%x, vcn 0x%llx, "
-					"offset 0x%x because its location on "
-					"disk could not be determined%s "
-					"(error code %i).", ni->mft_no,
-					ni->type, (unsigned long long)vcn,
-					vcn_ofs, is_retry ? " even after "
-					"retrying" : "", err);
-		}
-		/*
-		 * Either iblock was outside lblock limits or
-		 * ntfs_rl_vcn_to_lcn() returned error.  Just zero that portion
-		 * of the folio and set the buffer uptodate.
-		 */
-handle_hole:
-		bh->b_blocknr = -1UL;
-		clear_buffer_mapped(bh);
-handle_zblock:
-		folio_zero_range(folio, i * blocksize, blocksize);
-		if (likely(!err))
-			set_buffer_uptodate(bh);
-	} while (i++, iblock++, (bh = bh->b_this_page) != head);
-
-	/* Release the lock if we took it. */
-	if (rl)
-		up_read(&ni->runlist.lock);
-
-	/* Check we have at least one buffer ready for i/o. */
-	if (nr) {
-		struct buffer_head *tbh;
-
-		/* Lock the buffers. */
-		for (i = 0; i < nr; i++) {
-			tbh = arr[i];
-			lock_buffer(tbh);
-			tbh->b_end_io = ntfs_end_buffer_async_read;
-			set_buffer_async_read(tbh);
-		}
-		/* Finally, start i/o on the buffers. */
-		for (i = 0; i < nr; i++) {
-			tbh = arr[i];
-			if (likely(!buffer_uptodate(tbh)))
-				submit_bh(REQ_OP_READ, tbh);
-			else
-				ntfs_end_buffer_async_read(tbh, 1);
-		}
-		return 0;
-	}
-	/* No i/o was scheduled on any of the buffers. */
-	if (likely(!folio_test_error(folio)))
-		folio_mark_uptodate(folio);
-	else /* Signal synchronous i/o error. */
-		nr = -EIO;
-	folio_unlock(folio);
-	return nr;
-}
-
-/**
- * ntfs_read_folio - fill a @folio of a @file with data from the device
- * @file:	open file to which the folio @folio belongs or NULL
- * @folio:	page cache folio to fill with data
- *
- * For non-resident attributes, ntfs_read_folio() fills the @folio of the open
- * file @file by calling the ntfs version of the generic block_read_full_folio()
- * function, ntfs_read_block(), which in turn creates and reads in the buffers
- * associated with the folio asynchronously.
- *
- * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the
- * data from the mft record (which at this stage is most likely in memory) and
- * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
- * even if the mft record is not cached at this point in time, we need to wait
- * for it to be read in before we can do the copy.
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_read_folio(struct file *file, struct folio *folio)
-{
-	struct page *page = &folio->page;
-	loff_t i_size;
-	struct inode *vi;
-	ntfs_inode *ni, *base_ni;
-	u8 *addr;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *mrec;
-	unsigned long flags;
-	u32 attr_len;
-	int err = 0;
-
-retry_readpage:
-	BUG_ON(!PageLocked(page));
-	vi = page->mapping->host;
-	i_size = i_size_read(vi);
-	/* Is the page fully outside i_size? (truncate in progress) */
-	if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
-			PAGE_SHIFT)) {
-		zero_user(page, 0, PAGE_SIZE);
-		ntfs_debug("Read outside i_size - truncated?");
-		goto done;
-	}
-	/*
-	 * This can potentially happen because we clear PageUptodate() during
-	 * ntfs_writepage() of MstProtected() attributes.
-	 */
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		return 0;
-	}
-	ni = NTFS_I(vi);
-	/*
-	 * Only $DATA attributes can be encrypted and only unnamed $DATA
-	 * attributes can be compressed.  Index root can have the flags set but
-	 * this means to create compressed/encrypted files, not that the
-	 * attribute is compressed/encrypted.  Note we need to check for
-	 * AT_INDEX_ALLOCATION since this is the type of both directory and
-	 * index inodes.
-	 */
-	if (ni->type != AT_INDEX_ALLOCATION) {
-		/* If attribute is encrypted, deny access, just like NT4. */
-		if (NInoEncrypted(ni)) {
-			BUG_ON(ni->type != AT_DATA);
-			err = -EACCES;
-			goto err_out;
-		}
-		/* Compressed data streams are handled in compress.c. */
-		if (NInoNonResident(ni) && NInoCompressed(ni)) {
-			BUG_ON(ni->type != AT_DATA);
-			BUG_ON(ni->name_len);
-			return ntfs_read_compressed_block(page);
-		}
-	}
-	/* NInoNonResident() == NInoIndexAllocPresent() */
-	if (NInoNonResident(ni)) {
-		/* Normal, non-resident data stream. */
-		return ntfs_read_block(folio);
-	}
-	/*
-	 * Attribute is resident, implying it is not compressed or encrypted.
-	 * This also means the attribute is smaller than an mft record and
-	 * hence smaller than a page, so can simply zero out any pages with
-	 * index above 0.  Note the attribute can actually be marked compressed
-	 * but if it is resident the actual data is not compressed so we are
-	 * ok to ignore the compressed flag here.
-	 */
-	if (unlikely(page->index > 0)) {
-		zero_user(page, 0, PAGE_SIZE);
-		goto done;
-	}
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	/* Map, pin, and lock the mft record. */
-	mrec = map_mft_record(base_ni);
-	if (IS_ERR(mrec)) {
-		err = PTR_ERR(mrec);
-		goto err_out;
-	}
-	/*
-	 * If a parallel write made the attribute non-resident, drop the mft
-	 * record and retry the read_folio.
-	 */
-	if (unlikely(NInoNonResident(ni))) {
-		unmap_mft_record(base_ni);
-		goto retry_readpage;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto unm_err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err))
-		goto put_unm_err_out;
-	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
-	read_lock_irqsave(&ni->size_lock, flags);
-	if (unlikely(attr_len > ni->initialized_size))
-		attr_len = ni->initialized_size;
-	i_size = i_size_read(vi);
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	if (unlikely(attr_len > i_size)) {
-		/* Race with shrinking truncate. */
-		attr_len = i_size;
-	}
-	addr = kmap_atomic(page);
-	/* Copy the data to the page. */
-	memcpy(addr, (u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset),
-			attr_len);
-	/* Zero the remainder of the page. */
-	memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
-	flush_dcache_page(page);
-	kunmap_atomic(addr);
-put_unm_err_out:
-	ntfs_attr_put_search_ctx(ctx);
-unm_err_out:
-	unmap_mft_record(base_ni);
-done:
-	SetPageUptodate(page);
-err_out:
-	unlock_page(page);
-	return err;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_write_block - write a @folio to the backing store
- * @folio:	page cache folio to write out
- * @wbc:	writeback control structure
- *
- * This function is for writing folios belonging to non-resident, non-mst
- * protected attributes to their backing store.
- *
- * For a folio with buffers, map and write the dirty buffers asynchronously
- * under folio writeback. For a folio without buffers, create buffers for the
- * folio, then proceed as above.
- *
- * If a folio doesn't have buffers the folio dirty state is definitive. If
- * a folio does have buffers, the folio dirty state is just a hint,
- * and the buffer dirty state is definitive. (A hint which has rules:
- * dirty buffers against a clean folio is illegal. Other combinations are
- * legal and need to be handled. In particular a dirty folio containing
- * clean buffers for example.)
- *
- * Return 0 on success and -errno on error.
- *
- * Based on ntfs_read_block() and __block_write_full_folio().
- */
-static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc)
-{
-	VCN vcn;
-	LCN lcn;
-	s64 initialized_size;
-	loff_t i_size;
-	sector_t block, dblock, iblock;
-	struct inode *vi;
-	ntfs_inode *ni;
-	ntfs_volume *vol;
-	runlist_element *rl;
-	struct buffer_head *bh, *head;
-	unsigned long flags;
-	unsigned int blocksize, vcn_ofs;
-	int err;
-	bool need_end_writeback;
-	unsigned char blocksize_bits;
-
-	vi = folio->mapping->host;
-	ni = NTFS_I(vi);
-	vol = ni->vol;
-
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
-			"0x%lx.", ni->mft_no, ni->type, folio->index);
-
-	BUG_ON(!NInoNonResident(ni));
-	BUG_ON(NInoMstProtected(ni));
-	blocksize = vol->sb->s_blocksize;
-	blocksize_bits = vol->sb->s_blocksize_bits;
-	head = folio_buffers(folio);
-	if (!head) {
-		BUG_ON(!folio_test_uptodate(folio));
-		head = create_empty_buffers(folio, blocksize,
-				(1 << BH_Uptodate) | (1 << BH_Dirty));
-	}
-	bh = head;
-
-	/* NOTE: Different naming scheme to ntfs_read_block()! */
-
-	/* The first block in the folio. */
-	block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
-
-	read_lock_irqsave(&ni->size_lock, flags);
-	i_size = i_size_read(vi);
-	initialized_size = ni->initialized_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-
-	/* The first out of bounds block for the data size. */
-	dblock = (i_size + blocksize - 1) >> blocksize_bits;
-
-	/* The last (fully or partially) initialized block. */
-	iblock = initialized_size >> blocksize_bits;
-
-	/*
-	 * Be very careful.  We have no exclusion from block_dirty_folio
-	 * here, and the (potentially unmapped) buffers may become dirty at
-	 * any time.  If a buffer becomes dirty here after we've inspected it
-	 * then we just miss that fact, and the folio stays dirty.
-	 *
-	 * Buffers outside i_size may be dirtied by block_dirty_folio;
-	 * handle that here by just cleaning them.
-	 */
-
-	/*
-	 * Loop through all the buffers in the folio, mapping all the dirty
-	 * buffers to disk addresses and handling any aliases from the
-	 * underlying block device's mapping.
-	 */
-	rl = NULL;
-	err = 0;
-	do {
-		bool is_retry = false;
-
-		if (unlikely(block >= dblock)) {
-			/*
-			 * Mapped buffers outside i_size will occur, because
-			 * this folio can be outside i_size when there is a
-			 * truncate in progress. The contents of such buffers
-			 * were zeroed by ntfs_writepage().
-			 *
-			 * FIXME: What about the small race window where
-			 * ntfs_writepage() has not done any clearing because
-			 * the folio was within i_size but before we get here,
-			 * vmtruncate() modifies i_size?
-			 */
-			clear_buffer_dirty(bh);
-			set_buffer_uptodate(bh);
-			continue;
-		}
-
-		/* Clean buffers are not written out, so no need to map them. */
-		if (!buffer_dirty(bh))
-			continue;
-
-		/* Make sure we have enough initialized size. */
-		if (unlikely((block >= iblock) &&
-				(initialized_size < i_size))) {
-			/*
-			 * If this folio is fully outside initialized
-			 * size, zero out all folios between the current
-			 * initialized size and the current folio. Just
-			 * use ntfs_read_folio() to do the zeroing
-			 * transparently.
-			 */
-			if (block > iblock) {
-				// TODO:
-				// For each folio do:
-				// - read_cache_folio()
-				// Again for each folio do:
-				// - wait_on_folio_locked()
-				// - Check (folio_test_uptodate(folio) &&
-				//		!folio_test_error(folio))
-				// Update initialized size in the attribute and
-				// in the inode.
-				// Again, for each folio do:
-				//	block_dirty_folio();
-				// folio_put()
-				// We don't need to wait on the writes.
-				// Update iblock.
-			}
-			/*
-			 * The current folio straddles initialized size. Zero
-			 * all non-uptodate buffers and set them uptodate (and
-			 * dirty?). Note, there aren't any non-uptodate buffers
-			 * if the folio is uptodate.
-			 * FIXME: For an uptodate folio, the buffers may need to
-			 * be written out because they were not initialized on
-			 * disk before.
-			 */
-			if (!folio_test_uptodate(folio)) {
-				// TODO:
-				// Zero any non-uptodate buffers up to i_size.
-				// Set them uptodate and dirty.
-			}
-			// TODO:
-			// Update initialized size in the attribute and in the
-			// inode (up to i_size).
-			// Update iblock.
-			// FIXME: This is inefficient. Try to batch the two
-			// size changes to happen in one go.
-			ntfs_error(vol->sb, "Writing beyond initialized size "
-					"is not supported yet. Sorry.");
-			err = -EOPNOTSUPP;
-			break;
-			// Do NOT set_buffer_new() BUT DO clear buffer range
-			// outside write request range.
-			// set_buffer_uptodate() on complete buffers as well as
-			// set_buffer_dirty().
-		}
-
-		/* No need to map buffers that are already mapped. */
-		if (buffer_mapped(bh))
-			continue;
-
-		/* Unmapped, dirty buffer. Need to map it. */
-		bh->b_bdev = vol->sb->s_bdev;
-
-		/* Convert block into corresponding vcn and offset. */
-		vcn = (VCN)block << blocksize_bits;
-		vcn_ofs = vcn & vol->cluster_size_mask;
-		vcn >>= vol->cluster_size_bits;
-		if (!rl) {
-lock_retry_remap:
-			down_read(&ni->runlist.lock);
-			rl = ni->runlist.rl;
-		}
-		if (likely(rl != NULL)) {
-			/* Seek to element containing target vcn. */
-			while (rl->length && rl[1].vcn <= vcn)
-				rl++;
-			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-		} else
-			lcn = LCN_RL_NOT_MAPPED;
-		/* Successful remap. */
-		if (lcn >= 0) {
-			/* Setup buffer head to point to correct block. */
-			bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
-					vcn_ofs) >> blocksize_bits;
-			set_buffer_mapped(bh);
-			continue;
-		}
-		/* It is a hole, need to instantiate it. */
-		if (lcn == LCN_HOLE) {
-			u8 *kaddr;
-			unsigned long *bpos, *bend;
-
-			/* Check if the buffer is zero. */
-			kaddr = kmap_local_folio(folio, bh_offset(bh));
-			bpos = (unsigned long *)kaddr;
-			bend = (unsigned long *)(kaddr + blocksize);
-			do {
-				if (unlikely(*bpos))
-					break;
-			} while (likely(++bpos < bend));
-			kunmap_local(kaddr);
-			if (bpos == bend) {
-				/*
-				 * Buffer is zero and sparse, no need to write
-				 * it.
-				 */
-				bh->b_blocknr = -1;
-				clear_buffer_dirty(bh);
-				continue;
-			}
-			// TODO: Instantiate the hole.
-			// clear_buffer_new(bh);
-			// clean_bdev_bh_alias(bh);
-			ntfs_error(vol->sb, "Writing into sparse regions is "
-					"not supported yet. Sorry.");
-			err = -EOPNOTSUPP;
-			break;
-		}
-		/* If first try and runlist unmapped, map and retry. */
-		if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
-			is_retry = true;
-			/*
-			 * Attempt to map runlist, dropping lock for
-			 * the duration.
-			 */
-			up_read(&ni->runlist.lock);
-			err = ntfs_map_runlist(ni, vcn);
-			if (likely(!err))
-				goto lock_retry_remap;
-			rl = NULL;
-		} else if (!rl)
-			up_read(&ni->runlist.lock);
-		/*
-		 * If buffer is outside the runlist, truncate has cut it out
-		 * of the runlist.  Just clean and clear the buffer and set it
-		 * uptodate so it can get discarded by the VM.
-		 */
-		if (err == -ENOENT || lcn == LCN_ENOENT) {
-			bh->b_blocknr = -1;
-			clear_buffer_dirty(bh);
-			folio_zero_range(folio, bh_offset(bh), blocksize);
-			set_buffer_uptodate(bh);
-			err = 0;
-			continue;
-		}
-		/* Failed to map the buffer, even after retrying. */
-		if (!err)
-			err = -EIO;
-		bh->b_blocknr = -1;
-		ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
-				"attribute type 0x%x, vcn 0x%llx, offset 0x%x "
-				"because its location on disk could not be "
-				"determined%s (error code %i).", ni->mft_no,
-				ni->type, (unsigned long long)vcn,
-				vcn_ofs, is_retry ? " even after "
-				"retrying" : "", err);
-		break;
-	} while (block++, (bh = bh->b_this_page) != head);
-
-	/* Release the lock if we took it. */
-	if (rl)
-		up_read(&ni->runlist.lock);
-
-	/* For the error case, need to reset bh to the beginning. */
-	bh = head;
-
-	/* Just an optimization, so ->read_folio() is not called later. */
-	if (unlikely(!folio_test_uptodate(folio))) {
-		int uptodate = 1;
-		do {
-			if (!buffer_uptodate(bh)) {
-				uptodate = 0;
-				bh = head;
-				break;
-			}
-		} while ((bh = bh->b_this_page) != head);
-		if (uptodate)
-			folio_mark_uptodate(folio);
-	}
-
-	/* Setup all mapped, dirty buffers for async write i/o. */
-	do {
-		if (buffer_mapped(bh) && buffer_dirty(bh)) {
-			lock_buffer(bh);
-			if (test_clear_buffer_dirty(bh)) {
-				BUG_ON(!buffer_uptodate(bh));
-				mark_buffer_async_write(bh);
-			} else
-				unlock_buffer(bh);
-		} else if (unlikely(err)) {
-			/*
-			 * For the error case. The buffer may have been set
-			 * dirty during attachment to a dirty folio.
-			 */
-			if (err != -ENOMEM)
-				clear_buffer_dirty(bh);
-		}
-	} while ((bh = bh->b_this_page) != head);
-
-	if (unlikely(err)) {
-		// TODO: Remove the -EOPNOTSUPP check later on...
-		if (unlikely(err == -EOPNOTSUPP))
-			err = 0;
-		else if (err == -ENOMEM) {
-			ntfs_warning(vol->sb, "Error allocating memory. "
-					"Redirtying folio so we try again "
-					"later.");
-			/*
-			 * Put the folio back on mapping->dirty_pages, but
-			 * leave its buffer's dirty state as-is.
-			 */
-			folio_redirty_for_writepage(wbc, folio);
-			err = 0;
-		} else
-			folio_set_error(folio);
-	}
-
-	BUG_ON(folio_test_writeback(folio));
-	folio_start_writeback(folio);	/* Keeps try_to_free_buffers() away. */
-
-	/* Submit the prepared buffers for i/o. */
-	need_end_writeback = true;
-	do {
-		struct buffer_head *next = bh->b_this_page;
-		if (buffer_async_write(bh)) {
-			submit_bh(REQ_OP_WRITE, bh);
-			need_end_writeback = false;
-		}
-		bh = next;
-	} while (bh != head);
-	folio_unlock(folio);
-
-	/* If no i/o was started, need to end writeback here. */
-	if (unlikely(need_end_writeback))
-		folio_end_writeback(folio);
-
-	ntfs_debug("Done.");
-	return err;
-}
-
-/**
- * ntfs_write_mst_block - write a @page to the backing store
- * @page:	page cache page to write out
- * @wbc:	writeback control structure
- *
- * This function is for writing pages belonging to non-resident, mst protected
- * attributes to their backing store.  The only supported attributes are index
- * allocation and $MFT/$DATA.  Both directory inodes and index inodes are
- * supported for the index allocation case.
- *
- * The page must remain locked for the duration of the write because we apply
- * the mst fixups, write, and then undo the fixups, so if we were to unlock the
- * page before undoing the fixups, any other user of the page will see the
- * page contents as corrupt.
- *
- * We clear the page uptodate flag for the duration of the function to ensure
- * exclusion for the $MFT/$DATA case against someone mapping an mft record we
- * are about to apply the mst fixups to.
- *
- * Return 0 on success and -errno on error.
- *
- * Based on ntfs_write_block(), ntfs_mft_writepage(), and
- * write_mft_record_nolock().
- */
-static int ntfs_write_mst_block(struct page *page,
-		struct writeback_control *wbc)
-{
-	sector_t block, dblock, rec_block;
-	struct inode *vi = page->mapping->host;
-	ntfs_inode *ni = NTFS_I(vi);
-	ntfs_volume *vol = ni->vol;
-	u8 *kaddr;
-	unsigned int rec_size = ni->itype.index.block_size;
-	ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE];
-	struct buffer_head *bh, *head, *tbh, *rec_start_bh;
-	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
-	runlist_element *rl;
-	int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
-	unsigned bh_size, rec_size_bits;
-	bool sync, is_mft, page_is_dirty, rec_is_dirty;
-	unsigned char bh_size_bits;
-
-	if (WARN_ON(rec_size < NTFS_BLOCK_SIZE))
-		return -EINVAL;
-
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
-			"0x%lx.", vi->i_ino, ni->type, page->index);
-	BUG_ON(!NInoNonResident(ni));
-	BUG_ON(!NInoMstProtected(ni));
-	is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
-	/*
-	 * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
-	 * in its page cache were to be marked dirty.  However this should
-	 * never happen with the current driver and considering we do not
-	 * handle this case here we do want to BUG(), at least for now.
-	 */
-	BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
-			(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
-	bh_size = vol->sb->s_blocksize;
-	bh_size_bits = vol->sb->s_blocksize_bits;
-	max_bhs = PAGE_SIZE / bh_size;
-	BUG_ON(!max_bhs);
-	BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
-
-	/* Were we called for sync purposes? */
-	sync = (wbc->sync_mode == WB_SYNC_ALL);
-
-	/* Make sure we have mapped buffers. */
-	bh = head = page_buffers(page);
-	BUG_ON(!bh);
-
-	rec_size_bits = ni->itype.index.block_size_bits;
-	BUG_ON(!(PAGE_SIZE >> rec_size_bits));
-	bhs_per_rec = rec_size >> bh_size_bits;
-	BUG_ON(!bhs_per_rec);
-
-	/* The first block in the page. */
-	rec_block = block = (sector_t)page->index <<
-			(PAGE_SHIFT - bh_size_bits);
-
-	/* The first out of bounds block for the data size. */
-	dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
-
-	rl = NULL;
-	err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
-	page_is_dirty = rec_is_dirty = false;
-	rec_start_bh = NULL;
-	do {
-		bool is_retry = false;
-
-		if (likely(block < rec_block)) {
-			if (unlikely(block >= dblock)) {
-				clear_buffer_dirty(bh);
-				set_buffer_uptodate(bh);
-				continue;
-			}
-			/*
-			 * This block is not the first one in the record.  We
-			 * ignore the buffer's dirty state because we could
-			 * have raced with a parallel mark_ntfs_record_dirty().
-			 */
-			if (!rec_is_dirty)
-				continue;
-			if (unlikely(err2)) {
-				if (err2 != -ENOMEM)
-					clear_buffer_dirty(bh);
-				continue;
-			}
-		} else /* if (block == rec_block) */ {
-			BUG_ON(block > rec_block);
-			/* This block is the first one in the record. */
-			rec_block += bhs_per_rec;
-			err2 = 0;
-			if (unlikely(block >= dblock)) {
-				clear_buffer_dirty(bh);
-				continue;
-			}
-			if (!buffer_dirty(bh)) {
-				/* Clean records are not written out. */
-				rec_is_dirty = false;
-				continue;
-			}
-			rec_is_dirty = true;
-			rec_start_bh = bh;
-		}
-		/* Need to map the buffer if it is not mapped already. */
-		if (unlikely(!buffer_mapped(bh))) {
-			VCN vcn;
-			LCN lcn;
-			unsigned int vcn_ofs;
-
-			bh->b_bdev = vol->sb->s_bdev;
-			/* Obtain the vcn and offset of the current block. */
-			vcn = (VCN)block << bh_size_bits;
-			vcn_ofs = vcn & vol->cluster_size_mask;
-			vcn >>= vol->cluster_size_bits;
-			if (!rl) {
-lock_retry_remap:
-				down_read(&ni->runlist.lock);
-				rl = ni->runlist.rl;
-			}
-			if (likely(rl != NULL)) {
-				/* Seek to element containing target vcn. */
-				while (rl->length && rl[1].vcn <= vcn)
-					rl++;
-				lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-			} else
-				lcn = LCN_RL_NOT_MAPPED;
-			/* Successful remap. */
-			if (likely(lcn >= 0)) {
-				/* Setup buffer head to correct block. */
-				bh->b_blocknr = ((lcn <<
-						vol->cluster_size_bits) +
-						vcn_ofs) >> bh_size_bits;
-				set_buffer_mapped(bh);
-			} else {
-				/*
-				 * Remap failed.  Retry to map the runlist once
-				 * unless we are working on $MFT which always
-				 * has the whole of its runlist in memory.
-				 */
-				if (!is_mft && !is_retry &&
-						lcn == LCN_RL_NOT_MAPPED) {
-					is_retry = true;
-					/*
-					 * Attempt to map runlist, dropping
-					 * lock for the duration.
-					 */
-					up_read(&ni->runlist.lock);
-					err2 = ntfs_map_runlist(ni, vcn);
-					if (likely(!err2))
-						goto lock_retry_remap;
-					if (err2 == -ENOMEM)
-						page_is_dirty = true;
-					lcn = err2;
-				} else {
-					err2 = -EIO;
-					if (!rl)
-						up_read(&ni->runlist.lock);
-				}
-				/* Hard error.  Abort writing this record. */
-				if (!err || err == -ENOMEM)
-					err = err2;
-				bh->b_blocknr = -1;
-				ntfs_error(vol->sb, "Cannot write ntfs record "
-						"0x%llx (inode 0x%lx, "
-						"attribute type 0x%x) because "
-						"its location on disk could "
-						"not be determined (error "
-						"code %lli).",
-						(long long)block <<
-						bh_size_bits >>
-						vol->mft_record_size_bits,
-						ni->mft_no, ni->type,
-						(long long)lcn);
-				/*
-				 * If this is not the first buffer, remove the
-				 * buffers in this record from the list of
-				 * buffers to write and clear their dirty bit
-				 * if not error -ENOMEM.
-				 */
-				if (rec_start_bh != bh) {
-					while (bhs[--nr_bhs] != rec_start_bh)
-						;
-					if (err2 != -ENOMEM) {
-						do {
-							clear_buffer_dirty(
-								rec_start_bh);
-						} while ((rec_start_bh =
-								rec_start_bh->
-								b_this_page) !=
-								bh);
-					}
-				}
-				continue;
-			}
-		}
-		BUG_ON(!buffer_uptodate(bh));
-		BUG_ON(nr_bhs >= max_bhs);
-		bhs[nr_bhs++] = bh;
-	} while (block++, (bh = bh->b_this_page) != head);
-	if (unlikely(rl))
-		up_read(&ni->runlist.lock);
-	/* If there were no dirty buffers, we are done. */
-	if (!nr_bhs)
-		goto done;
-	/* Map the page so we can access its contents. */
-	kaddr = kmap(page);
-	/* Clear the page uptodate flag whilst the mst fixups are applied. */
-	BUG_ON(!PageUptodate(page));
-	ClearPageUptodate(page);
-	for (i = 0; i < nr_bhs; i++) {
-		unsigned int ofs;
-
-		/* Skip buffers which are not at the beginning of records. */
-		if (i % bhs_per_rec)
-			continue;
-		tbh = bhs[i];
-		ofs = bh_offset(tbh);
-		if (is_mft) {
-			ntfs_inode *tni;
-			unsigned long mft_no;
-
-			/* Get the mft record number. */
-			mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
-					>> rec_size_bits;
-			/* Check whether to write this mft record. */
-			tni = NULL;
-			if (!ntfs_may_write_mft_record(vol, mft_no,
-					(MFT_RECORD*)(kaddr + ofs), &tni)) {
-				/*
-				 * The record should not be written.  This
-				 * means we need to redirty the page before
-				 * returning.
-				 */
-				page_is_dirty = true;
-				/*
-				 * Remove the buffers in this mft record from
-				 * the list of buffers to write.
-				 */
-				do {
-					bhs[i] = NULL;
-				} while (++i % bhs_per_rec);
-				continue;
-			}
-			/*
-			 * The record should be written.  If a locked ntfs
-			 * inode was returned, add it to the array of locked
-			 * ntfs inodes.
-			 */
-			if (tni)
-				locked_nis[nr_locked_nis++] = tni;
-		}
-		/* Apply the mst protection fixups. */
-		err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
-				rec_size);
-		if (unlikely(err2)) {
-			if (!err || err == -ENOMEM)
-				err = -EIO;
-			ntfs_error(vol->sb, "Failed to apply mst fixups "
-					"(inode 0x%lx, attribute type 0x%x, "
-					"page index 0x%lx, page offset 0x%x)!"
-					"  Unmount and run chkdsk.", vi->i_ino,
-					ni->type, page->index, ofs);
-			/*
-			 * Mark all the buffers in this record clean as we do
-			 * not want to write corrupt data to disk.
-			 */
-			do {
-				clear_buffer_dirty(bhs[i]);
-				bhs[i] = NULL;
-			} while (++i % bhs_per_rec);
-			continue;
-		}
-		nr_recs++;
-	}
-	/* If no records are to be written out, we are done. */
-	if (!nr_recs)
-		goto unm_done;
-	flush_dcache_page(page);
-	/* Lock buffers and start synchronous write i/o on them. */
-	for (i = 0; i < nr_bhs; i++) {
-		tbh = bhs[i];
-		if (!tbh)
-			continue;
-		if (!trylock_buffer(tbh))
-			BUG();
-		/* The buffer dirty state is now irrelevant, just clean it. */
-		clear_buffer_dirty(tbh);
-		BUG_ON(!buffer_uptodate(tbh));
-		BUG_ON(!buffer_mapped(tbh));
-		get_bh(tbh);
-		tbh->b_end_io = end_buffer_write_sync;
-		submit_bh(REQ_OP_WRITE, tbh);
-	}
-	/* Synchronize the mft mirror now if not @sync. */
-	if (is_mft && !sync)
-		goto do_mirror;
-do_wait:
-	/* Wait on i/o completion of buffers. */
-	for (i = 0; i < nr_bhs; i++) {
-		tbh = bhs[i];
-		if (!tbh)
-			continue;
-		wait_on_buffer(tbh);
-		if (unlikely(!buffer_uptodate(tbh))) {
-			ntfs_error(vol->sb, "I/O error while writing ntfs "
-					"record buffer (inode 0x%lx, "
-					"attribute type 0x%x, page index "
-					"0x%lx, page offset 0x%lx)!  Unmount "
-					"and run chkdsk.", vi->i_ino, ni->type,
-					page->index, bh_offset(tbh));
-			if (!err || err == -ENOMEM)
-				err = -EIO;
-			/*
-			 * Set the buffer uptodate so the page and buffer
-			 * states do not become out of sync.
-			 */
-			set_buffer_uptodate(tbh);
-		}
-	}
-	/* If @sync, now synchronize the mft mirror. */
-	if (is_mft && sync) {
-do_mirror:
-		for (i = 0; i < nr_bhs; i++) {
-			unsigned long mft_no;
-			unsigned int ofs;
-
-			/*
-			 * Skip buffers which are not at the beginning of
-			 * records.
-			 */
-			if (i % bhs_per_rec)
-				continue;
-			tbh = bhs[i];
-			/* Skip removed buffers (and hence records). */
-			if (!tbh)
-				continue;
-			ofs = bh_offset(tbh);
-			/* Get the mft record number. */
-			mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
-					>> rec_size_bits;
-			if (mft_no < vol->mftmirr_size)
-				ntfs_sync_mft_mirror(vol, mft_no,
-						(MFT_RECORD*)(kaddr + ofs),
-						sync);
-		}
-		if (!sync)
-			goto do_wait;
-	}
-	/* Remove the mst protection fixups again. */
-	for (i = 0; i < nr_bhs; i++) {
-		if (!(i % bhs_per_rec)) {
-			tbh = bhs[i];
-			if (!tbh)
-				continue;
-			post_write_mst_fixup((NTFS_RECORD*)(kaddr +
-					bh_offset(tbh)));
-		}
-	}
-	flush_dcache_page(page);
-unm_done:
-	/* Unlock any locked inodes. */
-	while (nr_locked_nis-- > 0) {
-		ntfs_inode *tni, *base_tni;
-		
-		tni = locked_nis[nr_locked_nis];
-		/* Get the base inode. */
-		mutex_lock(&tni->extent_lock);
-		if (tni->nr_extents >= 0)
-			base_tni = tni;
-		else {
-			base_tni = tni->ext.base_ntfs_ino;
-			BUG_ON(!base_tni);
-		}
-		mutex_unlock(&tni->extent_lock);
-		ntfs_debug("Unlocking %s inode 0x%lx.",
-				tni == base_tni ? "base" : "extent",
-				tni->mft_no);
-		mutex_unlock(&tni->mrec_lock);
-		atomic_dec(&tni->count);
-		iput(VFS_I(base_tni));
-	}
-	SetPageUptodate(page);
-	kunmap(page);
-done:
-	if (unlikely(err && err != -ENOMEM)) {
-		/*
-		 * Set page error if there is only one ntfs record in the page.
-		 * Otherwise we would loose per-record granularity.
-		 */
-		if (ni->itype.index.block_size == PAGE_SIZE)
-			SetPageError(page);
-		NVolSetErrors(vol);
-	}
-	if (page_is_dirty) {
-		ntfs_debug("Page still contains one or more dirty ntfs "
-				"records.  Redirtying the page starting at "
-				"record 0x%lx.", page->index <<
-				(PAGE_SHIFT - rec_size_bits));
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-	} else {
-		/*
-		 * Keep the VM happy.  This must be done otherwise the
-		 * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
-		 * the page is clean.
-		 */
-		BUG_ON(PageWriteback(page));
-		set_page_writeback(page);
-		unlock_page(page);
-		end_page_writeback(page);
-	}
-	if (likely(!err))
-		ntfs_debug("Done.");
-	return err;
-}
-
-/**
- * ntfs_writepage - write a @page to the backing store
- * @page:	page cache page to write out
- * @wbc:	writeback control structure
- *
- * This is called from the VM when it wants to have a dirty ntfs page cache
- * page cleaned.  The VM has already locked the page and marked it clean.
- *
- * For non-resident attributes, ntfs_writepage() writes the @page by calling
- * the ntfs version of the generic block_write_full_folio() function,
- * ntfs_write_block(), which in turn if necessary creates and writes the
- * buffers associated with the page asynchronously.
- *
- * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
- * the data to the mft record (which at this stage is most likely in memory).
- * The mft record is then marked dirty and written out asynchronously via the
- * vfs inode dirty code path for the inode the mft record belongs to or via the
- * vm page dirty code path for the page the mft record is in.
- *
- * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio().
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct folio *folio = page_folio(page);
-	loff_t i_size;
-	struct inode *vi = folio->mapping->host;
-	ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
-	char *addr;
-	ntfs_attr_search_ctx *ctx = NULL;
-	MFT_RECORD *m = NULL;
-	u32 attr_len;
-	int err;
-
-retry_writepage:
-	BUG_ON(!folio_test_locked(folio));
-	i_size = i_size_read(vi);
-	/* Is the folio fully outside i_size? (truncate in progress) */
-	if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >>
-			PAGE_SHIFT)) {
-		/*
-		 * The folio may have dirty, unmapped buffers.  Make them
-		 * freeable here, so the page does not leak.
-		 */
-		block_invalidate_folio(folio, 0, folio_size(folio));
-		folio_unlock(folio);
-		ntfs_debug("Write outside i_size - truncated?");
-		return 0;
-	}
-	/*
-	 * Only $DATA attributes can be encrypted and only unnamed $DATA
-	 * attributes can be compressed.  Index root can have the flags set but
-	 * this means to create compressed/encrypted files, not that the
-	 * attribute is compressed/encrypted.  Note we need to check for
-	 * AT_INDEX_ALLOCATION since this is the type of both directory and
-	 * index inodes.
-	 */
-	if (ni->type != AT_INDEX_ALLOCATION) {
-		/* If file is encrypted, deny access, just like NT4. */
-		if (NInoEncrypted(ni)) {
-			folio_unlock(folio);
-			BUG_ON(ni->type != AT_DATA);
-			ntfs_debug("Denying write access to encrypted file.");
-			return -EACCES;
-		}
-		/* Compressed data streams are handled in compress.c. */
-		if (NInoNonResident(ni) && NInoCompressed(ni)) {
-			BUG_ON(ni->type != AT_DATA);
-			BUG_ON(ni->name_len);
-			// TODO: Implement and replace this with
-			// return ntfs_write_compressed_block(page);
-			folio_unlock(folio);
-			ntfs_error(vi->i_sb, "Writing to compressed files is "
-					"not supported yet.  Sorry.");
-			return -EOPNOTSUPP;
-		}
-		// TODO: Implement and remove this check.
-		if (NInoNonResident(ni) && NInoSparse(ni)) {
-			folio_unlock(folio);
-			ntfs_error(vi->i_sb, "Writing to sparse files is not "
-					"supported yet.  Sorry.");
-			return -EOPNOTSUPP;
-		}
-	}
-	/* NInoNonResident() == NInoIndexAllocPresent() */
-	if (NInoNonResident(ni)) {
-		/* We have to zero every time due to mmap-at-end-of-file. */
-		if (folio->index >= (i_size >> PAGE_SHIFT)) {
-			/* The folio straddles i_size. */
-			unsigned int ofs = i_size & (folio_size(folio) - 1);
-			folio_zero_segment(folio, ofs, folio_size(folio));
-		}
-		/* Handle mst protected attributes. */
-		if (NInoMstProtected(ni))
-			return ntfs_write_mst_block(page, wbc);
-		/* Normal, non-resident data stream. */
-		return ntfs_write_block(folio, wbc);
-	}
-	/*
-	 * Attribute is resident, implying it is not compressed, encrypted, or
-	 * mst protected.  This also means the attribute is smaller than an mft
-	 * record and hence smaller than a folio, so can simply return error on
-	 * any folios with index above 0.  Note the attribute can actually be
-	 * marked compressed but if it is resident the actual data is not
-	 * compressed so we are ok to ignore the compressed flag here.
-	 */
-	BUG_ON(folio_buffers(folio));
-	BUG_ON(!folio_test_uptodate(folio));
-	if (unlikely(folio->index > 0)) {
-		ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0.  "
-				"Aborting write.", folio->index);
-		BUG_ON(folio_test_writeback(folio));
-		folio_start_writeback(folio);
-		folio_unlock(folio);
-		folio_end_writeback(folio);
-		return -EIO;
-	}
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	/* Map, pin, and lock the mft record. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	/*
-	 * If a parallel write made the attribute non-resident, drop the mft
-	 * record and retry the writepage.
-	 */
-	if (unlikely(NInoNonResident(ni))) {
-		unmap_mft_record(base_ni);
-		goto retry_writepage;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err))
-		goto err_out;
-	/*
-	 * Keep the VM happy.  This must be done otherwise
-	 * PAGECACHE_TAG_DIRTY remains set even though the folio is clean.
-	 */
-	BUG_ON(folio_test_writeback(folio));
-	folio_start_writeback(folio);
-	folio_unlock(folio);
-	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
-	i_size = i_size_read(vi);
-	if (unlikely(attr_len > i_size)) {
-		/* Race with shrinking truncate or a failed truncate. */
-		attr_len = i_size;
-		/*
-		 * If the truncate failed, fix it up now.  If a concurrent
-		 * truncate, we do its job, so it does not have to do anything.
-		 */
-		err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr,
-				attr_len);
-		/* Shrinking cannot fail. */
-		BUG_ON(err);
-	}
-	addr = kmap_local_folio(folio, 0);
-	/* Copy the data from the folio to the mft record. */
-	memcpy((u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset),
-			addr, attr_len);
-	/* Zero out of bounds area in the page cache folio. */
-	memset(addr + attr_len, 0, folio_size(folio) - attr_len);
-	kunmap_local(addr);
-	flush_dcache_folio(folio);
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	/* We are done with the folio. */
-	folio_end_writeback(folio);
-	/* Finally, mark the mft record dirty, so it gets written back. */
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	return 0;
-err_out:
-	if (err == -ENOMEM) {
-		ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
-				"page so we try again later.");
-		/*
-		 * Put the folio back on mapping->dirty_pages, but leave its
-		 * buffers' dirty state as-is.
-		 */
-		folio_redirty_for_writepage(wbc, folio);
-		err = 0;
-	} else {
-		ntfs_error(vi->i_sb, "Resident attribute write failed with "
-				"error %i.", err);
-		folio_set_error(folio);
-		NVolSetErrors(ni->vol);
-	}
-	folio_unlock(folio);
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	return err;
-}
-
-#endif	/* NTFS_RW */
-
-/**
- * ntfs_bmap - map logical file block to physical device block
- * @mapping:	address space mapping to which the block to be mapped belongs
- * @block:	logical block to map to its physical device block
- *
- * For regular, non-resident files (i.e. not compressed and not encrypted), map
- * the logical @block belonging to the file described by the address space
- * mapping @mapping to its physical device block.
- *
- * The size of the block is equal to the @s_blocksize field of the super block
- * of the mounted file system which is guaranteed to be smaller than or equal
- * to the cluster size thus the block is guaranteed to fit entirely inside the
- * cluster which means we do not need to care how many contiguous bytes are
- * available after the beginning of the block.
- *
- * Return the physical device block if the mapping succeeded or 0 if the block
- * is sparse or there was an error.
- *
- * Note: This is a problem if someone tries to run bmap() on $Boot system file
- * as that really is in block zero but there is nothing we can do.  bmap() is
- * just broken in that respect (just like it cannot distinguish sparse from
- * not available or error).
- */
-static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
-{
-	s64 ofs, size;
-	loff_t i_size;
-	LCN lcn;
-	unsigned long blocksize, flags;
-	ntfs_inode *ni = NTFS_I(mapping->host);
-	ntfs_volume *vol = ni->vol;
-	unsigned delta;
-	unsigned char blocksize_bits, cluster_size_shift;
-
-	ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.",
-			ni->mft_no, (unsigned long long)block);
-	if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) {
-		ntfs_error(vol->sb, "BMAP does not make sense for %s "
-				"attributes, returning 0.",
-				(ni->type != AT_DATA) ? "non-data" :
-				(!NInoNonResident(ni) ? "resident" :
-				"encrypted"));
-		return 0;
-	}
-	/* None of these can happen. */
-	BUG_ON(NInoCompressed(ni));
-	BUG_ON(NInoMstProtected(ni));
-	blocksize = vol->sb->s_blocksize;
-	blocksize_bits = vol->sb->s_blocksize_bits;
-	ofs = (s64)block << blocksize_bits;
-	read_lock_irqsave(&ni->size_lock, flags);
-	size = ni->initialized_size;
-	i_size = i_size_read(VFS_I(ni));
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	/*
-	 * If the offset is outside the initialized size or the block straddles
-	 * the initialized size then pretend it is a hole unless the
-	 * initialized size equals the file size.
-	 */
-	if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size)))
-		goto hole;
-	cluster_size_shift = vol->cluster_size_bits;
-	down_read(&ni->runlist.lock);
-	lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false);
-	up_read(&ni->runlist.lock);
-	if (unlikely(lcn < LCN_HOLE)) {
-		/*
-		 * Step down to an integer to avoid gcc doing a long long
-		 * comparision in the switch when we know @lcn is between
-		 * LCN_HOLE and LCN_EIO (i.e. -1 to -5).
-		 *
-		 * Otherwise older gcc (at least on some architectures) will
-		 * try to use __cmpdi2() which is of course not available in
-		 * the kernel.
-		 */
-		switch ((int)lcn) {
-		case LCN_ENOENT:
-			/*
-			 * If the offset is out of bounds then pretend it is a
-			 * hole.
-			 */
-			goto hole;
-		case LCN_ENOMEM:
-			ntfs_error(vol->sb, "Not enough memory to complete "
-					"mapping for inode 0x%lx.  "
-					"Returning 0.", ni->mft_no);
-			break;
-		default:
-			ntfs_error(vol->sb, "Failed to complete mapping for "
-					"inode 0x%lx.  Run chkdsk.  "
-					"Returning 0.", ni->mft_no);
-			break;
-		}
-		return 0;
-	}
-	if (lcn < 0) {
-		/* It is a hole. */
-hole:
-		ntfs_debug("Done (returning hole).");
-		return 0;
-	}
-	/*
-	 * The block is really allocated and fullfils all our criteria.
-	 * Convert the cluster to units of block size and return the result.
-	 */
-	delta = ofs & vol->cluster_size_mask;
-	if (unlikely(sizeof(block) < sizeof(lcn))) {
-		block = lcn = ((lcn << cluster_size_shift) + delta) >>
-				blocksize_bits;
-		/* If the block number was truncated return 0. */
-		if (unlikely(block != lcn)) {
-			ntfs_error(vol->sb, "Physical block 0x%llx is too "
-					"large to be returned, returning 0.",
-					(long long)lcn);
-			return 0;
-		}
-	} else
-		block = ((lcn << cluster_size_shift) + delta) >>
-				blocksize_bits;
-	ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn);
-	return block;
-}
-
-/*
- * ntfs_normal_aops - address space operations for normal inodes and attributes
- *
- * Note these are not used for compressed or mst protected inodes and
- * attributes.
- */
-const struct address_space_operations ntfs_normal_aops = {
-	.read_folio	= ntfs_read_folio,
-#ifdef NTFS_RW
-	.writepage	= ntfs_writepage,
-	.dirty_folio	= block_dirty_folio,
-#endif /* NTFS_RW */
-	.bmap		= ntfs_bmap,
-	.migrate_folio	= buffer_migrate_folio,
-	.is_partially_uptodate = block_is_partially_uptodate,
-	.error_remove_folio = generic_error_remove_folio,
-};
-
-/*
- * ntfs_compressed_aops - address space operations for compressed inodes
- */
-const struct address_space_operations ntfs_compressed_aops = {
-	.read_folio	= ntfs_read_folio,
-#ifdef NTFS_RW
-	.writepage	= ntfs_writepage,
-	.dirty_folio	= block_dirty_folio,
-#endif /* NTFS_RW */
-	.migrate_folio	= buffer_migrate_folio,
-	.is_partially_uptodate = block_is_partially_uptodate,
-	.error_remove_folio = generic_error_remove_folio,
-};
-
-/*
- * ntfs_mst_aops - general address space operations for mst protecteed inodes
- *			  and attributes
- */
-const struct address_space_operations ntfs_mst_aops = {
-	.read_folio	= ntfs_read_folio,	/* Fill page with data. */
-#ifdef NTFS_RW
-	.writepage	= ntfs_writepage,	/* Write dirty page to disk. */
-	.dirty_folio	= filemap_dirty_folio,
-#endif /* NTFS_RW */
-	.migrate_folio	= buffer_migrate_folio,
-	.is_partially_uptodate	= block_is_partially_uptodate,
-	.error_remove_folio = generic_error_remove_folio,
-};
-
-#ifdef NTFS_RW
-
-/**
- * mark_ntfs_record_dirty - mark an ntfs record dirty
- * @page:	page containing the ntfs record to mark dirty
- * @ofs:	byte offset within @page at which the ntfs record begins
- *
- * Set the buffers and the page in which the ntfs record is located dirty.
- *
- * The latter also marks the vfs inode the ntfs record belongs to dirty
- * (I_DIRTY_PAGES only).
- *
- * If the page does not have buffers, we create them and set them uptodate.
- * The page may not be locked which is why we need to handle the buffers under
- * the mapping->i_private_lock.  Once the buffers are marked dirty we no longer
- * need the lock since try_to_free_buffers() does not free dirty buffers.
- */
-void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
-	struct address_space *mapping = page->mapping;
-	ntfs_inode *ni = NTFS_I(mapping->host);
-	struct buffer_head *bh, *head, *buffers_to_free = NULL;
-	unsigned int end, bh_size, bh_ofs;
-
-	BUG_ON(!PageUptodate(page));
-	end = ofs + ni->itype.index.block_size;
-	bh_size = VFS_I(ni)->i_sb->s_blocksize;
-	spin_lock(&mapping->i_private_lock);
-	if (unlikely(!page_has_buffers(page))) {
-		spin_unlock(&mapping->i_private_lock);
-		bh = head = alloc_page_buffers(page, bh_size, true);
-		spin_lock(&mapping->i_private_lock);
-		if (likely(!page_has_buffers(page))) {
-			struct buffer_head *tail;
-
-			do {
-				set_buffer_uptodate(bh);
-				tail = bh;
-				bh = bh->b_this_page;
-			} while (bh);
-			tail->b_this_page = head;
-			attach_page_private(page, head);
-		} else
-			buffers_to_free = bh;
-	}
-	bh = head = page_buffers(page);
-	BUG_ON(!bh);
-	do {
-		bh_ofs = bh_offset(bh);
-		if (bh_ofs + bh_size <= ofs)
-			continue;
-		if (unlikely(bh_ofs >= end))
-			break;
-		set_buffer_dirty(bh);
-	} while ((bh = bh->b_this_page) != head);
-	spin_unlock(&mapping->i_private_lock);
-	filemap_dirty_folio(mapping, page_folio(page));
-	if (unlikely(buffers_to_free)) {
-		do {
-			bh = buffers_to_free->b_this_page;
-			free_buffer_head(buffers_to_free);
-			buffers_to_free = bh;
-		} while (buffers_to_free);
-	}
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
deleted file mode 100644
index 8d0958a149cb..000000000000
--- a/fs/ntfs/aops.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * aops.h - Defines for NTFS kernel address space operations and page cache
- *	    handling.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_AOPS_H
-#define _LINUX_NTFS_AOPS_H
-
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/fs.h>
-
-#include "inode.h"
-
-/**
- * ntfs_unmap_page - release a page that was mapped using ntfs_map_page()
- * @page:	the page to release
- *
- * Unpin, unmap and release a page that was obtained from ntfs_map_page().
- */
-static inline void ntfs_unmap_page(struct page *page)
-{
-	kunmap(page);
-	put_page(page);
-}
-
-/**
- * ntfs_map_page - map a page into accessible memory, reading it if necessary
- * @mapping:	address space for which to obtain the page
- * @index:	index into the page cache for @mapping of the page to map
- *
- * Read a page from the page cache of the address space @mapping at position
- * @index, where @index is in units of PAGE_SIZE, and not in bytes.
- *
- * If the page is not in memory it is loaded from disk first using the
- * read_folio method defined in the address space operations of @mapping
- * and the page is added to the page cache of @mapping in the process.
- *
- * If the page belongs to an mst protected attribute and it is marked as such
- * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no
- * error checking is performed.  This means the caller has to verify whether
- * the ntfs record(s) contained in the page are valid or not using one of the
- * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are
- * expecting to see.  (For details of the macros, see fs/ntfs/layout.h.)
- *
- * If the page is in high memory it is mapped into memory directly addressible
- * by the kernel.
- *
- * Finally the page count is incremented, thus pinning the page into place.
- *
- * The above means that page_address(page) can be used on all pages obtained
- * with ntfs_map_page() to get the kernel virtual address of the page.
- *
- * When finished with the page, the caller has to call ntfs_unmap_page() to
- * unpin, unmap and release the page.
- *
- * Note this does not grant exclusive access. If such is desired, the caller
- * must provide it independently of the ntfs_{un}map_page() calls by using
- * a {rw_}semaphore or other means of serialization. A spin lock cannot be
- * used as ntfs_map_page() can block.
- *
- * The unlocked and uptodate page is returned on success or an encoded error
- * on failure. Caller has to test for error using the IS_ERR() macro on the
- * return value. If that evaluates to 'true', the negative error code can be
- * obtained using PTR_ERR() on the return value of ntfs_map_page().
- */
-static inline struct page *ntfs_map_page(struct address_space *mapping,
-		unsigned long index)
-{
-	struct page *page = read_mapping_page(mapping, index, NULL);
-
-	if (!IS_ERR(page))
-		kmap(page);
-	return page;
-}
-
-#ifdef NTFS_RW
-
-extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_AOPS_H */
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
deleted file mode 100644
index f79408f9127a..000000000000
--- a/fs/ntfs/attrib.c
+++ /dev/null
@@ -1,2624 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * attrib.c - NTFS attribute operations.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/buffer_head.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/writeback.h>
-
-#include "attrib.h"
-#include "debug.h"
-#include "layout.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "ntfs.h"
-#include "types.h"
-
-/**
- * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode
- * @ni:		ntfs inode for which to map (part of) a runlist
- * @vcn:	map runlist part containing this vcn
- * @ctx:	active attribute search context if present or NULL if not
- *
- * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record.  This is needed when ntfs_map_runlist_nolock() encounters unmapped
- * runlist fragments and allows their mapping.  If you do not have the mft
- * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock()
- * will perform the necessary mapping and unmapping.
- *
- * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and
- * restores it before returning.  Thus, @ctx will be left pointing to the same
- * attribute on return as on entry.  However, the actual pointers in @ctx may
- * point to different memory locations on return, so you must remember to reset
- * any cached pointers from the @ctx, i.e. after the call to
- * ntfs_map_runlist_nolock(), you will probably want to do:
- *	m = ctx->mrec;
- *	a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- *
- * Return 0 on success and -errno on error.  There is one special error code
- * which is not an error as such.  This is -ENOENT.  It means that @vcn is out
- * of bounds of the runlist.
- *
- * Note the runlist can be NULL after this function returns if @vcn is zero and
- * the attribute has zero allocated size, i.e. there simply is no runlist.
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- *	    is no longer valid, i.e. you need to either call
- *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
- *	    why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- *	      and is locked on return.  Note the runlist will be modified.
- *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
- *	      entry and it will be left unmapped on return.
- *	    - If @ctx is not NULL, the base mft record must be mapped on entry
- *	      and it will be left mapped on return.
- */
-int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
-{
-	VCN end_vcn;
-	unsigned long flags;
-	ntfs_inode *base_ni;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	runlist_element *rl;
-	struct page *put_this_page = NULL;
-	int err = 0;
-	bool ctx_is_temporary, ctx_needs_reset;
-	ntfs_attr_search_ctx old_ctx = { NULL, };
-
-	ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
-			(unsigned long long)vcn);
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	if (!ctx) {
-		ctx_is_temporary = ctx_needs_reset = true;
-		m = map_mft_record(base_ni);
-		if (IS_ERR(m))
-			return PTR_ERR(m);
-		ctx = ntfs_attr_get_search_ctx(base_ni, m);
-		if (unlikely(!ctx)) {
-			err = -ENOMEM;
-			goto err_out;
-		}
-	} else {
-		VCN allocated_size_vcn;
-
-		BUG_ON(IS_ERR(ctx->mrec));
-		a = ctx->attr;
-		BUG_ON(!a->non_resident);
-		ctx_is_temporary = false;
-		end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
-		read_lock_irqsave(&ni->size_lock, flags);
-		allocated_size_vcn = ni->allocated_size >>
-				ni->vol->cluster_size_bits;
-		read_unlock_irqrestore(&ni->size_lock, flags);
-		if (!a->data.non_resident.lowest_vcn && end_vcn <= 0)
-			end_vcn = allocated_size_vcn - 1;
-		/*
-		 * If we already have the attribute extent containing @vcn in
-		 * @ctx, no need to look it up again.  We slightly cheat in
-		 * that if vcn exceeds the allocated size, we will refuse to
-		 * map the runlist below, so there is definitely no need to get
-		 * the right attribute extent.
-		 */
-		if (vcn >= allocated_size_vcn || (a->type == ni->type &&
-				a->name_length == ni->name_len &&
-				!memcmp((u8*)a + le16_to_cpu(a->name_offset),
-				ni->name, ni->name_len) &&
-				sle64_to_cpu(a->data.non_resident.lowest_vcn)
-				<= vcn && end_vcn >= vcn))
-			ctx_needs_reset = false;
-		else {
-			/* Save the old search context. */
-			old_ctx = *ctx;
-			/*
-			 * If the currently mapped (extent) inode is not the
-			 * base inode we will unmap it when we reinitialize the
-			 * search context which means we need to get a
-			 * reference to the page containing the mapped mft
-			 * record so we do not accidentally drop changes to the
-			 * mft record when it has not been marked dirty yet.
-			 */
-			if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino !=
-					old_ctx.base_ntfs_ino) {
-				put_this_page = old_ctx.ntfs_ino->page;
-				get_page(put_this_page);
-			}
-			/*
-			 * Reinitialize the search context so we can lookup the
-			 * needed attribute extent.
-			 */
-			ntfs_attr_reinit_search_ctx(ctx);
-			ctx_needs_reset = true;
-		}
-	}
-	if (ctx_needs_reset) {
-		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-				CASE_SENSITIVE, vcn, NULL, 0, ctx);
-		if (unlikely(err)) {
-			if (err == -ENOENT)
-				err = -EIO;
-			goto err_out;
-		}
-		BUG_ON(!ctx->attr->non_resident);
-	}
-	a = ctx->attr;
-	/*
-	 * Only decompress the mapping pairs if @vcn is inside it.  Otherwise
-	 * we get into problems when we try to map an out of bounds vcn because
-	 * we then try to map the already mapped runlist fragment and
-	 * ntfs_mapping_pairs_decompress() fails.
-	 */
-	end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
-	if (unlikely(vcn && vcn >= end_vcn)) {
-		err = -ENOENT;
-		goto err_out;
-	}
-	rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl);
-	if (IS_ERR(rl))
-		err = PTR_ERR(rl);
-	else
-		ni->runlist.rl = rl;
-err_out:
-	if (ctx_is_temporary) {
-		if (likely(ctx))
-			ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(base_ni);
-	} else if (ctx_needs_reset) {
-		/*
-		 * If there is no attribute list, restoring the search context
-		 * is accomplished simply by copying the saved context back over
-		 * the caller supplied context.  If there is an attribute list,
-		 * things are more complicated as we need to deal with mapping
-		 * of mft records and resulting potential changes in pointers.
-		 */
-		if (NInoAttrList(base_ni)) {
-			/*
-			 * If the currently mapped (extent) inode is not the
-			 * one we had before, we need to unmap it and map the
-			 * old one.
-			 */
-			if (ctx->ntfs_ino != old_ctx.ntfs_ino) {
-				/*
-				 * If the currently mapped inode is not the
-				 * base inode, unmap it.
-				 */
-				if (ctx->base_ntfs_ino && ctx->ntfs_ino !=
-						ctx->base_ntfs_ino) {
-					unmap_extent_mft_record(ctx->ntfs_ino);
-					ctx->mrec = ctx->base_mrec;
-					BUG_ON(!ctx->mrec);
-				}
-				/*
-				 * If the old mapped inode is not the base
-				 * inode, map it.
-				 */
-				if (old_ctx.base_ntfs_ino &&
-						old_ctx.ntfs_ino !=
-						old_ctx.base_ntfs_ino) {
-retry_map:
-					ctx->mrec = map_mft_record(
-							old_ctx.ntfs_ino);
-					/*
-					 * Something bad has happened.  If out
-					 * of memory retry till it succeeds.
-					 * Any other errors are fatal and we
-					 * return the error code in ctx->mrec.
-					 * Let the caller deal with it...  We
-					 * just need to fudge things so the
-					 * caller can reinit and/or put the
-					 * search context safely.
-					 */
-					if (IS_ERR(ctx->mrec)) {
-						if (PTR_ERR(ctx->mrec) ==
-								-ENOMEM) {
-							schedule();
-							goto retry_map;
-						} else
-							old_ctx.ntfs_ino =
-								old_ctx.
-								base_ntfs_ino;
-					}
-				}
-			}
-			/* Update the changed pointers in the saved context. */
-			if (ctx->mrec != old_ctx.mrec) {
-				if (!IS_ERR(ctx->mrec))
-					old_ctx.attr = (ATTR_RECORD*)(
-							(u8*)ctx->mrec +
-							((u8*)old_ctx.attr -
-							(u8*)old_ctx.mrec));
-				old_ctx.mrec = ctx->mrec;
-			}
-		}
-		/* Restore the search context to the saved one. */
-		*ctx = old_ctx;
-		/*
-		 * We drop the reference on the page we took earlier.  In the
-		 * case that IS_ERR(ctx->mrec) is true this means we might lose
-		 * some changes to the mft record that had been made between
-		 * the last time it was marked dirty/written out and now.  This
-		 * at this stage is not a problem as the mapping error is fatal
-		 * enough that the mft record cannot be written out anyway and
-		 * the caller is very likely to shutdown the whole inode
-		 * immediately and mark the volume dirty for chkdsk to pick up
-		 * the pieces anyway.
-		 */
-		if (put_this_page)
-			put_page(put_this_page);
-	}
-	return err;
-}
-
-/**
- * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode
- * @ni:		ntfs inode for which to map (part of) a runlist
- * @vcn:	map runlist part containing this vcn
- *
- * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
- *
- * Return 0 on success and -errno on error.  There is one special error code
- * which is not an error as such.  This is -ENOENT.  It means that @vcn is out
- * of bounds of the runlist.
- *
- * Locking: - The runlist must be unlocked on entry and is unlocked on return.
- *	    - This function takes the runlist lock for writing and may modify
- *	      the runlist.
- */
-int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
-{
-	int err = 0;
-
-	down_write(&ni->runlist.lock);
-	/* Make sure someone else didn't do the work while we were sleeping. */
-	if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
-			LCN_RL_NOT_MAPPED))
-		err = ntfs_map_runlist_nolock(ni, vcn, NULL);
-	up_write(&ni->runlist.lock);
-	return err;
-}
-
-/**
- * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode
- * @ni:			ntfs inode of the attribute whose runlist to search
- * @vcn:		vcn to convert
- * @write_locked:	true if the runlist is locked for writing
- *
- * Find the virtual cluster number @vcn in the runlist of the ntfs attribute
- * described by the ntfs inode @ni and return the corresponding logical cluster
- * number (lcn).
- *
- * If the @vcn is not mapped yet, the attempt is made to map the attribute
- * extent containing the @vcn and the vcn to lcn conversion is retried.
- *
- * If @write_locked is true the caller has locked the runlist for writing and
- * if false for reading.
- *
- * Since lcns must be >= 0, we use negative return codes with special meaning:
- *
- * Return code	Meaning / Description
- * ==========================================
- *  LCN_HOLE	Hole / not allocated on disk.
- *  LCN_ENOENT	There is no such vcn in the runlist, i.e. @vcn is out of bounds.
- *  LCN_ENOMEM	Not enough memory to map runlist.
- *  LCN_EIO	Critical error (runlist/file is corrupt, i/o error, etc).
- *
- * Locking: - The runlist must be locked on entry and is left locked on return.
- *	    - If @write_locked is 'false', i.e. the runlist is locked for reading,
- *	      the lock may be dropped inside the function so you cannot rely on
- *	      the runlist still being the same when this function returns.
- */
-LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
-		const bool write_locked)
-{
-	LCN lcn;
-	unsigned long flags;
-	bool is_retry = false;
-
-	BUG_ON(!ni);
-	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
-			ni->mft_no, (unsigned long long)vcn,
-			write_locked ? "write" : "read");
-	BUG_ON(!NInoNonResident(ni));
-	BUG_ON(vcn < 0);
-	if (!ni->runlist.rl) {
-		read_lock_irqsave(&ni->size_lock, flags);
-		if (!ni->allocated_size) {
-			read_unlock_irqrestore(&ni->size_lock, flags);
-			return LCN_ENOENT;
-		}
-		read_unlock_irqrestore(&ni->size_lock, flags);
-	}
-retry_remap:
-	/* Convert vcn to lcn.  If that fails map the runlist and retry once. */
-	lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn);
-	if (likely(lcn >= LCN_HOLE)) {
-		ntfs_debug("Done, lcn 0x%llx.", (long long)lcn);
-		return lcn;
-	}
-	if (lcn != LCN_RL_NOT_MAPPED) {
-		if (lcn != LCN_ENOENT)
-			lcn = LCN_EIO;
-	} else if (!is_retry) {
-		int err;
-
-		if (!write_locked) {
-			up_read(&ni->runlist.lock);
-			down_write(&ni->runlist.lock);
-			if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) !=
-					LCN_RL_NOT_MAPPED)) {
-				up_write(&ni->runlist.lock);
-				down_read(&ni->runlist.lock);
-				goto retry_remap;
-			}
-		}
-		err = ntfs_map_runlist_nolock(ni, vcn, NULL);
-		if (!write_locked) {
-			up_write(&ni->runlist.lock);
-			down_read(&ni->runlist.lock);
-		}
-		if (likely(!err)) {
-			is_retry = true;
-			goto retry_remap;
-		}
-		if (err == -ENOENT)
-			lcn = LCN_ENOENT;
-		else if (err == -ENOMEM)
-			lcn = LCN_ENOMEM;
-		else
-			lcn = LCN_EIO;
-	}
-	if (lcn != LCN_ENOENT)
-		ntfs_error(ni->vol->sb, "Failed with error code %lli.",
-				(long long)lcn);
-	return lcn;
-}
-
-/**
- * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode
- * @ni:		ntfs inode describing the runlist to search
- * @vcn:	vcn to find
- * @ctx:	active attribute search context if present or NULL if not
- *
- * Find the virtual cluster number @vcn in the runlist described by the ntfs
- * inode @ni and return the address of the runlist element containing the @vcn.
- *
- * If the @vcn is not mapped yet, the attempt is made to map the attribute
- * extent containing the @vcn and the vcn to lcn conversion is retried.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record.  This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped
- * runlist fragments and allows their mapping.  If you do not have the mft
- * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock()
- * will perform the necessary mapping and unmapping.
- *
- * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and
- * restores it before returning.  Thus, @ctx will be left pointing to the same
- * attribute on return as on entry.  However, the actual pointers in @ctx may
- * point to different memory locations on return, so you must remember to reset
- * any cached pointers from the @ctx, i.e. after the call to
- * ntfs_attr_find_vcn_nolock(), you will probably want to do:
- *	m = ctx->mrec;
- *	a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- * Note you need to distinguish between the lcn of the returned runlist element
- * being >= 0 and LCN_HOLE.  In the later case you have to return zeroes on
- * read and allocate clusters on write.
- *
- * Return the runlist element containing the @vcn on success and
- * ERR_PTR(-errno) on error.  You need to test the return value with IS_ERR()
- * to decide if the return is success or failure and PTR_ERR() to get to the
- * error code if IS_ERR() is true.
- *
- * The possible error return codes are:
- *	-ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds.
- *	-ENOMEM - Not enough memory to map runlist.
- *	-EIO	- Critical error (runlist/file is corrupt, i/o error, etc).
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- *	    is no longer valid, i.e. you need to either call
- *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
- *	    why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- *	      and is locked on return.  Note the runlist may be modified when
- *	      needed runlist fragments need to be mapped.
- *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
- *	      entry and it will be left unmapped on return.
- *	    - If @ctx is not NULL, the base mft record must be mapped on entry
- *	      and it will be left mapped on return.
- */
-runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
-		ntfs_attr_search_ctx *ctx)
-{
-	unsigned long flags;
-	runlist_element *rl;
-	int err = 0;
-	bool is_retry = false;
-
-	BUG_ON(!ni);
-	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",
-			ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out");
-	BUG_ON(!NInoNonResident(ni));
-	BUG_ON(vcn < 0);
-	if (!ni->runlist.rl) {
-		read_lock_irqsave(&ni->size_lock, flags);
-		if (!ni->allocated_size) {
-			read_unlock_irqrestore(&ni->size_lock, flags);
-			return ERR_PTR(-ENOENT);
-		}
-		read_unlock_irqrestore(&ni->size_lock, flags);
-	}
-retry_remap:
-	rl = ni->runlist.rl;
-	if (likely(rl && vcn >= rl[0].vcn)) {
-		while (likely(rl->length)) {
-			if (unlikely(vcn < rl[1].vcn)) {
-				if (likely(rl->lcn >= LCN_HOLE)) {
-					ntfs_debug("Done.");
-					return rl;
-				}
-				break;
-			}
-			rl++;
-		}
-		if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) {
-			if (likely(rl->lcn == LCN_ENOENT))
-				err = -ENOENT;
-			else
-				err = -EIO;
-		}
-	}
-	if (!err && !is_retry) {
-		/*
-		 * If the search context is invalid we cannot map the unmapped
-		 * region.
-		 */
-		if (IS_ERR(ctx->mrec))
-			err = PTR_ERR(ctx->mrec);
-		else {
-			/*
-			 * The @vcn is in an unmapped region, map the runlist
-			 * and retry.
-			 */
-			err = ntfs_map_runlist_nolock(ni, vcn, ctx);
-			if (likely(!err)) {
-				is_retry = true;
-				goto retry_remap;
-			}
-		}
-		if (err == -EINVAL)
-			err = -EIO;
-	} else if (!err)
-		err = -EIO;
-	if (err != -ENOENT)
-		ntfs_error(ni->vol->sb, "Failed with error code %i.", err);
-	return ERR_PTR(err);
-}
-
-/**
- * ntfs_attr_find - find (next) attribute in mft record
- * @type:	attribute type to find
- * @name:	attribute name to find (optional, i.e. NULL means don't care)
- * @name_len:	attribute name length (only needed if @name present)
- * @ic:		IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
- * @val:	attribute value to find (optional, resident attributes only)
- * @val_len:	attribute value length
- * @ctx:	search context with mft record and attribute to search from
- *
- * You should not need to call this function directly.  Use ntfs_attr_lookup()
- * instead.
- *
- * ntfs_attr_find() takes a search context @ctx as parameter and searches the
- * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an
- * attribute of @type, optionally @name and @val.
- *
- * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will
- * point to the found attribute.
- *
- * If the attribute is not found, ntfs_attr_find() returns -ENOENT and
- * @ctx->attr will point to the attribute before which the attribute being
- * searched for would need to be inserted if such an action were to be desired.
- *
- * On actual error, ntfs_attr_find() returns -EIO.  In this case @ctx->attr is
- * undefined and in particular do not rely on it not changing.
- *
- * If @ctx->is_first is 'true', the search begins with @ctx->attr itself.  If it
- * is 'false', the search begins after @ctx->attr.
- *
- * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and
- * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record
- * @ctx->mrec belongs.  This is so we can get at the ntfs volume and hence at
- * the upcase table.  If @ic is CASE_SENSITIVE, the comparison is case
- * sensitive.  When @name is present, @name_len is the @name length in Unicode
- * characters.
- *
- * If @name is not present (NULL), we assume that the unnamed attribute is
- * being searched for.
- *
- * Finally, the resident attribute value @val is looked for, if present.  If
- * @val is not present (NULL), @val_len is ignored.
- *
- * ntfs_attr_find() only searches the specified mft record and it ignores the
- * presence of an attribute list attribute (unless it is the one being searched
- * for, obviously).  If you need to take attribute lists into consideration,
- * use ntfs_attr_lookup() instead (see below).  This also means that you cannot
- * use ntfs_attr_find() to search for extent records of non-resident
- * attributes, as extents with lowest_vcn != 0 are usually described by the
- * attribute list attribute only. - Note that it is possible that the first
- * extent is only in the attribute list while the last extent is in the base
- * mft record, so do not rely on being able to find the first extent in the
- * base mft record.
- *
- * Warning: Never use @val when looking for attribute types which can be
- *	    non-resident as this most likely will result in a crash!
- */
-static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
-		const u32 name_len, const IGNORE_CASE_BOOL ic,
-		const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
-{
-	ATTR_RECORD *a;
-	ntfs_volume *vol = ctx->ntfs_ino->vol;
-	ntfschar *upcase = vol->upcase;
-	u32 upcase_len = vol->upcase_len;
-
-	/*
-	 * Iterate over attributes in mft record starting at @ctx->attr, or the
-	 * attribute following that, if @ctx->is_first is 'true'.
-	 */
-	if (ctx->is_first) {
-		a = ctx->attr;
-		ctx->is_first = false;
-	} else
-		a = (ATTR_RECORD*)((u8*)ctx->attr +
-				le32_to_cpu(ctx->attr->length));
-	for (;;	a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
-		u8 *mrec_end = (u8 *)ctx->mrec +
-		               le32_to_cpu(ctx->mrec->bytes_allocated);
-		u8 *name_end;
-
-		/* check whether ATTR_RECORD wrap */
-		if ((u8 *)a < (u8 *)ctx->mrec)
-			break;
-
-		/* check whether Attribute Record Header is within bounds */
-		if ((u8 *)a > mrec_end ||
-		    (u8 *)a + sizeof(ATTR_RECORD) > mrec_end)
-			break;
-
-		/* check whether ATTR_RECORD's name is within bounds */
-		name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
-			   a->name_length * sizeof(ntfschar);
-		if (name_end > mrec_end)
-			break;
-
-		ctx->attr = a;
-		if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
-				a->type == AT_END))
-			return -ENOENT;
-		if (unlikely(!a->length))
-			break;
-
-		/* check whether ATTR_RECORD's length wrap */
-		if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a)
-			break;
-		/* check whether ATTR_RECORD's length is within bounds */
-		if ((u8 *)a + le32_to_cpu(a->length) > mrec_end)
-			break;
-
-		if (a->type != type)
-			continue;
-		/*
-		 * If @name is present, compare the two names.  If @name is
-		 * missing, assume we want an unnamed attribute.
-		 */
-		if (!name) {
-			/* The search failed if the found attribute is named. */
-			if (a->name_length)
-				return -ENOENT;
-		} else if (!ntfs_are_names_equal(name, name_len,
-			    (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)),
-			    a->name_length, ic, upcase, upcase_len)) {
-			register int rc;
-
-			rc = ntfs_collate_names(name, name_len,
-					(ntfschar*)((u8*)a +
-					le16_to_cpu(a->name_offset)),
-					a->name_length, 1, IGNORE_CASE,
-					upcase, upcase_len);
-			/*
-			 * If @name collates before a->name, there is no
-			 * matching attribute.
-			 */
-			if (rc == -1)
-				return -ENOENT;
-			/* If the strings are not equal, continue search. */
-			if (rc)
-				continue;
-			rc = ntfs_collate_names(name, name_len,
-					(ntfschar*)((u8*)a +
-					le16_to_cpu(a->name_offset)),
-					a->name_length, 1, CASE_SENSITIVE,
-					upcase, upcase_len);
-			if (rc == -1)
-				return -ENOENT;
-			if (rc)
-				continue;
-		}
-		/*
-		 * The names match or @name not present and attribute is
-		 * unnamed.  If no @val specified, we have found the attribute
-		 * and are done.
-		 */
-		if (!val)
-			return 0;
-		/* @val is present; compare values. */
-		else {
-			register int rc;
-
-			rc = memcmp(val, (u8*)a + le16_to_cpu(
-					a->data.resident.value_offset),
-					min_t(u32, val_len, le32_to_cpu(
-					a->data.resident.value_length)));
-			/*
-			 * If @val collates before the current attribute's
-			 * value, there is no matching attribute.
-			 */
-			if (!rc) {
-				register u32 avl;
-
-				avl = le32_to_cpu(
-						a->data.resident.value_length);
-				if (val_len == avl)
-					return 0;
-				if (val_len < avl)
-					return -ENOENT;
-			} else if (rc < 0)
-				return -ENOENT;
-		}
-	}
-	ntfs_error(vol->sb, "Inode is corrupt.  Run chkdsk.");
-	NVolSetErrors(vol);
-	return -EIO;
-}
-
-/**
- * load_attribute_list - load an attribute list into memory
- * @vol:		ntfs volume from which to read
- * @runlist:		runlist of the attribute list
- * @al_start:		destination buffer
- * @size:		size of the destination buffer in bytes
- * @initialized_size:	initialized size of the attribute list
- *
- * Walk the runlist @runlist and load all clusters from it copying them into
- * the linear buffer @al. The maximum number of bytes copied to @al is @size
- * bytes. Note, @size does not need to be a multiple of the cluster size. If
- * @initialized_size is less than @size, the region in @al between
- * @initialized_size and @size will be zeroed and not read from disk.
- *
- * Return 0 on success or -errno on error.
- */
-int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
-		const s64 size, const s64 initialized_size)
-{
-	LCN lcn;
-	u8 *al = al_start;
-	u8 *al_end = al + initialized_size;
-	runlist_element *rl;
-	struct buffer_head *bh;
-	struct super_block *sb;
-	unsigned long block_size;
-	unsigned long block, max_block;
-	int err = 0;
-	unsigned char block_size_bits;
-
-	ntfs_debug("Entering.");
-	if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 ||
-			initialized_size > size)
-		return -EINVAL;
-	if (!initialized_size) {
-		memset(al, 0, size);
-		return 0;
-	}
-	sb = vol->sb;
-	block_size = sb->s_blocksize;
-	block_size_bits = sb->s_blocksize_bits;
-	down_read(&runlist->lock);
-	rl = runlist->rl;
-	if (!rl) {
-		ntfs_error(sb, "Cannot read attribute list since runlist is "
-				"missing.");
-		goto err_out;	
-	}
-	/* Read all clusters specified by the runlist one run at a time. */
-	while (rl->length) {
-		lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn);
-		ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
-				(unsigned long long)rl->vcn,
-				(unsigned long long)lcn);
-		/* The attribute list cannot be sparse. */
-		if (lcn < 0) {
-			ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed.  Cannot "
-					"read attribute list.");
-			goto err_out;
-		}
-		block = lcn << vol->cluster_size_bits >> block_size_bits;
-		/* Read the run from device in chunks of block_size bytes. */
-		max_block = block + (rl->length << vol->cluster_size_bits >>
-				block_size_bits);
-		ntfs_debug("max_block = 0x%lx.", max_block);
-		do {
-			ntfs_debug("Reading block = 0x%lx.", block);
-			bh = sb_bread(sb, block);
-			if (!bh) {
-				ntfs_error(sb, "sb_bread() failed. Cannot "
-						"read attribute list.");
-				goto err_out;
-			}
-			if (al + block_size >= al_end)
-				goto do_final;
-			memcpy(al, bh->b_data, block_size);
-			brelse(bh);
-			al += block_size;
-		} while (++block < max_block);
-		rl++;
-	}
-	if (initialized_size < size) {
-initialize:
-		memset(al_start + initialized_size, 0, size - initialized_size);
-	}
-done:
-	up_read(&runlist->lock);
-	return err;
-do_final:
-	if (al < al_end) {
-		/*
-		 * Partial block.
-		 *
-		 * Note: The attribute list can be smaller than its allocation
-		 * by multiple clusters.  This has been encountered by at least
-		 * two people running Windows XP, thus we cannot do any
-		 * truncation sanity checking here. (AIA)
-		 */
-		memcpy(al, bh->b_data, al_end - al);
-		brelse(bh);
-		if (initialized_size < size)
-			goto initialize;
-		goto done;
-	}
-	brelse(bh);
-	/* Real overflow! */
-	ntfs_error(sb, "Attribute list buffer overflow. Read attribute list "
-			"is truncated.");
-err_out:
-	err = -EIO;
-	goto done;
-}
-
-/**
- * ntfs_external_attr_find - find an attribute in the attribute list of an inode
- * @type:	attribute type to find
- * @name:	attribute name to find (optional, i.e. NULL means don't care)
- * @name_len:	attribute name length (only needed if @name present)
- * @ic:		IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
- * @lowest_vcn:	lowest vcn to find (optional, non-resident attributes only)
- * @val:	attribute value to find (optional, resident attributes only)
- * @val_len:	attribute value length
- * @ctx:	search context with mft record and attribute to search from
- *
- * You should not need to call this function directly.  Use ntfs_attr_lookup()
- * instead.
- *
- * Find an attribute by searching the attribute list for the corresponding
- * attribute list entry.  Having found the entry, map the mft record if the
- * attribute is in a different mft record/inode, ntfs_attr_find() the attribute
- * in there and return it.
- *
- * On first search @ctx->ntfs_ino must be the base mft record and @ctx must
- * have been obtained from a call to ntfs_attr_get_search_ctx().  On subsequent
- * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is
- * then the base inode).
- *
- * After finishing with the attribute/mft record you need to call
- * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
- * mapped inodes, etc).
- *
- * If the attribute is found, ntfs_external_attr_find() returns 0 and
- * @ctx->attr will point to the found attribute.  @ctx->mrec will point to the
- * mft record in which @ctx->attr is located and @ctx->al_entry will point to
- * the attribute list entry for the attribute.
- *
- * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and
- * @ctx->attr will point to the attribute in the base mft record before which
- * the attribute being searched for would need to be inserted if such an action
- * were to be desired.  @ctx->mrec will point to the mft record in which
- * @ctx->attr is located and @ctx->al_entry will point to the attribute list
- * entry of the attribute before which the attribute being searched for would
- * need to be inserted if such an action were to be desired.
- *
- * Thus to insert the not found attribute, one wants to add the attribute to
- * @ctx->mrec (the base mft record) and if there is not enough space, the
- * attribute should be placed in a newly allocated extent mft record.  The
- * attribute list entry for the inserted attribute should be inserted in the
- * attribute list attribute at @ctx->al_entry.
- *
- * On actual error, ntfs_external_attr_find() returns -EIO.  In this case
- * @ctx->attr is undefined and in particular do not rely on it not changing.
- */
-static int ntfs_external_attr_find(const ATTR_TYPE type,
-		const ntfschar *name, const u32 name_len,
-		const IGNORE_CASE_BOOL ic, const VCN lowest_vcn,
-		const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
-{
-	ntfs_inode *base_ni, *ni;
-	ntfs_volume *vol;
-	ATTR_LIST_ENTRY *al_entry, *next_al_entry;
-	u8 *al_start, *al_end;
-	ATTR_RECORD *a;
-	ntfschar *al_name;
-	u32 al_name_len;
-	int err = 0;
-	static const char *es = " Unmount and run chkdsk.";
-
-	ni = ctx->ntfs_ino;
-	base_ni = ctx->base_ntfs_ino;
-	ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type);
-	if (!base_ni) {
-		/* First call happens with the base mft record. */
-		base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino;
-		ctx->base_mrec = ctx->mrec;
-	}
-	if (ni == base_ni)
-		ctx->base_attr = ctx->attr;
-	if (type == AT_END)
-		goto not_found;
-	vol = base_ni->vol;
-	al_start = base_ni->attr_list;
-	al_end = al_start + base_ni->attr_list_size;
-	if (!ctx->al_entry)
-		ctx->al_entry = (ATTR_LIST_ENTRY*)al_start;
-	/*
-	 * Iterate over entries in attribute list starting at @ctx->al_entry,
-	 * or the entry following that, if @ctx->is_first is 'true'.
-	 */
-	if (ctx->is_first) {
-		al_entry = ctx->al_entry;
-		ctx->is_first = false;
-	} else
-		al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry +
-				le16_to_cpu(ctx->al_entry->length));
-	for (;; al_entry = next_al_entry) {
-		/* Out of bounds check. */
-		if ((u8*)al_entry < base_ni->attr_list ||
-				(u8*)al_entry > al_end)
-			break;	/* Inode is corrupt. */
-		ctx->al_entry = al_entry;
-		/* Catch the end of the attribute list. */
-		if ((u8*)al_entry == al_end)
-			goto not_found;
-		if (!al_entry->length)
-			break;
-		if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
-				le16_to_cpu(al_entry->length) > al_end)
-			break;
-		next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
-				le16_to_cpu(al_entry->length));
-		if (le32_to_cpu(al_entry->type) > le32_to_cpu(type))
-			goto not_found;
-		if (type != al_entry->type)
-			continue;
-		/*
-		 * If @name is present, compare the two names.  If @name is
-		 * missing, assume we want an unnamed attribute.
-		 */
-		al_name_len = al_entry->name_length;
-		al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset);
-		if (!name) {
-			if (al_name_len)
-				goto not_found;
-		} else if (!ntfs_are_names_equal(al_name, al_name_len, name,
-				name_len, ic, vol->upcase, vol->upcase_len)) {
-			register int rc;
-
-			rc = ntfs_collate_names(name, name_len, al_name,
-					al_name_len, 1, IGNORE_CASE,
-					vol->upcase, vol->upcase_len);
-			/*
-			 * If @name collates before al_name, there is no
-			 * matching attribute.
-			 */
-			if (rc == -1)
-				goto not_found;
-			/* If the strings are not equal, continue search. */
-			if (rc)
-				continue;
-			/*
-			 * FIXME: Reverse engineering showed 0, IGNORE_CASE but
-			 * that is inconsistent with ntfs_attr_find().  The
-			 * subsequent rc checks were also different.  Perhaps I
-			 * made a mistake in one of the two.  Need to recheck
-			 * which is correct or at least see what is going on...
-			 * (AIA)
-			 */
-			rc = ntfs_collate_names(name, name_len, al_name,
-					al_name_len, 1, CASE_SENSITIVE,
-					vol->upcase, vol->upcase_len);
-			if (rc == -1)
-				goto not_found;
-			if (rc)
-				continue;
-		}
-		/*
-		 * The names match or @name not present and attribute is
-		 * unnamed.  Now check @lowest_vcn.  Continue search if the
-		 * next attribute list entry still fits @lowest_vcn.  Otherwise
-		 * we have reached the right one or the search has failed.
-		 */
-		if (lowest_vcn && (u8*)next_al_entry >= al_start	    &&
-				(u8*)next_al_entry + 6 < al_end		    &&
-				(u8*)next_al_entry + le16_to_cpu(
-					next_al_entry->length) <= al_end    &&
-				sle64_to_cpu(next_al_entry->lowest_vcn) <=
-					lowest_vcn			    &&
-				next_al_entry->type == al_entry->type	    &&
-				next_al_entry->name_length == al_name_len   &&
-				ntfs_are_names_equal((ntfschar*)((u8*)
-					next_al_entry +
-					next_al_entry->name_offset),
-					next_al_entry->name_length,
-					al_name, al_name_len, CASE_SENSITIVE,
-					vol->upcase, vol->upcase_len))
-			continue;
-		if (MREF_LE(al_entry->mft_reference) == ni->mft_no) {
-			if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) {
-				ntfs_error(vol->sb, "Found stale mft "
-						"reference in attribute list "
-						"of base inode 0x%lx.%s",
-						base_ni->mft_no, es);
-				err = -EIO;
-				break;
-			}
-		} else { /* Mft references do not match. */
-			/* If there is a mapped record unmap it first. */
-			if (ni != base_ni)
-				unmap_extent_mft_record(ni);
-			/* Do we want the base record back? */
-			if (MREF_LE(al_entry->mft_reference) ==
-					base_ni->mft_no) {
-				ni = ctx->ntfs_ino = base_ni;
-				ctx->mrec = ctx->base_mrec;
-			} else {
-				/* We want an extent record. */
-				ctx->mrec = map_extent_mft_record(base_ni,
-						le64_to_cpu(
-						al_entry->mft_reference), &ni);
-				if (IS_ERR(ctx->mrec)) {
-					ntfs_error(vol->sb, "Failed to map "
-							"extent mft record "
-							"0x%lx of base inode "
-							"0x%lx.%s",
-							MREF_LE(al_entry->
-							mft_reference),
-							base_ni->mft_no, es);
-					err = PTR_ERR(ctx->mrec);
-					if (err == -ENOENT)
-						err = -EIO;
-					/* Cause @ctx to be sanitized below. */
-					ni = NULL;
-					break;
-				}
-				ctx->ntfs_ino = ni;
-			}
-			ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
-					le16_to_cpu(ctx->mrec->attrs_offset));
-		}
-		/*
-		 * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the
-		 * mft record containing the attribute represented by the
-		 * current al_entry.
-		 */
-		/*
-		 * We could call into ntfs_attr_find() to find the right
-		 * attribute in this mft record but this would be less
-		 * efficient and not quite accurate as ntfs_attr_find() ignores
-		 * the attribute instance numbers for example which become
-		 * important when one plays with attribute lists.  Also,
-		 * because a proper match has been found in the attribute list
-		 * entry above, the comparison can now be optimized.  So it is
-		 * worth re-implementing a simplified ntfs_attr_find() here.
-		 */
-		a = ctx->attr;
-		/*
-		 * Use a manual loop so we can still use break and continue
-		 * with the same meanings as above.
-		 */
-do_next_attr_loop:
-		if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
-				le32_to_cpu(ctx->mrec->bytes_allocated))
-			break;
-		if (a->type == AT_END)
-			break;
-		if (!a->length)
-			break;
-		if (al_entry->instance != a->instance)
-			goto do_next_attr;
-		/*
-		 * If the type and/or the name are mismatched between the
-		 * attribute list entry and the attribute record, there is
-		 * corruption so we break and return error EIO.
-		 */
-		if (al_entry->type != a->type)
-			break;
-		if (!ntfs_are_names_equal((ntfschar*)((u8*)a +
-				le16_to_cpu(a->name_offset)), a->name_length,
-				al_name, al_name_len, CASE_SENSITIVE,
-				vol->upcase, vol->upcase_len))
-			break;
-		ctx->attr = a;
-		/*
-		 * If no @val specified or @val specified and it matches, we
-		 * have found it!
-		 */
-		if (!val || (!a->non_resident && le32_to_cpu(
-				a->data.resident.value_length) == val_len &&
-				!memcmp((u8*)a +
-				le16_to_cpu(a->data.resident.value_offset),
-				val, val_len))) {
-			ntfs_debug("Done, found.");
-			return 0;
-		}
-do_next_attr:
-		/* Proceed to the next attribute in the current mft record. */
-		a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
-		goto do_next_attr_loop;
-	}
-	if (!err) {
-		ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt "
-				"attribute list attribute.%s", base_ni->mft_no,
-				es);
-		err = -EIO;
-	}
-	if (ni != base_ni) {
-		if (ni)
-			unmap_extent_mft_record(ni);
-		ctx->ntfs_ino = base_ni;
-		ctx->mrec = ctx->base_mrec;
-		ctx->attr = ctx->base_attr;
-	}
-	if (err != -ENOMEM)
-		NVolSetErrors(vol);
-	return err;
-not_found:
-	/*
-	 * If we were looking for AT_END, we reset the search context @ctx and
-	 * use ntfs_attr_find() to seek to the end of the base mft record.
-	 */
-	if (type == AT_END) {
-		ntfs_attr_reinit_search_ctx(ctx);
-		return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len,
-				ctx);
-	}
-	/*
-	 * The attribute was not found.  Before we return, we want to ensure
-	 * @ctx->mrec and @ctx->attr indicate the position at which the
-	 * attribute should be inserted in the base mft record.  Since we also
-	 * want to preserve @ctx->al_entry we cannot reinitialize the search
-	 * context using ntfs_attr_reinit_search_ctx() as this would set
-	 * @ctx->al_entry to NULL.  Thus we do the necessary bits manually (see
-	 * ntfs_attr_init_search_ctx() below).  Note, we _only_ preserve
-	 * @ctx->al_entry as the remaining fields (base_*) are identical to
-	 * their non base_ counterparts and we cannot set @ctx->base_attr
-	 * correctly yet as we do not know what @ctx->attr will be set to by
-	 * the call to ntfs_attr_find() below.
-	 */
-	if (ni != base_ni)
-		unmap_extent_mft_record(ni);
-	ctx->mrec = ctx->base_mrec;
-	ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
-			le16_to_cpu(ctx->mrec->attrs_offset));
-	ctx->is_first = true;
-	ctx->ntfs_ino = base_ni;
-	ctx->base_ntfs_ino = NULL;
-	ctx->base_mrec = NULL;
-	ctx->base_attr = NULL;
-	/*
-	 * In case there are multiple matches in the base mft record, need to
-	 * keep enumerating until we get an attribute not found response (or
-	 * another error), otherwise we would keep returning the same attribute
-	 * over and over again and all programs using us for enumeration would
-	 * lock up in a tight loop.
-	 */
-	do {
-		err = ntfs_attr_find(type, name, name_len, ic, val, val_len,
-				ctx);
-	} while (!err);
-	ntfs_debug("Done, not found.");
-	return err;
-}
-
-/**
- * ntfs_attr_lookup - find an attribute in an ntfs inode
- * @type:	attribute type to find
- * @name:	attribute name to find (optional, i.e. NULL means don't care)
- * @name_len:	attribute name length (only needed if @name present)
- * @ic:		IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
- * @lowest_vcn:	lowest vcn to find (optional, non-resident attributes only)
- * @val:	attribute value to find (optional, resident attributes only)
- * @val_len:	attribute value length
- * @ctx:	search context with mft record and attribute to search from
- *
- * Find an attribute in an ntfs inode.  On first search @ctx->ntfs_ino must
- * be the base mft record and @ctx must have been obtained from a call to
- * ntfs_attr_get_search_ctx().
- *
- * This function transparently handles attribute lists and @ctx is used to
- * continue searches where they were left off at.
- *
- * After finishing with the attribute/mft record you need to call
- * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
- * mapped inodes, etc).
- *
- * Return 0 if the search was successful and -errno if not.
- *
- * When 0, @ctx->attr is the found attribute and it is in mft record
- * @ctx->mrec.  If an attribute list attribute is present, @ctx->al_entry is
- * the attribute list entry of the found attribute.
- *
- * When -ENOENT, @ctx->attr is the attribute which collates just after the
- * attribute being searched for, i.e. if one wants to add the attribute to the
- * mft record this is the correct place to insert it into.  If an attribute
- * list attribute is present, @ctx->al_entry is the attribute list entry which
- * collates just after the attribute list entry of the attribute being searched
- * for, i.e. if one wants to add the attribute to the mft record this is the
- * correct place to insert its attribute list entry into.
- *
- * When -errno != -ENOENT, an error occurred during the lookup.  @ctx->attr is
- * then undefined and in particular you should not rely on it not changing.
- */
-int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
-		const u32 name_len, const IGNORE_CASE_BOOL ic,
-		const VCN lowest_vcn, const u8 *val, const u32 val_len,
-		ntfs_attr_search_ctx *ctx)
-{
-	ntfs_inode *base_ni;
-
-	ntfs_debug("Entering.");
-	BUG_ON(IS_ERR(ctx->mrec));
-	if (ctx->base_ntfs_ino)
-		base_ni = ctx->base_ntfs_ino;
-	else
-		base_ni = ctx->ntfs_ino;
-	/* Sanity check, just for debugging really. */
-	BUG_ON(!base_ni);
-	if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST)
-		return ntfs_attr_find(type, name, name_len, ic, val, val_len,
-				ctx);
-	return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn,
-			val, val_len, ctx);
-}
-
-/**
- * ntfs_attr_init_search_ctx - initialize an attribute search context
- * @ctx:	attribute search context to initialize
- * @ni:		ntfs inode with which to initialize the search context
- * @mrec:	mft record with which to initialize the search context
- *
- * Initialize the attribute search context @ctx with @ni and @mrec.
- */
-static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx,
-		ntfs_inode *ni, MFT_RECORD *mrec)
-{
-	*ctx = (ntfs_attr_search_ctx) {
-		.mrec = mrec,
-		/* Sanity checks are performed elsewhere. */
-		.attr = (ATTR_RECORD*)((u8*)mrec +
-				le16_to_cpu(mrec->attrs_offset)),
-		.is_first = true,
-		.ntfs_ino = ni,
-	};
-}
-
-/**
- * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context
- * @ctx:	attribute search context to reinitialize
- *
- * Reinitialize the attribute search context @ctx, unmapping an associated
- * extent mft record if present, and initialize the search context again.
- *
- * This is used when a search for a new attribute is being started to reset
- * the search context to the beginning.
- */
-void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx)
-{
-	if (likely(!ctx->base_ntfs_ino)) {
-		/* No attribute list. */
-		ctx->is_first = true;
-		/* Sanity checks are performed elsewhere. */
-		ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
-				le16_to_cpu(ctx->mrec->attrs_offset));
-		/*
-		 * This needs resetting due to ntfs_external_attr_find() which
-		 * can leave it set despite having zeroed ctx->base_ntfs_ino.
-		 */
-		ctx->al_entry = NULL;
-		return;
-	} /* Attribute list. */
-	if (ctx->ntfs_ino != ctx->base_ntfs_ino)
-		unmap_extent_mft_record(ctx->ntfs_ino);
-	ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec);
-	return;
-}
-
-/**
- * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context
- * @ni:		ntfs inode with which to initialize the search context
- * @mrec:	mft record with which to initialize the search context
- *
- * Allocate a new attribute search context, initialize it with @ni and @mrec,
- * and return it. Return NULL if allocation failed.
- */
-ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec)
-{
-	ntfs_attr_search_ctx *ctx;
-
-	ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS);
-	if (ctx)
-		ntfs_attr_init_search_ctx(ctx, ni, mrec);
-	return ctx;
-}
-
-/**
- * ntfs_attr_put_search_ctx - release an attribute search context
- * @ctx:	attribute search context to free
- *
- * Release the attribute search context @ctx, unmapping an associated extent
- * mft record if present.
- */
-void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx)
-{
-	if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino)
-		unmap_extent_mft_record(ctx->ntfs_ino);
-	kmem_cache_free(ntfs_attr_ctx_cache, ctx);
-	return;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file
- * @vol:	ntfs volume to which the attribute belongs
- * @type:	attribute type which to find
- *
- * Search for the attribute definition record corresponding to the attribute
- * @type in the $AttrDef system file.
- *
- * Return the attribute type definition record if found and NULL if not found.
- */
-static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol,
-		const ATTR_TYPE type)
-{
-	ATTR_DEF *ad;
-
-	BUG_ON(!vol->attrdef);
-	BUG_ON(!type);
-	for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef <
-			vol->attrdef_size && ad->type; ++ad) {
-		/* We have not found it yet, carry on searching. */
-		if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type)))
-			continue;
-		/* We found the attribute; return it. */
-		if (likely(ad->type == type))
-			return ad;
-		/* We have gone too far already.  No point in continuing. */
-		break;
-	}
-	/* Attribute not found. */
-	ntfs_debug("Attribute type 0x%x not found in $AttrDef.",
-			le32_to_cpu(type));
-	return NULL;
-}
-
-/**
- * ntfs_attr_size_bounds_check - check a size of an attribute type for validity
- * @vol:	ntfs volume to which the attribute belongs
- * @type:	attribute type which to check
- * @size:	size which to check
- *
- * Check whether the @size in bytes is valid for an attribute of @type on the
- * ntfs volume @vol.  This information is obtained from $AttrDef system file.
- *
- * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not
- * listed in $AttrDef.
- */
-int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type,
-		const s64 size)
-{
-	ATTR_DEF *ad;
-
-	BUG_ON(size < 0);
-	/*
-	 * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not
-	 * listed in $AttrDef.
-	 */
-	if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024))
-		return -ERANGE;
-	/* Get the $AttrDef entry for the attribute @type. */
-	ad = ntfs_attr_find_in_attrdef(vol, type);
-	if (unlikely(!ad))
-		return -ENOENT;
-	/* Do the bounds check. */
-	if (((sle64_to_cpu(ad->min_size) > 0) &&
-			size < sle64_to_cpu(ad->min_size)) ||
-			((sle64_to_cpu(ad->max_size) > 0) && size >
-			sle64_to_cpu(ad->max_size)))
-		return -ERANGE;
-	return 0;
-}
-
-/**
- * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident
- * @vol:	ntfs volume to which the attribute belongs
- * @type:	attribute type which to check
- *
- * Check whether the attribute of @type on the ntfs volume @vol is allowed to
- * be non-resident.  This information is obtained from $AttrDef system file.
- *
- * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and
- * -ENOENT if the attribute is not listed in $AttrDef.
- */
-int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
-{
-	ATTR_DEF *ad;
-
-	/* Find the attribute definition record in $AttrDef. */
-	ad = ntfs_attr_find_in_attrdef(vol, type);
-	if (unlikely(!ad))
-		return -ENOENT;
-	/* Check the flags and return the result. */
-	if (ad->flags & ATTR_DEF_RESIDENT)
-		return -EPERM;
-	return 0;
-}
-
-/**
- * ntfs_attr_can_be_resident - check if an attribute can be resident
- * @vol:	ntfs volume to which the attribute belongs
- * @type:	attribute type which to check
- *
- * Check whether the attribute of @type on the ntfs volume @vol is allowed to
- * be resident.  This information is derived from our ntfs knowledge and may
- * not be completely accurate, especially when user defined attributes are
- * present.  Basically we allow everything to be resident except for index
- * allocation and $EA attributes.
- *
- * Return 0 if the attribute is allowed to be non-resident and -EPERM if not.
- *
- * Warning: In the system file $MFT the attribute $Bitmap must be non-resident
- *	    otherwise windows will not boot (blue screen of death)!  We cannot
- *	    check for this here as we do not know which inode's $Bitmap is
- *	    being asked about so the caller needs to special case this.
- */
-int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
-{
-	if (type == AT_INDEX_ALLOCATION)
-		return -EPERM;
-	return 0;
-}
-
-/**
- * ntfs_attr_record_resize - resize an attribute record
- * @m:		mft record containing attribute record
- * @a:		attribute record to resize
- * @new_size:	new size in bytes to which to resize the attribute record @a
- *
- * Resize the attribute record @a, i.e. the resident part of the attribute, in
- * the mft record @m to @new_size bytes.
- *
- * Return 0 on success and -errno on error.  The following error codes are
- * defined:
- *	-ENOSPC	- Not enough space in the mft record @m to perform the resize.
- *
- * Note: On error, no modifications have been performed whatsoever.
- *
- * Warning: If you make a record smaller without having copied all the data you
- *	    are interested in the data may be overwritten.
- */
-int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
-{
-	ntfs_debug("Entering for new_size %u.", new_size);
-	/* Align to 8 bytes if it is not already done. */
-	if (new_size & 7)
-		new_size = (new_size + 7) & ~7;
-	/* If the actual attribute length has changed, move things around. */
-	if (new_size != le32_to_cpu(a->length)) {
-		u32 new_muse = le32_to_cpu(m->bytes_in_use) -
-				le32_to_cpu(a->length) + new_size;
-		/* Not enough space in this mft record. */
-		if (new_muse > le32_to_cpu(m->bytes_allocated))
-			return -ENOSPC;
-		/* Move attributes following @a to their new location. */
-		memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length),
-				le32_to_cpu(m->bytes_in_use) - ((u8*)a -
-				(u8*)m) - le32_to_cpu(a->length));
-		/* Adjust @m to reflect the change in used space. */
-		m->bytes_in_use = cpu_to_le32(new_muse);
-		/* Adjust @a to reflect the new size. */
-		if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length))
-			a->length = cpu_to_le32(new_size);
-	}
-	return 0;
-}
-
-/**
- * ntfs_resident_attr_value_resize - resize the value of a resident attribute
- * @m:		mft record containing attribute record
- * @a:		attribute record whose value to resize
- * @new_size:	new size in bytes to which to resize the attribute value of @a
- *
- * Resize the value of the attribute @a in the mft record @m to @new_size bytes.
- * If the value is made bigger, the newly allocated space is cleared.
- *
- * Return 0 on success and -errno on error.  The following error codes are
- * defined:
- *	-ENOSPC	- Not enough space in the mft record @m to perform the resize.
- *
- * Note: On error, no modifications have been performed whatsoever.
- *
- * Warning: If you make a record smaller without having copied all the data you
- *	    are interested in the data may be overwritten.
- */
-int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
-		const u32 new_size)
-{
-	u32 old_size;
-
-	/* Resize the resident part of the attribute record. */
-	if (ntfs_attr_record_resize(m, a,
-			le16_to_cpu(a->data.resident.value_offset) + new_size))
-		return -ENOSPC;
-	/*
-	 * The resize succeeded!  If we made the attribute value bigger, clear
-	 * the area between the old size and @new_size.
-	 */
-	old_size = le32_to_cpu(a->data.resident.value_length);
-	if (new_size > old_size)
-		memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
-				old_size, 0, new_size - old_size);
-	/* Finally update the length of the attribute value. */
-	a->data.resident.value_length = cpu_to_le32(new_size);
-	return 0;
-}
-
-/**
- * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
- * @ni:		ntfs inode describing the attribute to convert
- * @data_size:	size of the resident data to copy to the non-resident attribute
- *
- * Convert the resident ntfs attribute described by the ntfs inode @ni to a
- * non-resident one.
- *
- * @data_size must be equal to the attribute value size.  This is needed since
- * we need to know the size before we can map the mft record and our callers
- * always know it.  The reason we cannot simply read the size from the vfs
- * inode i_size is that this is not necessarily uptodate.  This happens when
- * ntfs_attr_make_non_resident() is called in the ->truncate call path(s).
- *
- * Return 0 on success and -errno on error.  The following error return codes
- * are defined:
- *	-EPERM	- The attribute is not allowed to be non-resident.
- *	-ENOMEM	- Not enough memory.
- *	-ENOSPC	- Not enough disk space.
- *	-EINVAL	- Attribute not defined on the volume.
- *	-EIO	- I/o error or other error.
- * Note that -ENOSPC is also returned in the case that there is not enough
- * space in the mft record to do the conversion.  This can happen when the mft
- * record is already very full.  The caller is responsible for trying to make
- * space in the mft record and trying again.  FIXME: Do we need a separate
- * error return code for this kind of -ENOSPC or is it always worth trying
- * again in case the attribute may then fit in a resident state so no need to
- * make it non-resident at all?  Ho-hum...  (AIA)
- *
- * NOTE to self: No changes in the attribute list are required to move from
- *		 a resident to a non-resident attribute.
- *
- * Locking: - The caller must hold i_mutex on the inode.
- */
-int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
-{
-	s64 new_size;
-	struct inode *vi = VFS_I(ni);
-	ntfs_volume *vol = ni->vol;
-	ntfs_inode *base_ni;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx;
-	struct page *page;
-	runlist_element *rl;
-	u8 *kaddr;
-	unsigned long flags;
-	int mp_size, mp_ofs, name_ofs, arec_size, err, err2;
-	u32 attr_size;
-	u8 old_res_attr_flags;
-
-	/* Check that the attribute is allowed to be non-resident. */
-	err = ntfs_attr_can_be_non_resident(vol, ni->type);
-	if (unlikely(err)) {
-		if (err == -EPERM)
-			ntfs_debug("Attribute is not allowed to be "
-					"non-resident.");
-		else
-			ntfs_debug("Attribute not defined on the NTFS "
-					"volume!");
-		return err;
-	}
-	/*
-	 * FIXME: Compressed and encrypted attributes are not supported when
-	 * writing and we should never have gotten here for them.
-	 */
-	BUG_ON(NInoCompressed(ni));
-	BUG_ON(NInoEncrypted(ni));
-	/*
-	 * The size needs to be aligned to a cluster boundary for allocation
-	 * purposes.
-	 */
-	new_size = (data_size + vol->cluster_size - 1) &
-			~(vol->cluster_size - 1);
-	if (new_size > 0) {
-		/*
-		 * Will need the page later and since the page lock nests
-		 * outside all ntfs locks, we need to get the page now.
-		 */
-		page = find_or_create_page(vi->i_mapping, 0,
-				mapping_gfp_mask(vi->i_mapping));
-		if (unlikely(!page))
-			return -ENOMEM;
-		/* Start by allocating clusters to hold the attribute value. */
-		rl = ntfs_cluster_alloc(vol, 0, new_size >>
-				vol->cluster_size_bits, -1, DATA_ZONE, true);
-		if (IS_ERR(rl)) {
-			err = PTR_ERR(rl);
-			ntfs_debug("Failed to allocate cluster%s, error code "
-					"%i.", (new_size >>
-					vol->cluster_size_bits) > 1 ? "s" : "",
-					err);
-			goto page_err_out;
-		}
-	} else {
-		rl = NULL;
-		page = NULL;
-	}
-	/* Determine the size of the mapping pairs array. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1);
-	if (unlikely(mp_size < 0)) {
-		err = mp_size;
-		ntfs_debug("Failed to get size for mapping pairs array, error "
-				"code %i.", err);
-		goto rl_err_out;
-	}
-	down_write(&ni->runlist.lock);
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	BUG_ON(NInoNonResident(ni));
-	BUG_ON(a->non_resident);
-	/*
-	 * Calculate new offsets for the name and the mapping pairs array.
-	 */
-	if (NInoSparse(ni) || NInoCompressed(ni))
-		name_ofs = (offsetof(ATTR_REC,
-				data.non_resident.compressed_size) +
-				sizeof(a->data.non_resident.compressed_size) +
-				7) & ~7;
-	else
-		name_ofs = (offsetof(ATTR_REC,
-				data.non_resident.compressed_size) + 7) & ~7;
-	mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
-	/*
-	 * Determine the size of the resident part of the now non-resident
-	 * attribute record.
-	 */
-	arec_size = (mp_ofs + mp_size + 7) & ~7;
-	/*
-	 * If the page is not uptodate bring it uptodate by copying from the
-	 * attribute value.
-	 */
-	attr_size = le32_to_cpu(a->data.resident.value_length);
-	BUG_ON(attr_size != data_size);
-	if (page && !PageUptodate(page)) {
-		kaddr = kmap_atomic(page);
-		memcpy(kaddr, (u8*)a +
-				le16_to_cpu(a->data.resident.value_offset),
-				attr_size);
-		memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size);
-		kunmap_atomic(kaddr);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
-	}
-	/* Backup the attribute flag. */
-	old_res_attr_flags = a->data.resident.flags;
-	/* Resize the resident part of the attribute record. */
-	err = ntfs_attr_record_resize(m, a, arec_size);
-	if (unlikely(err))
-		goto err_out;
-	/*
-	 * Convert the resident part of the attribute record to describe a
-	 * non-resident attribute.
-	 */
-	a->non_resident = 1;
-	/* Move the attribute name if it exists and update the offset. */
-	if (a->name_length)
-		memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
-				a->name_length * sizeof(ntfschar));
-	a->name_offset = cpu_to_le16(name_ofs);
-	/* Setup the fields specific to non-resident attributes. */
-	a->data.non_resident.lowest_vcn = 0;
-	a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >>
-			vol->cluster_size_bits);
-	a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs);
-	memset(&a->data.non_resident.reserved, 0,
-			sizeof(a->data.non_resident.reserved));
-	a->data.non_resident.allocated_size = cpu_to_sle64(new_size);
-	a->data.non_resident.data_size =
-			a->data.non_resident.initialized_size =
-			cpu_to_sle64(attr_size);
-	if (NInoSparse(ni) || NInoCompressed(ni)) {
-		a->data.non_resident.compression_unit = 0;
-		if (NInoCompressed(ni) || vol->major_ver < 3)
-			a->data.non_resident.compression_unit = 4;
-		a->data.non_resident.compressed_size =
-				a->data.non_resident.allocated_size;
-	} else
-		a->data.non_resident.compression_unit = 0;
-	/* Generate the mapping pairs array into the attribute record. */
-	err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs,
-			arec_size - mp_ofs, rl, 0, -1, NULL);
-	if (unlikely(err)) {
-		ntfs_debug("Failed to build mapping pairs, error code %i.",
-				err);
-		goto undo_err_out;
-	}
-	/* Setup the in-memory attribute structure to be non-resident. */
-	ni->runlist.rl = rl;
-	write_lock_irqsave(&ni->size_lock, flags);
-	ni->allocated_size = new_size;
-	if (NInoSparse(ni) || NInoCompressed(ni)) {
-		ni->itype.compressed.size = ni->allocated_size;
-		if (a->data.non_resident.compression_unit) {
-			ni->itype.compressed.block_size = 1U << (a->data.
-					non_resident.compression_unit +
-					vol->cluster_size_bits);
-			ni->itype.compressed.block_size_bits =
-					ffs(ni->itype.compressed.block_size) -
-					1;
-			ni->itype.compressed.block_clusters = 1U <<
-					a->data.non_resident.compression_unit;
-		} else {
-			ni->itype.compressed.block_size = 0;
-			ni->itype.compressed.block_size_bits = 0;
-			ni->itype.compressed.block_clusters = 0;
-		}
-		vi->i_blocks = ni->itype.compressed.size >> 9;
-	} else
-		vi->i_blocks = ni->allocated_size >> 9;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-	/*
-	 * This needs to be last since the address space operations ->read_folio
-	 * and ->writepage can run concurrently with us as they are not
-	 * serialized on i_mutex.  Note, we are not allowed to fail once we flip
-	 * this switch, which is another reason to do this last.
-	 */
-	NInoSetNonResident(ni);
-	/* Mark the mft record dirty, so it gets written back. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-	if (page) {
-		set_page_dirty(page);
-		unlock_page(page);
-		put_page(page);
-	}
-	ntfs_debug("Done.");
-	return 0;
-undo_err_out:
-	/* Convert the attribute back into a resident attribute. */
-	a->non_resident = 0;
-	/* Move the attribute name if it exists and update the offset. */
-	name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) +
-			sizeof(a->data.resident.reserved) + 7) & ~7;
-	if (a->name_length)
-		memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
-				a->name_length * sizeof(ntfschar));
-	mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
-	a->name_offset = cpu_to_le16(name_ofs);
-	arec_size = (mp_ofs + attr_size + 7) & ~7;
-	/* Resize the resident part of the attribute record. */
-	err2 = ntfs_attr_record_resize(m, a, arec_size);
-	if (unlikely(err2)) {
-		/*
-		 * This cannot happen (well if memory corruption is at work it
-		 * could happen in theory), but deal with it as well as we can.
-		 * If the old size is too small, truncate the attribute,
-		 * otherwise simply give it a larger allocated size.
-		 * FIXME: Should check whether chkdsk complains when the
-		 * allocated size is much bigger than the resident value size.
-		 */
-		arec_size = le32_to_cpu(a->length);
-		if ((mp_ofs + attr_size) > arec_size) {
-			err2 = attr_size;
-			attr_size = arec_size - mp_ofs;
-			ntfs_error(vol->sb, "Failed to undo partial resident "
-					"to non-resident attribute "
-					"conversion.  Truncating inode 0x%lx, "
-					"attribute type 0x%x from %i bytes to "
-					"%i bytes to maintain metadata "
-					"consistency.  THIS MEANS YOU ARE "
-					"LOSING %i BYTES DATA FROM THIS %s.",
-					vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type),
-					err2, attr_size, err2 - attr_size,
-					((ni->type == AT_DATA) &&
-					!ni->name_len) ? "FILE": "ATTRIBUTE");
-			write_lock_irqsave(&ni->size_lock, flags);
-			ni->initialized_size = attr_size;
-			i_size_write(vi, attr_size);
-			write_unlock_irqrestore(&ni->size_lock, flags);
-		}
-	}
-	/* Setup the fields specific to resident attributes. */
-	a->data.resident.value_length = cpu_to_le32(attr_size);
-	a->data.resident.value_offset = cpu_to_le16(mp_ofs);
-	a->data.resident.flags = old_res_attr_flags;
-	memset(&a->data.resident.reserved, 0,
-			sizeof(a->data.resident.reserved));
-	/* Copy the data from the page back to the attribute value. */
-	if (page) {
-		kaddr = kmap_atomic(page);
-		memcpy((u8*)a + mp_ofs, kaddr, attr_size);
-		kunmap_atomic(kaddr);
-	}
-	/* Setup the allocated size in the ntfs inode in case it changed. */
-	write_lock_irqsave(&ni->size_lock, flags);
-	ni->allocated_size = arec_size - mp_ofs;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-	/* Mark the mft record dirty, so it gets written back. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	ni->runlist.rl = NULL;
-	up_write(&ni->runlist.lock);
-rl_err_out:
-	if (rl) {
-		if (ntfs_cluster_free_from_rl(vol, rl) < 0) {
-			ntfs_error(vol->sb, "Failed to release allocated "
-					"cluster(s) in error code path.  Run "
-					"chkdsk to recover the lost "
-					"cluster(s).");
-			NVolSetErrors(vol);
-		}
-		ntfs_free(rl);
-page_err_out:
-		unlock_page(page);
-		put_page(page);
-	}
-	if (err == -EINVAL)
-		err = -EIO;
-	return err;
-}
-
-/**
- * ntfs_attr_extend_allocation - extend the allocated space of an attribute
- * @ni:			ntfs inode of the attribute whose allocation to extend
- * @new_alloc_size:	new size in bytes to which to extend the allocation to
- * @new_data_size:	new size in bytes to which to extend the data to
- * @data_start:		beginning of region which is required to be non-sparse
- *
- * Extend the allocated space of an attribute described by the ntfs inode @ni
- * to @new_alloc_size bytes.  If @data_start is -1, the whole extension may be
- * implemented as a hole in the file (as long as both the volume and the ntfs
- * inode @ni have sparse support enabled).  If @data_start is >= 0, then the
- * region between the old allocated size and @data_start - 1 may be made sparse
- * but the regions between @data_start and @new_alloc_size must be backed by
- * actual clusters.
- *
- * If @new_data_size is -1, it is ignored.  If it is >= 0, then the data size
- * of the attribute is extended to @new_data_size.  Note that the i_size of the
- * vfs inode is not updated.  Only the data size in the base attribute record
- * is updated.  The caller has to update i_size separately if this is required.
- * WARNING: It is a BUG() for @new_data_size to be smaller than the old data
- * size as well as for @new_data_size to be greater than @new_alloc_size.
- *
- * For resident attributes this involves resizing the attribute record and if
- * necessary moving it and/or other attributes into extent mft records and/or
- * converting the attribute to a non-resident attribute which in turn involves
- * extending the allocation of a non-resident attribute as described below.
- *
- * For non-resident attributes this involves allocating clusters in the data
- * zone on the volume (except for regions that are being made sparse) and
- * extending the run list to describe the allocated clusters as well as
- * updating the mapping pairs array of the attribute.  This in turn involves
- * resizing the attribute record and if necessary moving it and/or other
- * attributes into extent mft records and/or splitting the attribute record
- * into multiple extent attribute records.
- *
- * Also, the attribute list attribute is updated if present and in some of the
- * above cases (the ones where extent mft records/attributes come into play),
- * an attribute list attribute is created if not already present.
- *
- * Return the new allocated size on success and -errno on error.  In the case
- * that an error is encountered but a partial extension at least up to
- * @data_start (if present) is possible, the allocation is partially extended
- * and this is returned.  This means the caller must check the returned size to
- * determine if the extension was partial.  If @data_start is -1 then partial
- * allocations are not performed.
- *
- * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA.
- *
- * Locking: This function takes the runlist lock of @ni for writing as well as
- * locking the mft record of the base ntfs inode.  These locks are maintained
- * throughout execution of the function.  These locks are required so that the
- * attribute can be resized safely and so that it can for example be converted
- * from resident to non-resident safely.
- *
- * TODO: At present attribute list attribute handling is not implemented.
- *
- * TODO: At present it is not safe to call this function for anything other
- * than the $DATA attribute(s) of an uncompressed and unencrypted file.
- */
-s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
-		const s64 new_data_size, const s64 data_start)
-{
-	VCN vcn;
-	s64 ll, allocated_size, start = data_start;
-	struct inode *vi = VFS_I(ni);
-	ntfs_volume *vol = ni->vol;
-	ntfs_inode *base_ni;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx;
-	runlist_element *rl, *rl2;
-	unsigned long flags;
-	int err, mp_size;
-	u32 attr_len = 0; /* Silence stupid gcc warning. */
-	bool mp_rebuilt;
-
-#ifdef DEBUG
-	read_lock_irqsave(&ni->size_lock, flags);
-	allocated_size = ni->allocated_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
-			"old_allocated_size 0x%llx, "
-			"new_allocated_size 0x%llx, new_data_size 0x%llx, "
-			"data_start 0x%llx.", vi->i_ino,
-			(unsigned)le32_to_cpu(ni->type),
-			(unsigned long long)allocated_size,
-			(unsigned long long)new_alloc_size,
-			(unsigned long long)new_data_size,
-			(unsigned long long)start);
-#endif
-retry_extend:
-	/*
-	 * For non-resident attributes, @start and @new_size need to be aligned
-	 * to cluster boundaries for allocation purposes.
-	 */
-	if (NInoNonResident(ni)) {
-		if (start > 0)
-			start &= ~(s64)vol->cluster_size_mask;
-		new_alloc_size = (new_alloc_size + vol->cluster_size - 1) &
-				~(s64)vol->cluster_size_mask;
-	}
-	BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size);
-	/* Check if new size is allowed in $AttrDef. */
-	err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size);
-	if (unlikely(err)) {
-		/* Only emit errors when the write will fail completely. */
-		read_lock_irqsave(&ni->size_lock, flags);
-		allocated_size = ni->allocated_size;
-		read_unlock_irqrestore(&ni->size_lock, flags);
-		if (start < 0 || start >= allocated_size) {
-			if (err == -ERANGE) {
-				ntfs_error(vol->sb, "Cannot extend allocation "
-						"of inode 0x%lx, attribute "
-						"type 0x%x, because the new "
-						"allocation would exceed the "
-						"maximum allowed size for "
-						"this attribute type.",
-						vi->i_ino, (unsigned)
-						le32_to_cpu(ni->type));
-			} else {
-				ntfs_error(vol->sb, "Cannot extend allocation "
-						"of inode 0x%lx, attribute "
-						"type 0x%x, because this "
-						"attribute type is not "
-						"defined on the NTFS volume.  "
-						"Possible corruption!  You "
-						"should run chkdsk!",
-						vi->i_ino, (unsigned)
-						le32_to_cpu(ni->type));
-			}
-		}
-		/* Translate error code to be POSIX conformant for write(2). */
-		if (err == -ERANGE)
-			err = -EFBIG;
-		else
-			err = -EIO;
-		return err;
-	}
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	/*
-	 * We will be modifying both the runlist (if non-resident) and the mft
-	 * record so lock them both down.
-	 */
-	down_write(&ni->runlist.lock);
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	read_lock_irqsave(&ni->size_lock, flags);
-	allocated_size = ni->allocated_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	/*
-	 * If non-resident, seek to the last extent.  If resident, there is
-	 * only one extent, so seek to that.
-	 */
-	vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits :
-			0;
-	/*
-	 * Abort if someone did the work whilst we waited for the locks.  If we
-	 * just converted the attribute from resident to non-resident it is
-	 * likely that exactly this has happened already.  We cannot quite
-	 * abort if we need to update the data size.
-	 */
-	if (unlikely(new_alloc_size <= allocated_size)) {
-		ntfs_debug("Allocated size already exceeds requested size.");
-		new_alloc_size = allocated_size;
-		if (new_data_size < 0)
-			goto done;
-		/*
-		 * We want the first attribute extent so that we can update the
-		 * data size.
-		 */
-		vcn = 0;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, vcn, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	/* Use goto to reduce indentation. */
-	if (a->non_resident)
-		goto do_non_resident_extend;
-	BUG_ON(NInoNonResident(ni));
-	/* The total length of the attribute value. */
-	attr_len = le32_to_cpu(a->data.resident.value_length);
-	/*
-	 * Extend the attribute record to be able to store the new attribute
-	 * size.  ntfs_attr_record_resize() will not do anything if the size is
-	 * not changing.
-	 */
-	if (new_alloc_size < vol->mft_record_size &&
-			!ntfs_attr_record_resize(m, a,
-			le16_to_cpu(a->data.resident.value_offset) +
-			new_alloc_size)) {
-		/* The resize succeeded! */
-		write_lock_irqsave(&ni->size_lock, flags);
-		ni->allocated_size = le32_to_cpu(a->length) -
-				le16_to_cpu(a->data.resident.value_offset);
-		write_unlock_irqrestore(&ni->size_lock, flags);
-		if (new_data_size >= 0) {
-			BUG_ON(new_data_size < attr_len);
-			a->data.resident.value_length =
-					cpu_to_le32((u32)new_data_size);
-		}
-		goto flush_done;
-	}
-	/*
-	 * We have to drop all the locks so we can call
-	 * ntfs_attr_make_non_resident().  This could be optimised by try-
-	 * locking the first page cache page and only if that fails dropping
-	 * the locks, locking the page, and redoing all the locking and
-	 * lookups.  While this would be a huge optimisation, it is not worth
-	 * it as this is definitely a slow code path.
-	 */
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-	/*
-	 * Not enough space in the mft record, try to make the attribute
-	 * non-resident and if successful restart the extension process.
-	 */
-	err = ntfs_attr_make_non_resident(ni, attr_len);
-	if (likely(!err))
-		goto retry_extend;
-	/*
-	 * Could not make non-resident.  If this is due to this not being
-	 * permitted for this attribute type or there not being enough space,
-	 * try to make other attributes non-resident.  Otherwise fail.
-	 */
-	if (unlikely(err != -EPERM && err != -ENOSPC)) {
-		/* Only emit errors when the write will fail completely. */
-		read_lock_irqsave(&ni->size_lock, flags);
-		allocated_size = ni->allocated_size;
-		read_unlock_irqrestore(&ni->size_lock, flags);
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because the conversion from resident "
-					"to non-resident attribute failed "
-					"with error code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		if (err != -ENOMEM)
-			err = -EIO;
-		goto conv_err_out;
-	}
-	/* TODO: Not implemented from here, abort. */
-	read_lock_irqsave(&ni->size_lock, flags);
-	allocated_size = ni->allocated_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	if (start < 0 || start >= allocated_size) {
-		if (err == -ENOSPC)
-			ntfs_error(vol->sb, "Not enough space in the mft "
-					"record/on disk for the non-resident "
-					"attribute value.  This case is not "
-					"implemented yet.");
-		else /* if (err == -EPERM) */
-			ntfs_error(vol->sb, "This attribute type may not be "
-					"non-resident.  This case is not "
-					"implemented yet.");
-	}
-	err = -EOPNOTSUPP;
-	goto conv_err_out;
-#if 0
-	// TODO: Attempt to make other attributes non-resident.
-	if (!err)
-		goto do_resident_extend;
-	/*
-	 * Both the attribute list attribute and the standard information
-	 * attribute must remain in the base inode.  Thus, if this is one of
-	 * these attributes, we have to try to move other attributes out into
-	 * extent mft records instead.
-	 */
-	if (ni->type == AT_ATTRIBUTE_LIST ||
-			ni->type == AT_STANDARD_INFORMATION) {
-		// TODO: Attempt to move other attributes into extent mft
-		// records.
-		err = -EOPNOTSUPP;
-		if (!err)
-			goto do_resident_extend;
-		goto err_out;
-	}
-	// TODO: Attempt to move this attribute to an extent mft record, but
-	// only if it is not already the only attribute in an mft record in
-	// which case there would be nothing to gain.
-	err = -EOPNOTSUPP;
-	if (!err)
-		goto do_resident_extend;
-	/* There is nothing we can do to make enough space. )-: */
-	goto err_out;
-#endif
-do_non_resident_extend:
-	BUG_ON(!NInoNonResident(ni));
-	if (new_alloc_size == allocated_size) {
-		BUG_ON(vcn);
-		goto alloc_done;
-	}
-	/*
-	 * If the data starts after the end of the old allocation, this is a
-	 * $DATA attribute and sparse attributes are enabled on the volume and
-	 * for this inode, then create a sparse region between the old
-	 * allocated size and the start of the data.  Otherwise simply proceed
-	 * with filling the whole space between the old allocated size and the
-	 * new allocated size with clusters.
-	 */
-	if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA ||
-			!NVolSparseEnabled(vol) || NInoSparseDisabled(ni))
-		goto skip_sparse;
-	// TODO: This is not implemented yet.  We just fill in with real
-	// clusters for now...
-	ntfs_debug("Inserting holes is not-implemented yet.  Falling back to "
-			"allocating real clusters instead.");
-skip_sparse:
-	rl = ni->runlist.rl;
-	if (likely(rl)) {
-		/* Seek to the end of the runlist. */
-		while (rl->length)
-			rl++;
-	}
-	/* If this attribute extent is not mapped, map it now. */
-	if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED ||
-			(rl->lcn == LCN_ENOENT && rl > ni->runlist.rl &&
-			(rl-1)->lcn == LCN_RL_NOT_MAPPED))) {
-		if (!rl && !allocated_size)
-			goto first_alloc;
-		rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
-		if (IS_ERR(rl)) {
-			err = PTR_ERR(rl);
-			if (start < 0 || start >= allocated_size)
-				ntfs_error(vol->sb, "Cannot extend allocation "
-						"of inode 0x%lx, attribute "
-						"type 0x%x, because the "
-						"mapping of a runlist "
-						"fragment failed with error "
-						"code %i.", vi->i_ino,
-						(unsigned)le32_to_cpu(ni->type),
-						err);
-			if (err != -ENOMEM)
-				err = -EIO;
-			goto err_out;
-		}
-		ni->runlist.rl = rl;
-		/* Seek to the end of the runlist. */
-		while (rl->length)
-			rl++;
-	}
-	/*
-	 * We now know the runlist of the last extent is mapped and @rl is at
-	 * the end of the runlist.  We want to begin allocating clusters
-	 * starting at the last allocated cluster to reduce fragmentation.  If
-	 * there are no valid LCNs in the attribute we let the cluster
-	 * allocator choose the starting cluster.
-	 */
-	/* If the last LCN is a hole or simillar seek back to last real LCN. */
-	while (rl->lcn < 0 && rl > ni->runlist.rl)
-		rl--;
-first_alloc:
-	// FIXME: Need to implement partial allocations so at least part of the
-	// write can be performed when start >= 0.  (Needed for POSIX write(2)
-	// conformance.)
-	rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits,
-			(new_alloc_size - allocated_size) >>
-			vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ?
-			rl->lcn + rl->length : -1, DATA_ZONE, true);
-	if (IS_ERR(rl2)) {
-		err = PTR_ERR(rl2);
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because the allocation of clusters "
-					"failed with error code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		if (err != -ENOMEM && err != -ENOSPC)
-			err = -EIO;
-		goto err_out;
-	}
-	rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
-	if (IS_ERR(rl)) {
-		err = PTR_ERR(rl);
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because the runlist merge failed "
-					"with error code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		if (err != -ENOMEM)
-			err = -EIO;
-		if (ntfs_cluster_free_from_rl(vol, rl2)) {
-			ntfs_error(vol->sb, "Failed to release allocated "
-					"cluster(s) in error code path.  Run "
-					"chkdsk to recover the lost "
-					"cluster(s).");
-			NVolSetErrors(vol);
-		}
-		ntfs_free(rl2);
-		goto err_out;
-	}
-	ni->runlist.rl = rl;
-	ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size -
-			allocated_size) >> vol->cluster_size_bits);
-	/* Find the runlist element with which the attribute extent starts. */
-	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
-	rl2 = ntfs_rl_find_vcn_nolock(rl, ll);
-	BUG_ON(!rl2);
-	BUG_ON(!rl2->length);
-	BUG_ON(rl2->lcn < LCN_HOLE);
-	mp_rebuilt = false;
-	/* Get the size for the new mapping pairs array for this extent. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
-	if (unlikely(mp_size <= 0)) {
-		err = mp_size;
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because determining the size for the "
-					"mapping pairs failed with error code "
-					"%i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		err = -EIO;
-		goto undo_alloc;
-	}
-	/* Extend the attribute record to fit the bigger mapping pairs array. */
-	attr_len = le32_to_cpu(a->length);
-	err = ntfs_attr_record_resize(m, a, mp_size +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
-	if (unlikely(err)) {
-		BUG_ON(err != -ENOSPC);
-		// TODO: Deal with this by moving this extent to a new mft
-		// record or by starting a new extent in a new mft record,
-		// possibly by extending this extent partially and filling it
-		// and creating a new extent for the remainder, or by making
-		// other attributes non-resident and/or by moving other
-		// attributes out of this mft record.
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Not enough space in the mft "
-					"record for the extended attribute "
-					"record.  This case is not "
-					"implemented yet.");
-		err = -EOPNOTSUPP;
-		goto undo_alloc;
-	}
-	mp_rebuilt = true;
-	/* Generate the mapping pairs array directly into the attr record. */
-	err = ntfs_mapping_pairs_build(vol, (u8*)a +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, rl2, ll, -1, NULL);
-	if (unlikely(err)) {
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because building the mapping pairs "
-					"failed with error code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		err = -EIO;
-		goto undo_alloc;
-	}
-	/* Update the highest_vcn. */
-	a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
-			vol->cluster_size_bits) - 1);
-	/*
-	 * We now have extended the allocated size of the attribute.  Reflect
-	 * this in the ntfs_inode structure and the attribute record.
-	 */
-	if (a->data.non_resident.lowest_vcn) {
-		/*
-		 * We are not in the first attribute extent, switch to it, but
-		 * first ensure the changes will make it to disk later.
-		 */
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		ntfs_attr_reinit_search_ctx(ctx);
-		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-				CASE_SENSITIVE, 0, NULL, 0, ctx);
-		if (unlikely(err))
-			goto restore_undo_alloc;
-		/* @m is not used any more so no need to set it. */
-		a = ctx->attr;
-	}
-	write_lock_irqsave(&ni->size_lock, flags);
-	ni->allocated_size = new_alloc_size;
-	a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
-	/*
-	 * FIXME: This would fail if @ni is a directory, $MFT, or an index,
-	 * since those can have sparse/compressed set.  For example can be
-	 * set compressed even though it is not compressed itself and in that
-	 * case the bit means that files are to be created compressed in the
-	 * directory...  At present this is ok as this code is only called for
-	 * regular files, and only for their $DATA attribute(s).
-	 * FIXME: The calculation is wrong if we created a hole above.  For now
-	 * it does not matter as we never create holes.
-	 */
-	if (NInoSparse(ni) || NInoCompressed(ni)) {
-		ni->itype.compressed.size += new_alloc_size - allocated_size;
-		a->data.non_resident.compressed_size =
-				cpu_to_sle64(ni->itype.compressed.size);
-		vi->i_blocks = ni->itype.compressed.size >> 9;
-	} else
-		vi->i_blocks = new_alloc_size >> 9;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-alloc_done:
-	if (new_data_size >= 0) {
-		BUG_ON(new_data_size <
-				sle64_to_cpu(a->data.non_resident.data_size));
-		a->data.non_resident.data_size = cpu_to_sle64(new_data_size);
-	}
-flush_done:
-	/* Ensure the changes make it to disk. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-done:
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-	ntfs_debug("Done, new_allocated_size 0x%llx.",
-			(unsigned long long)new_alloc_size);
-	return new_alloc_size;
-restore_undo_alloc:
-	if (start < 0 || start >= allocated_size)
-		ntfs_error(vol->sb, "Cannot complete extension of allocation "
-				"of inode 0x%lx, attribute type 0x%x, because "
-				"lookup of first attribute extent failed with "
-				"error code %i.", vi->i_ino,
-				(unsigned)le32_to_cpu(ni->type), err);
-	if (err == -ENOENT)
-		err = -EIO;
-	ntfs_attr_reinit_search_ctx(ctx);
-	if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE,
-			allocated_size >> vol->cluster_size_bits, NULL, 0,
-			ctx)) {
-		ntfs_error(vol->sb, "Failed to find last attribute extent of "
-				"attribute in error code path.  Run chkdsk to "
-				"recover.");
-		write_lock_irqsave(&ni->size_lock, flags);
-		ni->allocated_size = new_alloc_size;
-		/*
-		 * FIXME: This would fail if @ni is a directory...  See above.
-		 * FIXME: The calculation is wrong if we created a hole above.
-		 * For now it does not matter as we never create holes.
-		 */
-		if (NInoSparse(ni) || NInoCompressed(ni)) {
-			ni->itype.compressed.size += new_alloc_size -
-					allocated_size;
-			vi->i_blocks = ni->itype.compressed.size >> 9;
-		} else
-			vi->i_blocks = new_alloc_size >> 9;
-		write_unlock_irqrestore(&ni->size_lock, flags);
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(base_ni);
-		up_write(&ni->runlist.lock);
-		/*
-		 * The only thing that is now wrong is the allocated size of the
-		 * base attribute extent which chkdsk should be able to fix.
-		 */
-		NVolSetErrors(vol);
-		return err;
-	}
-	ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64(
-			(allocated_size >> vol->cluster_size_bits) - 1);
-undo_alloc:
-	ll = allocated_size >> vol->cluster_size_bits;
-	if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) {
-		ntfs_error(vol->sb, "Failed to release allocated cluster(s) "
-				"in error code path.  Run chkdsk to recover "
-				"the lost cluster(s).");
-		NVolSetErrors(vol);
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	/*
-	 * If the runlist truncation fails and/or the search context is no
-	 * longer valid, we cannot resize the attribute record or build the
-	 * mapping pairs array thus we mark the inode bad so that no access to
-	 * the freed clusters can happen.
-	 */
-	if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) {
-		ntfs_error(vol->sb, "Failed to %s in error code path.  Run "
-				"chkdsk to recover.", IS_ERR(m) ?
-				"restore attribute search context" :
-				"truncate attribute runlist");
-		NVolSetErrors(vol);
-	} else if (mp_rebuilt) {
-		if (ntfs_attr_record_resize(m, a, attr_len)) {
-			ntfs_error(vol->sb, "Failed to restore attribute "
-					"record in error code path.  Run "
-					"chkdsk to recover.");
-			NVolSetErrors(vol);
-		} else /* if (success) */ {
-			if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
-					a->data.non_resident.
-					mapping_pairs_offset), attr_len -
-					le16_to_cpu(a->data.non_resident.
-					mapping_pairs_offset), rl2, ll, -1,
-					NULL)) {
-				ntfs_error(vol->sb, "Failed to restore "
-						"mapping pairs array in error "
-						"code path.  Run chkdsk to "
-						"recover.");
-				NVolSetErrors(vol);
-			}
-			flush_dcache_mft_record_page(ctx->ntfs_ino);
-			mark_mft_record_dirty(ctx->ntfs_ino);
-		}
-	}
-err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-conv_err_out:
-	ntfs_debug("Failed.  Returning error code %i.", err);
-	return err;
-}
-
-/**
- * ntfs_attr_set - fill (a part of) an attribute with a byte
- * @ni:		ntfs inode describing the attribute to fill
- * @ofs:	offset inside the attribute at which to start to fill
- * @cnt:	number of bytes to fill
- * @val:	the unsigned 8-bit value with which to fill the attribute
- *
- * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at
- * byte offset @ofs inside the attribute with the constant byte @val.
- *
- * This function is effectively like memset() applied to an ntfs attribute.
- * Note this function actually only operates on the page cache pages belonging
- * to the ntfs attribute and it marks them dirty after doing the memset().
- * Thus it relies on the vm dirty page write code paths to cause the modified
- * pages to be written to the mft record/disk.
- *
- * Return 0 on success and -errno on error.  An error code of -ESPIPE means
- * that @ofs + @cnt were outside the end of the attribute and no write was
- * performed.
- */
-int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
-{
-	ntfs_volume *vol = ni->vol;
-	struct address_space *mapping;
-	struct page *page;
-	u8 *kaddr;
-	pgoff_t idx, end;
-	unsigned start_ofs, end_ofs, size;
-
-	ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.",
-			(long long)ofs, (long long)cnt, val);
-	BUG_ON(ofs < 0);
-	BUG_ON(cnt < 0);
-	if (!cnt)
-		goto done;
-	/*
-	 * FIXME: Compressed and encrypted attributes are not supported when
-	 * writing and we should never have gotten here for them.
-	 */
-	BUG_ON(NInoCompressed(ni));
-	BUG_ON(NInoEncrypted(ni));
-	mapping = VFS_I(ni)->i_mapping;
-	/* Work out the starting index and page offset. */
-	idx = ofs >> PAGE_SHIFT;
-	start_ofs = ofs & ~PAGE_MASK;
-	/* Work out the ending index and page offset. */
-	end = ofs + cnt;
-	end_ofs = end & ~PAGE_MASK;
-	/* If the end is outside the inode size return -ESPIPE. */
-	if (unlikely(end > i_size_read(VFS_I(ni)))) {
-		ntfs_error(vol->sb, "Request exceeds end of attribute.");
-		return -ESPIPE;
-	}
-	end >>= PAGE_SHIFT;
-	/* If there is a first partial page, need to do it the slow way. */
-	if (start_ofs) {
-		page = read_mapping_page(mapping, idx, NULL);
-		if (IS_ERR(page)) {
-			ntfs_error(vol->sb, "Failed to read first partial "
-					"page (error, index 0x%lx).", idx);
-			return PTR_ERR(page);
-		}
-		/*
-		 * If the last page is the same as the first page, need to
-		 * limit the write to the end offset.
-		 */
-		size = PAGE_SIZE;
-		if (idx == end)
-			size = end_ofs;
-		kaddr = kmap_atomic(page);
-		memset(kaddr + start_ofs, val, size - start_ofs);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr);
-		set_page_dirty(page);
-		put_page(page);
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-		if (idx == end)
-			goto done;
-		idx++;
-	}
-	/* Do the whole pages the fast way. */
-	for (; idx < end; idx++) {
-		/* Find or create the current page.  (The page is locked.) */
-		page = grab_cache_page(mapping, idx);
-		if (unlikely(!page)) {
-			ntfs_error(vol->sb, "Insufficient memory to grab "
-					"page (index 0x%lx).", idx);
-			return -ENOMEM;
-		}
-		kaddr = kmap_atomic(page);
-		memset(kaddr, val, PAGE_SIZE);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr);
-		/*
-		 * If the page has buffers, mark them uptodate since buffer
-		 * state and not page state is definitive in 2.6 kernels.
-		 */
-		if (page_has_buffers(page)) {
-			struct buffer_head *bh, *head;
-
-			bh = head = page_buffers(page);
-			do {
-				set_buffer_uptodate(bh);
-			} while ((bh = bh->b_this_page) != head);
-		}
-		/* Now that buffers are uptodate, set the page uptodate, too. */
-		SetPageUptodate(page);
-		/*
-		 * Set the page and all its buffers dirty and mark the inode
-		 * dirty, too.  The VM will write the page later on.
-		 */
-		set_page_dirty(page);
-		/* Finally unlock and release the page. */
-		unlock_page(page);
-		put_page(page);
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-	}
-	/* If there is a last partial page, need to do it the slow way. */
-	if (end_ofs) {
-		page = read_mapping_page(mapping, idx, NULL);
-		if (IS_ERR(page)) {
-			ntfs_error(vol->sb, "Failed to read last partial page "
-					"(error, index 0x%lx).", idx);
-			return PTR_ERR(page);
-		}
-		kaddr = kmap_atomic(page);
-		memset(kaddr, val, end_ofs);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr);
-		set_page_dirty(page);
-		put_page(page);
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-	}
-done:
-	ntfs_debug("Done.");
-	return 0;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
deleted file mode 100644
index fe0890d3d072..000000000000
--- a/fs/ntfs/attrib.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * attrib.h - Defines for attribute handling in NTFS Linux kernel driver.
- *	      Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_ATTRIB_H
-#define _LINUX_NTFS_ATTRIB_H
-
-#include "endian.h"
-#include "types.h"
-#include "layout.h"
-#include "inode.h"
-#include "runlist.h"
-#include "volume.h"
-
-/**
- * ntfs_attr_search_ctx - used in attribute search functions
- * @mrec:	buffer containing mft record to search
- * @attr:	attribute record in @mrec where to begin/continue search
- * @is_first:	if true ntfs_attr_lookup() begins search with @attr, else after
- *
- * Structure must be initialized to zero before the first call to one of the
- * attribute search functions. Initialize @mrec to point to the mft record to
- * search, and @attr to point to the first attribute within @mrec (not necessary
- * if calling the _first() functions), and set @is_first to 'true' (not necessary
- * if calling the _first() functions).
- *
- * If @is_first is 'true', the search begins with @attr. If @is_first is 'false',
- * the search begins after @attr. This is so that, after the first call to one
- * of the search attribute functions, we can call the function again, without
- * any modification of the search context, to automagically get the next
- * matching attribute.
- */
-typedef struct {
-	MFT_RECORD *mrec;
-	ATTR_RECORD *attr;
-	bool is_first;
-	ntfs_inode *ntfs_ino;
-	ATTR_LIST_ENTRY *al_entry;
-	ntfs_inode *base_ntfs_ino;
-	MFT_RECORD *base_mrec;
-	ATTR_RECORD *base_attr;
-} ntfs_attr_search_ctx;
-
-extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn,
-		ntfs_attr_search_ctx *ctx);
-extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
-
-extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
-		const bool write_locked);
-
-extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni,
-		const VCN vcn, ntfs_attr_search_ctx *ctx);
-
-int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
-		const u32 name_len, const IGNORE_CASE_BOOL ic,
-		const VCN lowest_vcn, const u8 *val, const u32 val_len,
-		ntfs_attr_search_ctx *ctx);
-
-extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start,
-		const s64 size, const s64 initialized_size);
-
-static inline s64 ntfs_attr_size(const ATTR_RECORD *a)
-{
-	if (!a->non_resident)
-		return (s64)le32_to_cpu(a->data.resident.value_length);
-	return sle64_to_cpu(a->data.non_resident.data_size);
-}
-
-extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx);
-extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni,
-		MFT_RECORD *mrec);
-extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx);
-
-#ifdef NTFS_RW
-
-extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol,
-		const ATTR_TYPE type, const s64 size);
-extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol,
-		const ATTR_TYPE type);
-extern int ntfs_attr_can_be_resident(const ntfs_volume *vol,
-		const ATTR_TYPE type);
-
-extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
-extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
-		const u32 new_size);
-
-extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size);
-
-extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
-		const s64 new_data_size, const s64 data_start);
-
-extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
-		const u8 val);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_ATTRIB_H */
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
deleted file mode 100644
index 0675b2400873..000000000000
--- a/fs/ntfs/bitmap.c
+++ /dev/null
@@ -1,179 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * bitmap.c - NTFS kernel bitmap handling.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/pagemap.h>
-
-#include "bitmap.h"
-#include "debug.h"
-#include "aops.h"
-#include "ntfs.h"
-
-/**
- * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
- * @vi:			vfs inode describing the bitmap
- * @start_bit:		first bit to set
- * @count:		number of bits to set
- * @value:		value to set the bits to (i.e. 0 or 1)
- * @is_rollback:	if 'true' this is a rollback operation
- *
- * Set @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi to @value, where @value is either 0 or 1.
- *
- * @is_rollback should always be 'false', it is for internal use to rollback
- * errors.  You probably want to use ntfs_bitmap_set_bits_in_run() instead.
- *
- * Return 0 on success and -errno on error.
- */
-int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
-		const s64 count, const u8 value, const bool is_rollback)
-{
-	s64 cnt = count;
-	pgoff_t index, end_index;
-	struct address_space *mapping;
-	struct page *page;
-	u8 *kaddr;
-	int pos, len;
-	u8 bit;
-
-	BUG_ON(!vi);
-	ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, "
-			"value %u.%s", vi->i_ino, (unsigned long long)start_bit,
-			(unsigned long long)cnt, (unsigned int)value,
-			is_rollback ? " (rollback)" : "");
-	BUG_ON(start_bit < 0);
-	BUG_ON(cnt < 0);
-	BUG_ON(value > 1);
-	/*
-	 * Calculate the indices for the pages containing the first and last
-	 * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively.
-	 */
-	index = start_bit >> (3 + PAGE_SHIFT);
-	end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT);
-
-	/* Get the page containing the first bit (@start_bit). */
-	mapping = vi->i_mapping;
-	page = ntfs_map_page(mapping, index);
-	if (IS_ERR(page)) {
-		if (!is_rollback)
-			ntfs_error(vi->i_sb, "Failed to map first page (error "
-					"%li), aborting.", PTR_ERR(page));
-		return PTR_ERR(page);
-	}
-	kaddr = page_address(page);
-
-	/* Set @pos to the position of the byte containing @start_bit. */
-	pos = (start_bit >> 3) & ~PAGE_MASK;
-
-	/* Calculate the position of @start_bit in the first byte. */
-	bit = start_bit & 7;
-
-	/* If the first byte is partial, modify the appropriate bits in it. */
-	if (bit) {
-		u8 *byte = kaddr + pos;
-		while ((bit & 7) && cnt) {
-			cnt--;
-			if (value)
-				*byte |= 1 << bit++;
-			else
-				*byte &= ~(1 << bit++);
-		}
-		/* If we are done, unmap the page and return success. */
-		if (!cnt)
-			goto done;
-
-		/* Update @pos to the new position. */
-		pos++;
-	}
-	/*
-	 * Depending on @value, modify all remaining whole bytes in the page up
-	 * to @cnt.
-	 */
-	len = min_t(s64, cnt >> 3, PAGE_SIZE - pos);
-	memset(kaddr + pos, value ? 0xff : 0, len);
-	cnt -= len << 3;
-
-	/* Update @len to point to the first not-done byte in the page. */
-	if (cnt < 8)
-		len += pos;
-
-	/* If we are not in the last page, deal with all subsequent pages. */
-	while (index < end_index) {
-		BUG_ON(cnt <= 0);
-
-		/* Update @index and get the next page. */
-		flush_dcache_page(page);
-		set_page_dirty(page);
-		ntfs_unmap_page(page);
-		page = ntfs_map_page(mapping, ++index);
-		if (IS_ERR(page))
-			goto rollback;
-		kaddr = page_address(page);
-		/*
-		 * Depending on @value, modify all remaining whole bytes in the
-		 * page up to @cnt.
-		 */
-		len = min_t(s64, cnt >> 3, PAGE_SIZE);
-		memset(kaddr, value ? 0xff : 0, len);
-		cnt -= len << 3;
-	}
-	/*
-	 * The currently mapped page is the last one.  If the last byte is
-	 * partial, modify the appropriate bits in it.  Note, @len is the
-	 * position of the last byte inside the page.
-	 */
-	if (cnt) {
-		u8 *byte;
-
-		BUG_ON(cnt > 7);
-
-		bit = cnt;
-		byte = kaddr + len;
-		while (bit--) {
-			if (value)
-				*byte |= 1 << bit;
-			else
-				*byte &= ~(1 << bit);
-		}
-	}
-done:
-	/* We are done.  Unmap the page and return success. */
-	flush_dcache_page(page);
-	set_page_dirty(page);
-	ntfs_unmap_page(page);
-	ntfs_debug("Done.");
-	return 0;
-rollback:
-	/*
-	 * Current state:
-	 *	- no pages are mapped
-	 *	- @count - @cnt is the number of bits that have been modified
-	 */
-	if (is_rollback)
-		return PTR_ERR(page);
-	if (count != cnt)
-		pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt,
-				value ? 0 : 1, true);
-	else
-		pos = 0;
-	if (!pos) {
-		/* Rollback was successful. */
-		ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
-				"%li), aborting.", PTR_ERR(page));
-	} else {
-		/* Rollback failed. */
-		ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
-				"%li) and rollback failed (error %i).  "
-				"Aborting and leaving inconsistent metadata.  "
-				"Unmount and run chkdsk.", PTR_ERR(page), pos);
-		NVolSetErrors(NTFS_SB(vi->i_sb));
-	}
-	return PTR_ERR(page);
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h
deleted file mode 100644
index 9dd2224ca9c4..000000000000
--- a/fs/ntfs/bitmap.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * bitmap.h - Defines for NTFS kernel bitmap handling.  Part of the Linux-NTFS
- *	      project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_BITMAP_H
-#define _LINUX_NTFS_BITMAP_H
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-
-#include "types.h"
-
-extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
-		const s64 count, const u8 value, const bool is_rollback);
-
-/**
- * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
- * @vi:			vfs inode describing the bitmap
- * @start_bit:		first bit to set
- * @count:		number of bits to set
- * @value:		value to set the bits to (i.e. 0 or 1)
- *
- * Set @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi to @value, where @value is either 0 or 1.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi,
-		const s64 start_bit, const s64 count, const u8 value)
-{
-	return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value,
-			false);
-}
-
-/**
- * ntfs_bitmap_set_run - set a run of bits in a bitmap
- * @vi:		vfs inode describing the bitmap
- * @start_bit:	first bit to set
- * @count:	number of bits to set
- *
- * Set @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit,
-		const s64 count)
-{
-	return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1);
-}
-
-/**
- * ntfs_bitmap_clear_run - clear a run of bits in a bitmap
- * @vi:		vfs inode describing the bitmap
- * @start_bit:	first bit to clear
- * @count:	number of bits to clear
- *
- * Clear @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit,
-		const s64 count)
-{
-	return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0);
-}
-
-/**
- * ntfs_bitmap_set_bit - set a bit in a bitmap
- * @vi:		vfs inode describing the bitmap
- * @bit:	bit to set
- *
- * Set bit @bit in the bitmap described by the vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit)
-{
-	return ntfs_bitmap_set_run(vi, bit, 1);
-}
-
-/**
- * ntfs_bitmap_clear_bit - clear a bit in a bitmap
- * @vi:		vfs inode describing the bitmap
- * @bit:	bit to clear
- *
- * Clear bit @bit in the bitmap described by the vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit)
-{
-	return ntfs_bitmap_clear_run(vi, bit, 1);
-}
-
-#endif /* NTFS_RW */
-
-#endif /* defined _LINUX_NTFS_BITMAP_H */
diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c
deleted file mode 100644
index 3ab6ec96abfe..000000000000
--- a/fs/ntfs/collate.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * collate.c - NTFS kernel collation handling.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#include "collate.h"
-#include "debug.h"
-#include "ntfs.h"
-
-static int ntfs_collate_binary(ntfs_volume *vol,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len)
-{
-	int rc;
-
-	ntfs_debug("Entering.");
-	rc = memcmp(data1, data2, min(data1_len, data2_len));
-	if (!rc && (data1_len != data2_len)) {
-		if (data1_len < data2_len)
-			rc = -1;
-		else
-			rc = 1;
-	}
-	ntfs_debug("Done, returning %i", rc);
-	return rc;
-}
-
-static int ntfs_collate_ntofs_ulong(ntfs_volume *vol,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len)
-{
-	int rc;
-	u32 d1, d2;
-
-	ntfs_debug("Entering.");
-	// FIXME:  We don't really want to bug here.
-	BUG_ON(data1_len != data2_len);
-	BUG_ON(data1_len != 4);
-	d1 = le32_to_cpup(data1);
-	d2 = le32_to_cpup(data2);
-	if (d1 < d2)
-		rc = -1;
-	else {
-		if (d1 == d2)
-			rc = 0;
-		else
-			rc = 1;
-	}
-	ntfs_debug("Done, returning %i", rc);
-	return rc;
-}
-
-typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int,
-		const void *, const int);
-
-static ntfs_collate_func_t ntfs_do_collate0x0[3] = {
-	ntfs_collate_binary,
-	NULL/*ntfs_collate_file_name*/,
-	NULL/*ntfs_collate_unicode_string*/,
-};
-
-static ntfs_collate_func_t ntfs_do_collate0x1[4] = {
-	ntfs_collate_ntofs_ulong,
-	NULL/*ntfs_collate_ntofs_sid*/,
-	NULL/*ntfs_collate_ntofs_security_hash*/,
-	NULL/*ntfs_collate_ntofs_ulongs*/,
-};
-
-/**
- * ntfs_collate - collate two data items using a specified collation rule
- * @vol:	ntfs volume to which the data items belong
- * @cr:		collation rule to use when comparing the items
- * @data1:	first data item to collate
- * @data1_len:	length in bytes of @data1
- * @data2:	second data item to collate
- * @data2_len:	length in bytes of @data2
- *
- * Collate the two data items @data1 and @data2 using the collation rule @cr
- * and return -1, 0, ir 1 if @data1 is found, respectively, to collate before,
- * to match, or to collate after @data2.
- *
- * For speed we use the collation rule @cr as an index into two tables of
- * function pointers to call the appropriate collation function.
- */
-int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len) {
-	int i;
-
-	ntfs_debug("Entering.");
-	/*
-	 * FIXME:  At the moment we only support COLLATION_BINARY and
-	 * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now.
-	 */
-	BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG);
-	i = le32_to_cpu(cr);
-	BUG_ON(i < 0);
-	if (i <= 0x02)
-		return ntfs_do_collate0x0[i](vol, data1, data1_len,
-				data2, data2_len);
-	BUG_ON(i < 0x10);
-	i -= 0x10;
-	if (likely(i <= 3))
-		return ntfs_do_collate0x1[i](vol, data1, data1_len,
-				data2, data2_len);
-	BUG();
-	return 0;
-}
diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h
deleted file mode 100644
index f2255619b4f4..000000000000
--- a/fs/ntfs/collate.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * collate.h - Defines for NTFS kernel collation handling.  Part of the
- *	       Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_COLLATE_H
-#define _LINUX_NTFS_COLLATE_H
-
-#include "types.h"
-#include "volume.h"
-
-static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
-	int i;
-
-	/*
-	 * FIXME:  At the moment we only support COLLATION_BINARY and
-	 * COLLATION_NTOFS_ULONG, so we return false for everything else for
-	 * now.
-	 */
-	if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG))
-		return false;
-	i = le32_to_cpu(cr);
-	if (likely(((i >= 0) && (i <= 0x02)) ||
-			((i >= 0x10) && (i <= 0x13))))
-		return true;
-	return false;
-}
-
-extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len);
-
-#endif /* _LINUX_NTFS_COLLATE_H */
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
deleted file mode 100644
index 761aaa0195d6..000000000000
--- a/fs/ntfs/compress.c
+++ /dev/null
@@ -1,950 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * compress.c - NTFS kernel compressed attributes handling.
- *		Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/blkdev.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include "attrib.h"
-#include "inode.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/**
- * ntfs_compression_constants - enum of constants used in the compression code
- */
-typedef enum {
-	/* Token types and access mask. */
-	NTFS_SYMBOL_TOKEN	=	0,
-	NTFS_PHRASE_TOKEN	=	1,
-	NTFS_TOKEN_MASK		=	1,
-
-	/* Compression sub-block constants. */
-	NTFS_SB_SIZE_MASK	=	0x0fff,
-	NTFS_SB_SIZE		=	0x1000,
-	NTFS_SB_IS_COMPRESSED	=	0x8000,
-
-	/*
-	 * The maximum compression block size is by definition 16 * the cluster
-	 * size, with the maximum supported cluster size being 4kiB. Thus the
-	 * maximum compression buffer size is 64kiB, so we use this when
-	 * initializing the compression buffer.
-	 */
-	NTFS_MAX_CB_SIZE	= 64 * 1024,
-} ntfs_compression_constants;
-
-/*
- * ntfs_compression_buffer - one buffer for the decompression engine
- */
-static u8 *ntfs_compression_buffer;
-
-/*
- * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer
- */
-static DEFINE_SPINLOCK(ntfs_cb_lock);
-
-/**
- * allocate_compression_buffers - allocate the decompression buffers
- *
- * Caller has to hold the ntfs_lock mutex.
- *
- * Return 0 on success or -ENOMEM if the allocations failed.
- */
-int allocate_compression_buffers(void)
-{
-	BUG_ON(ntfs_compression_buffer);
-
-	ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE);
-	if (!ntfs_compression_buffer)
-		return -ENOMEM;
-	return 0;
-}
-
-/**
- * free_compression_buffers - free the decompression buffers
- *
- * Caller has to hold the ntfs_lock mutex.
- */
-void free_compression_buffers(void)
-{
-	BUG_ON(!ntfs_compression_buffer);
-	vfree(ntfs_compression_buffer);
-	ntfs_compression_buffer = NULL;
-}
-
-/**
- * zero_partial_compressed_page - zero out of bounds compressed page region
- */
-static void zero_partial_compressed_page(struct page *page,
-		const s64 initialized_size)
-{
-	u8 *kp = page_address(page);
-	unsigned int kp_ofs;
-
-	ntfs_debug("Zeroing page region outside initialized size.");
-	if (((s64)page->index << PAGE_SHIFT) >= initialized_size) {
-		clear_page(kp);
-		return;
-	}
-	kp_ofs = initialized_size & ~PAGE_MASK;
-	memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs);
-	return;
-}
-
-/**
- * handle_bounds_compressed_page - test for&handle out of bounds compressed page
- */
-static inline void handle_bounds_compressed_page(struct page *page,
-		const loff_t i_size, const s64 initialized_size)
-{
-	if ((page->index >= (initialized_size >> PAGE_SHIFT)) &&
-			(initialized_size < i_size))
-		zero_partial_compressed_page(page, initialized_size);
-	return;
-}
-
-/**
- * ntfs_decompress - decompress a compression block into an array of pages
- * @dest_pages:		destination array of pages
- * @completed_pages:	scratch space to track completed pages
- * @dest_index:		current index into @dest_pages (IN/OUT)
- * @dest_ofs:		current offset within @dest_pages[@dest_index] (IN/OUT)
- * @dest_max_index:	maximum index into @dest_pages (IN)
- * @dest_max_ofs:	maximum offset within @dest_pages[@dest_max_index] (IN)
- * @xpage:		the target page (-1 if none) (IN)
- * @xpage_done:		set to 1 if xpage was completed successfully (IN/OUT)
- * @cb_start:		compression block to decompress (IN)
- * @cb_size:		size of compression block @cb_start in bytes (IN)
- * @i_size:		file size when we started the read (IN)
- * @initialized_size:	initialized file size when we started the read (IN)
- *
- * The caller must have disabled preemption. ntfs_decompress() reenables it when
- * the critical section is finished.
- *
- * This decompresses the compression block @cb_start into the array of
- * destination pages @dest_pages starting at index @dest_index into @dest_pages
- * and at offset @dest_pos into the page @dest_pages[@dest_index].
- *
- * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1.
- * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified.
- *
- * @cb_start is a pointer to the compression block which needs decompressing
- * and @cb_size is the size of @cb_start in bytes (8-64kiB).
- *
- * Return 0 if success or -EOVERFLOW on error in the compressed stream.
- * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was
- * completed during the decompression of the compression block (@cb_start).
- *
- * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up
- * unpredicatbly! You have been warned!
- *
- * Note to hackers: This function may not sleep until it has finished accessing
- * the compression block @cb_start as it is a per-CPU buffer.
- */
-static int ntfs_decompress(struct page *dest_pages[], int completed_pages[],
-		int *dest_index, int *dest_ofs, const int dest_max_index,
-		const int dest_max_ofs, const int xpage, char *xpage_done,
-		u8 *const cb_start, const u32 cb_size, const loff_t i_size,
-		const s64 initialized_size)
-{
-	/*
-	 * Pointers into the compressed data, i.e. the compression block (cb),
-	 * and the therein contained sub-blocks (sb).
-	 */
-	u8 *cb_end = cb_start + cb_size; /* End of cb. */
-	u8 *cb = cb_start;	/* Current position in cb. */
-	u8 *cb_sb_start;	/* Beginning of the current sb in the cb. */
-	u8 *cb_sb_end;		/* End of current sb / beginning of next sb. */
-
-	/* Variables for uncompressed data / destination. */
-	struct page *dp;	/* Current destination page being worked on. */
-	u8 *dp_addr;		/* Current pointer into dp. */
-	u8 *dp_sb_start;	/* Start of current sub-block in dp. */
-	u8 *dp_sb_end;		/* End of current sb in dp (dp_sb_start +
-				   NTFS_SB_SIZE). */
-	u16 do_sb_start;	/* @dest_ofs when starting this sub-block. */
-	u16 do_sb_end;		/* @dest_ofs of end of this sb (do_sb_start +
-				   NTFS_SB_SIZE). */
-
-	/* Variables for tag and token parsing. */
-	u8 tag;			/* Current tag. */
-	int token;		/* Loop counter for the eight tokens in tag. */
-	int nr_completed_pages = 0;
-
-	/* Default error code. */
-	int err = -EOVERFLOW;
-
-	ntfs_debug("Entering, cb_size = 0x%x.", cb_size);
-do_next_sb:
-	ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.",
-			cb - cb_start);
-	/*
-	 * Have we reached the end of the compression block or the end of the
-	 * decompressed data?  The latter can happen for example if the current
-	 * position in the compression block is one byte before its end so the
-	 * first two checks do not detect it.
-	 */
-	if (cb == cb_end || !le16_to_cpup((le16*)cb) ||
-			(*dest_index == dest_max_index &&
-			*dest_ofs == dest_max_ofs)) {
-		int i;
-
-		ntfs_debug("Completed. Returning success (0).");
-		err = 0;
-return_error:
-		/* We can sleep from now on, so we drop lock. */
-		spin_unlock(&ntfs_cb_lock);
-		/* Second stage: finalize completed pages. */
-		if (nr_completed_pages > 0) {
-			for (i = 0; i < nr_completed_pages; i++) {
-				int di = completed_pages[i];
-
-				dp = dest_pages[di];
-				/*
-				 * If we are outside the initialized size, zero
-				 * the out of bounds page range.
-				 */
-				handle_bounds_compressed_page(dp, i_size,
-						initialized_size);
-				flush_dcache_page(dp);
-				kunmap(dp);
-				SetPageUptodate(dp);
-				unlock_page(dp);
-				if (di == xpage)
-					*xpage_done = 1;
-				else
-					put_page(dp);
-				dest_pages[di] = NULL;
-			}
-		}
-		return err;
-	}
-
-	/* Setup offsets for the current sub-block destination. */
-	do_sb_start = *dest_ofs;
-	do_sb_end = do_sb_start + NTFS_SB_SIZE;
-
-	/* Check that we are still within allowed boundaries. */
-	if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs)
-		goto return_overflow;
-
-	/* Does the minimum size of a compressed sb overflow valid range? */
-	if (cb + 6 > cb_end)
-		goto return_overflow;
-
-	/* Setup the current sub-block source pointers and validate range. */
-	cb_sb_start = cb;
-	cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK)
-			+ 3;
-	if (cb_sb_end > cb_end)
-		goto return_overflow;
-
-	/* Get the current destination page. */
-	dp = dest_pages[*dest_index];
-	if (!dp) {
-		/* No page present. Skip decompression of this sub-block. */
-		cb = cb_sb_end;
-
-		/* Advance destination position to next sub-block. */
-		*dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK;
-		if (!*dest_ofs && (++*dest_index > dest_max_index))
-			goto return_overflow;
-		goto do_next_sb;
-	}
-
-	/* We have a valid destination page. Setup the destination pointers. */
-	dp_addr = (u8*)page_address(dp) + do_sb_start;
-
-	/* Now, we are ready to process the current sub-block (sb). */
-	if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) {
-		ntfs_debug("Found uncompressed sub-block.");
-		/* This sb is not compressed, just copy it into destination. */
-
-		/* Advance source position to first data byte. */
-		cb += 2;
-
-		/* An uncompressed sb must be full size. */
-		if (cb_sb_end - cb != NTFS_SB_SIZE)
-			goto return_overflow;
-
-		/* Copy the block and advance the source position. */
-		memcpy(dp_addr, cb, NTFS_SB_SIZE);
-		cb += NTFS_SB_SIZE;
-
-		/* Advance destination position to next sub-block. */
-		*dest_ofs += NTFS_SB_SIZE;
-		if (!(*dest_ofs &= ~PAGE_MASK)) {
-finalize_page:
-			/*
-			 * First stage: add current page index to array of
-			 * completed pages.
-			 */
-			completed_pages[nr_completed_pages++] = *dest_index;
-			if (++*dest_index > dest_max_index)
-				goto return_overflow;
-		}
-		goto do_next_sb;
-	}
-	ntfs_debug("Found compressed sub-block.");
-	/* This sb is compressed, decompress it into destination. */
-
-	/* Setup destination pointers. */
-	dp_sb_start = dp_addr;
-	dp_sb_end = dp_sb_start + NTFS_SB_SIZE;
-
-	/* Forward to the first tag in the sub-block. */
-	cb += 2;
-do_next_tag:
-	if (cb == cb_sb_end) {
-		/* Check if the decompressed sub-block was not full-length. */
-		if (dp_addr < dp_sb_end) {
-			int nr_bytes = do_sb_end - *dest_ofs;
-
-			ntfs_debug("Filling incomplete sub-block with "
-					"zeroes.");
-			/* Zero remainder and update destination position. */
-			memset(dp_addr, 0, nr_bytes);
-			*dest_ofs += nr_bytes;
-		}
-		/* We have finished the current sub-block. */
-		if (!(*dest_ofs &= ~PAGE_MASK))
-			goto finalize_page;
-		goto do_next_sb;
-	}
-
-	/* Check we are still in range. */
-	if (cb > cb_sb_end || dp_addr > dp_sb_end)
-		goto return_overflow;
-
-	/* Get the next tag and advance to first token. */
-	tag = *cb++;
-
-	/* Parse the eight tokens described by the tag. */
-	for (token = 0; token < 8; token++, tag >>= 1) {
-		u16 lg, pt, length, max_non_overlap;
-		register u16 i;
-		u8 *dp_back_addr;
-
-		/* Check if we are done / still in range. */
-		if (cb >= cb_sb_end || dp_addr > dp_sb_end)
-			break;
-
-		/* Determine token type and parse appropriately.*/
-		if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) {
-			/*
-			 * We have a symbol token, copy the symbol across, and
-			 * advance the source and destination positions.
-			 */
-			*dp_addr++ = *cb++;
-			++*dest_ofs;
-
-			/* Continue with the next token. */
-			continue;
-		}
-
-		/*
-		 * We have a phrase token. Make sure it is not the first tag in
-		 * the sb as this is illegal and would confuse the code below.
-		 */
-		if (dp_addr == dp_sb_start)
-			goto return_overflow;
-
-		/*
-		 * Determine the number of bytes to go back (p) and the number
-		 * of bytes to copy (l). We use an optimized algorithm in which
-		 * we first calculate log2(current destination position in sb),
-		 * which allows determination of l and p in O(1) rather than
-		 * O(n). We just need an arch-optimized log2() function now.
-		 */
-		lg = 0;
-		for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1)
-			lg++;
-
-		/* Get the phrase token into i. */
-		pt = le16_to_cpup((le16*)cb);
-
-		/*
-		 * Calculate starting position of the byte sequence in
-		 * the destination using the fact that p = (pt >> (12 - lg)) + 1
-		 * and make sure we don't go too far back.
-		 */
-		dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1;
-		if (dp_back_addr < dp_sb_start)
-			goto return_overflow;
-
-		/* Now calculate the length of the byte sequence. */
-		length = (pt & (0xfff >> lg)) + 3;
-
-		/* Advance destination position and verify it is in range. */
-		*dest_ofs += length;
-		if (*dest_ofs > do_sb_end)
-			goto return_overflow;
-
-		/* The number of non-overlapping bytes. */
-		max_non_overlap = dp_addr - dp_back_addr;
-
-		if (length <= max_non_overlap) {
-			/* The byte sequence doesn't overlap, just copy it. */
-			memcpy(dp_addr, dp_back_addr, length);
-
-			/* Advance destination pointer. */
-			dp_addr += length;
-		} else {
-			/*
-			 * The byte sequence does overlap, copy non-overlapping
-			 * part and then do a slow byte by byte copy for the
-			 * overlapping part. Also, advance the destination
-			 * pointer.
-			 */
-			memcpy(dp_addr, dp_back_addr, max_non_overlap);
-			dp_addr += max_non_overlap;
-			dp_back_addr += max_non_overlap;
-			length -= max_non_overlap;
-			while (length--)
-				*dp_addr++ = *dp_back_addr++;
-		}
-
-		/* Advance source position and continue with the next token. */
-		cb += 2;
-	}
-
-	/* No tokens left in the current tag. Continue with the next tag. */
-	goto do_next_tag;
-
-return_overflow:
-	ntfs_error(NULL, "Failed. Returning -EOVERFLOW.");
-	goto return_error;
-}
-
-/**
- * ntfs_read_compressed_block - read a compressed block into the page cache
- * @page:	locked page in the compression block(s) we need to read
- *
- * When we are called the page has already been verified to be locked and the
- * attribute is known to be non-resident, not encrypted, but compressed.
- *
- * 1. Determine which compression block(s) @page is in.
- * 2. Get hold of all pages corresponding to this/these compression block(s).
- * 3. Read the (first) compression block.
- * 4. Decompress it into the corresponding pages.
- * 5. Throw the compressed data away and proceed to 3. for the next compression
- *    block or return success if no more compression blocks left.
- *
- * Warning: We have to be careful what we do about existing pages. They might
- * have been written to so that we would lose data if we were to just overwrite
- * them with the out-of-date uncompressed data.
- *
- * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at
- * the end of the file I think. We need to detect this case and zero the out
- * of bounds remainder of the page in question and mark it as handled. At the
- * moment we would just return -EIO on such a page. This bug will only become
- * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte
- * clusters so is probably not going to be seen by anyone. Still this should
- * be fixed. (AIA)
- *
- * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in
- * handling sparse and compressed cbs. (AIA)
- *
- * FIXME: At the moment we don't do any zeroing out in the case that
- * initialized_size is less than data_size. This should be safe because of the
- * nature of the compression algorithm used. Just in case we check and output
- * an error message in read inode if the two sizes are not equal for a
- * compressed file. (AIA)
- */
-int ntfs_read_compressed_block(struct page *page)
-{
-	loff_t i_size;
-	s64 initialized_size;
-	struct address_space *mapping = page->mapping;
-	ntfs_inode *ni = NTFS_I(mapping->host);
-	ntfs_volume *vol = ni->vol;
-	struct super_block *sb = vol->sb;
-	runlist_element *rl;
-	unsigned long flags, block_size = sb->s_blocksize;
-	unsigned char block_size_bits = sb->s_blocksize_bits;
-	u8 *cb, *cb_pos, *cb_end;
-	struct buffer_head **bhs;
-	unsigned long offset, index = page->index;
-	u32 cb_size = ni->itype.compressed.block_size;
-	u64 cb_size_mask = cb_size - 1UL;
-	VCN vcn;
-	LCN lcn;
-	/* The first wanted vcn (minimum alignment is PAGE_SIZE). */
-	VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >>
-			vol->cluster_size_bits;
-	/*
-	 * The first vcn after the last wanted vcn (minimum alignment is again
-	 * PAGE_SIZE.
-	 */
-	VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1)
-			& ~cb_size_mask) >> vol->cluster_size_bits;
-	/* Number of compression blocks (cbs) in the wanted vcn range. */
-	unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits
-			>> ni->itype.compressed.block_size_bits;
-	/*
-	 * Number of pages required to store the uncompressed data from all
-	 * compression blocks (cbs) overlapping @page. Due to alignment
-	 * guarantees of start_vcn and end_vcn, no need to round up here.
-	 */
-	unsigned int nr_pages = (end_vcn - start_vcn) <<
-			vol->cluster_size_bits >> PAGE_SHIFT;
-	unsigned int xpage, max_page, cur_page, cur_ofs, i;
-	unsigned int cb_clusters, cb_max_ofs;
-	int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
-	struct page **pages;
-	int *completed_pages;
-	unsigned char xpage_done = 0;
-
-	ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = "
-			"%i.", index, cb_size, nr_pages);
-	/*
-	 * Bad things happen if we get here for anything that is not an
-	 * unnamed $DATA attribute.
-	 */
-	BUG_ON(ni->type != AT_DATA);
-	BUG_ON(ni->name_len);
-
-	pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
-	completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS);
-
-	/* Allocate memory to store the buffer heads we need. */
-	bhs_size = cb_size / block_size * sizeof(struct buffer_head *);
-	bhs = kmalloc(bhs_size, GFP_NOFS);
-
-	if (unlikely(!pages || !bhs || !completed_pages)) {
-		kfree(bhs);
-		kfree(pages);
-		kfree(completed_pages);
-		unlock_page(page);
-		ntfs_error(vol->sb, "Failed to allocate internal buffers.");
-		return -ENOMEM;
-	}
-
-	/*
-	 * We have already been given one page, this is the one we must do.
-	 * Once again, the alignment guarantees keep it simple.
-	 */
-	offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT;
-	xpage = index - offset;
-	pages[xpage] = page;
-	/*
-	 * The remaining pages need to be allocated and inserted into the page
-	 * cache, alignment guarantees keep all the below much simpler. (-8
-	 */
-	read_lock_irqsave(&ni->size_lock, flags);
-	i_size = i_size_read(VFS_I(ni));
-	initialized_size = ni->initialized_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) -
-			offset;
-	/* Is the page fully outside i_size? (truncate in progress) */
-	if (xpage >= max_page) {
-		kfree(bhs);
-		kfree(pages);
-		kfree(completed_pages);
-		zero_user(page, 0, PAGE_SIZE);
-		ntfs_debug("Compressed read outside i_size - truncated?");
-		SetPageUptodate(page);
-		unlock_page(page);
-		return 0;
-	}
-	if (nr_pages < max_page)
-		max_page = nr_pages;
-	for (i = 0; i < max_page; i++, offset++) {
-		if (i != xpage)
-			pages[i] = grab_cache_page_nowait(mapping, offset);
-		page = pages[i];
-		if (page) {
-			/*
-			 * We only (re)read the page if it isn't already read
-			 * in and/or dirty or we would be losing data or at
-			 * least wasting our time.
-			 */
-			if (!PageDirty(page) && (!PageUptodate(page) ||
-					PageError(page))) {
-				ClearPageError(page);
-				kmap(page);
-				continue;
-			}
-			unlock_page(page);
-			put_page(page);
-			pages[i] = NULL;
-		}
-	}
-
-	/*
-	 * We have the runlist, and all the destination pages we need to fill.
-	 * Now read the first compression block.
-	 */
-	cur_page = 0;
-	cur_ofs = 0;
-	cb_clusters = ni->itype.compressed.block_clusters;
-do_next_cb:
-	nr_cbs--;
-	nr_bhs = 0;
-
-	/* Read all cb buffer heads one cluster at a time. */
-	rl = NULL;
-	for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn;
-			vcn++) {
-		bool is_retry = false;
-
-		if (!rl) {
-lock_retry_remap:
-			down_read(&ni->runlist.lock);
-			rl = ni->runlist.rl;
-		}
-		if (likely(rl != NULL)) {
-			/* Seek to element containing target vcn. */
-			while (rl->length && rl[1].vcn <= vcn)
-				rl++;
-			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-		} else
-			lcn = LCN_RL_NOT_MAPPED;
-		ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
-				(unsigned long long)vcn,
-				(unsigned long long)lcn);
-		if (lcn < 0) {
-			/*
-			 * When we reach the first sparse cluster we have
-			 * finished with the cb.
-			 */
-			if (lcn == LCN_HOLE)
-				break;
-			if (is_retry || lcn != LCN_RL_NOT_MAPPED)
-				goto rl_err;
-			is_retry = true;
-			/*
-			 * Attempt to map runlist, dropping lock for the
-			 * duration.
-			 */
-			up_read(&ni->runlist.lock);
-			if (!ntfs_map_runlist(ni, vcn))
-				goto lock_retry_remap;
-			goto map_rl_err;
-		}
-		block = lcn << vol->cluster_size_bits >> block_size_bits;
-		/* Read the lcn from device in chunks of block_size bytes. */
-		max_block = block + (vol->cluster_size >> block_size_bits);
-		do {
-			ntfs_debug("block = 0x%x.", block);
-			if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block))))
-				goto getblk_err;
-			nr_bhs++;
-		} while (++block < max_block);
-	}
-
-	/* Release the lock if we took it. */
-	if (rl)
-		up_read(&ni->runlist.lock);
-
-	/* Setup and initiate io on all buffer heads. */
-	for (i = 0; i < nr_bhs; i++) {
-		struct buffer_head *tbh = bhs[i];
-
-		if (!trylock_buffer(tbh))
-			continue;
-		if (unlikely(buffer_uptodate(tbh))) {
-			unlock_buffer(tbh);
-			continue;
-		}
-		get_bh(tbh);
-		tbh->b_end_io = end_buffer_read_sync;
-		submit_bh(REQ_OP_READ, tbh);
-	}
-
-	/* Wait for io completion on all buffer heads. */
-	for (i = 0; i < nr_bhs; i++) {
-		struct buffer_head *tbh = bhs[i];
-
-		if (buffer_uptodate(tbh))
-			continue;
-		wait_on_buffer(tbh);
-		/*
-		 * We need an optimization barrier here, otherwise we start
-		 * hitting the below fixup code when accessing a loopback
-		 * mounted ntfs partition. This indicates either there is a
-		 * race condition in the loop driver or, more likely, gcc
-		 * overoptimises the code without the barrier and it doesn't
-		 * do the Right Thing(TM).
-		 */
-		barrier();
-		if (unlikely(!buffer_uptodate(tbh))) {
-			ntfs_warning(vol->sb, "Buffer is unlocked but not "
-					"uptodate! Unplugging the disk queue "
-					"and rescheduling.");
-			get_bh(tbh);
-			io_schedule();
-			put_bh(tbh);
-			if (unlikely(!buffer_uptodate(tbh)))
-				goto read_err;
-			ntfs_warning(vol->sb, "Buffer is now uptodate. Good.");
-		}
-	}
-
-	/*
-	 * Get the compression buffer. We must not sleep any more
-	 * until we are finished with it.
-	 */
-	spin_lock(&ntfs_cb_lock);
-	cb = ntfs_compression_buffer;
-
-	BUG_ON(!cb);
-
-	cb_pos = cb;
-	cb_end = cb + cb_size;
-
-	/* Copy the buffer heads into the contiguous buffer. */
-	for (i = 0; i < nr_bhs; i++) {
-		memcpy(cb_pos, bhs[i]->b_data, block_size);
-		cb_pos += block_size;
-	}
-
-	/* Just a precaution. */
-	if (cb_pos + 2 <= cb + cb_size)
-		*(u16*)cb_pos = 0;
-
-	/* Reset cb_pos back to the beginning. */
-	cb_pos = cb;
-
-	/* We now have both source (if present) and destination. */
-	ntfs_debug("Successfully read the compression block.");
-
-	/* The last page and maximum offset within it for the current cb. */
-	cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size;
-	cb_max_ofs = cb_max_page & ~PAGE_MASK;
-	cb_max_page >>= PAGE_SHIFT;
-
-	/* Catch end of file inside a compression block. */
-	if (cb_max_page > max_page)
-		cb_max_page = max_page;
-
-	if (vcn == start_vcn - cb_clusters) {
-		/* Sparse cb, zero out page range overlapping the cb. */
-		ntfs_debug("Found sparse compression block.");
-		/* We can sleep from now on, so we drop lock. */
-		spin_unlock(&ntfs_cb_lock);
-		if (cb_max_ofs)
-			cb_max_page--;
-		for (; cur_page < cb_max_page; cur_page++) {
-			page = pages[cur_page];
-			if (page) {
-				if (likely(!cur_ofs))
-					clear_page(page_address(page));
-				else
-					memset(page_address(page) + cur_ofs, 0,
-							PAGE_SIZE -
-							cur_ofs);
-				flush_dcache_page(page);
-				kunmap(page);
-				SetPageUptodate(page);
-				unlock_page(page);
-				if (cur_page == xpage)
-					xpage_done = 1;
-				else
-					put_page(page);
-				pages[cur_page] = NULL;
-			}
-			cb_pos += PAGE_SIZE - cur_ofs;
-			cur_ofs = 0;
-			if (cb_pos >= cb_end)
-				break;
-		}
-		/* If we have a partial final page, deal with it now. */
-		if (cb_max_ofs && cb_pos < cb_end) {
-			page = pages[cur_page];
-			if (page)
-				memset(page_address(page) + cur_ofs, 0,
-						cb_max_ofs - cur_ofs);
-			/*
-			 * No need to update cb_pos at this stage:
-			 *	cb_pos += cb_max_ofs - cur_ofs;
-			 */
-			cur_ofs = cb_max_ofs;
-		}
-	} else if (vcn == start_vcn) {
-		/* We can't sleep so we need two stages. */
-		unsigned int cur2_page = cur_page;
-		unsigned int cur_ofs2 = cur_ofs;
-		u8 *cb_pos2 = cb_pos;
-
-		ntfs_debug("Found uncompressed compression block.");
-		/* Uncompressed cb, copy it to the destination pages. */
-		/*
-		 * TODO: As a big optimization, we could detect this case
-		 * before we read all the pages and use block_read_full_folio()
-		 * on all full pages instead (we still have to treat partial
-		 * pages especially but at least we are getting rid of the
-		 * synchronous io for the majority of pages.
-		 * Or if we choose not to do the read-ahead/-behind stuff, we
-		 * could just return block_read_full_folio(pages[xpage]) as long
-		 * as PAGE_SIZE <= cb_size.
-		 */
-		if (cb_max_ofs)
-			cb_max_page--;
-		/* First stage: copy data into destination pages. */
-		for (; cur_page < cb_max_page; cur_page++) {
-			page = pages[cur_page];
-			if (page)
-				memcpy(page_address(page) + cur_ofs, cb_pos,
-						PAGE_SIZE - cur_ofs);
-			cb_pos += PAGE_SIZE - cur_ofs;
-			cur_ofs = 0;
-			if (cb_pos >= cb_end)
-				break;
-		}
-		/* If we have a partial final page, deal with it now. */
-		if (cb_max_ofs && cb_pos < cb_end) {
-			page = pages[cur_page];
-			if (page)
-				memcpy(page_address(page) + cur_ofs, cb_pos,
-						cb_max_ofs - cur_ofs);
-			cb_pos += cb_max_ofs - cur_ofs;
-			cur_ofs = cb_max_ofs;
-		}
-		/* We can sleep from now on, so drop lock. */
-		spin_unlock(&ntfs_cb_lock);
-		/* Second stage: finalize pages. */
-		for (; cur2_page < cb_max_page; cur2_page++) {
-			page = pages[cur2_page];
-			if (page) {
-				/*
-				 * If we are outside the initialized size, zero
-				 * the out of bounds page range.
-				 */
-				handle_bounds_compressed_page(page, i_size,
-						initialized_size);
-				flush_dcache_page(page);
-				kunmap(page);
-				SetPageUptodate(page);
-				unlock_page(page);
-				if (cur2_page == xpage)
-					xpage_done = 1;
-				else
-					put_page(page);
-				pages[cur2_page] = NULL;
-			}
-			cb_pos2 += PAGE_SIZE - cur_ofs2;
-			cur_ofs2 = 0;
-			if (cb_pos2 >= cb_end)
-				break;
-		}
-	} else {
-		/* Compressed cb, decompress it into the destination page(s). */
-		unsigned int prev_cur_page = cur_page;
-
-		ntfs_debug("Found compressed compression block.");
-		err = ntfs_decompress(pages, completed_pages, &cur_page,
-				&cur_ofs, cb_max_page, cb_max_ofs, xpage,
-				&xpage_done, cb_pos, cb_size - (cb_pos - cb),
-				i_size, initialized_size);
-		/*
-		 * We can sleep from now on, lock already dropped by
-		 * ntfs_decompress().
-		 */
-		if (err) {
-			ntfs_error(vol->sb, "ntfs_decompress() failed in inode "
-					"0x%lx with error code %i. Skipping "
-					"this compression block.",
-					ni->mft_no, -err);
-			/* Release the unfinished pages. */
-			for (; prev_cur_page < cur_page; prev_cur_page++) {
-				page = pages[prev_cur_page];
-				if (page) {
-					flush_dcache_page(page);
-					kunmap(page);
-					unlock_page(page);
-					if (prev_cur_page != xpage)
-						put_page(page);
-					pages[prev_cur_page] = NULL;
-				}
-			}
-		}
-	}
-
-	/* Release the buffer heads. */
-	for (i = 0; i < nr_bhs; i++)
-		brelse(bhs[i]);
-
-	/* Do we have more work to do? */
-	if (nr_cbs)
-		goto do_next_cb;
-
-	/* We no longer need the list of buffer heads. */
-	kfree(bhs);
-
-	/* Clean up if we have any pages left. Should never happen. */
-	for (cur_page = 0; cur_page < max_page; cur_page++) {
-		page = pages[cur_page];
-		if (page) {
-			ntfs_error(vol->sb, "Still have pages left! "
-					"Terminating them with extreme "
-					"prejudice.  Inode 0x%lx, page index "
-					"0x%lx.", ni->mft_no, page->index);
-			flush_dcache_page(page);
-			kunmap(page);
-			unlock_page(page);
-			if (cur_page != xpage)
-				put_page(page);
-			pages[cur_page] = NULL;
-		}
-	}
-
-	/* We no longer need the list of pages. */
-	kfree(pages);
-	kfree(completed_pages);
-
-	/* If we have completed the requested page, we return success. */
-	if (likely(xpage_done))
-		return 0;
-
-	ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
-			"EOVERFLOW" : (!err ? "EIO" : "unknown error"));
-	return err < 0 ? err : -EIO;
-
-read_err:
-	ntfs_error(vol->sb, "IO error while reading compressed data.");
-	/* Release the buffer heads. */
-	for (i = 0; i < nr_bhs; i++)
-		brelse(bhs[i]);
-	goto err_out;
-
-map_rl_err:
-	ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read "
-			"compression block.");
-	goto err_out;
-
-rl_err:
-	up_read(&ni->runlist.lock);
-	ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read "
-			"compression block.");
-	goto err_out;
-
-getblk_err:
-	up_read(&ni->runlist.lock);
-	ntfs_error(vol->sb, "getblk() failed. Cannot read compression block.");
-
-err_out:
-	kfree(bhs);
-	for (i = cur_page; i < max_page; i++) {
-		page = pages[i];
-		if (page) {
-			flush_dcache_page(page);
-			kunmap(page);
-			unlock_page(page);
-			if (i != xpage)
-				put_page(page);
-		}
-	}
-	kfree(pages);
-	kfree(completed_pages);
-	return -EIO;
-}
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
deleted file mode 100644
index a3c1c5656f8f..000000000000
--- a/fs/ntfs/debug.c
+++ /dev/null
@@ -1,159 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include "debug.h"
-
-/**
- * __ntfs_warning - output a warning to the syslog
- * @function:	name of function outputting the warning
- * @sb:		super block of mounted ntfs filesystem
- * @fmt:	warning string containing format specifications
- * @...:	a variable number of arguments specified in @fmt
- *
- * Outputs a warning to the syslog for the mounted ntfs filesystem described
- * by @sb.
- *
- * @fmt and the corresponding @... is printf style format string containing
- * the warning string and the corresponding format arguments, respectively.
- *
- * @function is the name of the function from which __ntfs_warning is being
- * called.
- *
- * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead
- * as this provides the @function parameter automatically.
- */
-void __ntfs_warning(const char *function, const struct super_block *sb,
-		const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-	int flen = 0;
-
-#ifndef DEBUG
-	if (!printk_ratelimit())
-		return;
-#endif
-	if (function)
-		flen = strlen(function);
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	if (sb)
-		pr_warn("(device %s): %s(): %pV\n",
-			sb->s_id, flen ? function : "", &vaf);
-	else
-		pr_warn("%s(): %pV\n", flen ? function : "", &vaf);
-	va_end(args);
-}
-
-/**
- * __ntfs_error - output an error to the syslog
- * @function:	name of function outputting the error
- * @sb:		super block of mounted ntfs filesystem
- * @fmt:	error string containing format specifications
- * @...:	a variable number of arguments specified in @fmt
- *
- * Outputs an error to the syslog for the mounted ntfs filesystem described
- * by @sb.
- *
- * @fmt and the corresponding @... is printf style format string containing
- * the error string and the corresponding format arguments, respectively.
- *
- * @function is the name of the function from which __ntfs_error is being
- * called.
- *
- * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead
- * as this provides the @function parameter automatically.
- */
-void __ntfs_error(const char *function, const struct super_block *sb,
-		const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-	int flen = 0;
-
-#ifndef DEBUG
-	if (!printk_ratelimit())
-		return;
-#endif
-	if (function)
-		flen = strlen(function);
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	if (sb)
-		pr_err("(device %s): %s(): %pV\n",
-		       sb->s_id, flen ? function : "", &vaf);
-	else
-		pr_err("%s(): %pV\n", flen ? function : "", &vaf);
-	va_end(args);
-}
-
-#ifdef DEBUG
-
-/* If 1, output debug messages, and if 0, don't. */
-int debug_msgs = 0;
-
-void __ntfs_debug(const char *file, int line, const char *function,
-		const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-	int flen = 0;
-
-	if (!debug_msgs)
-		return;
-	if (function)
-		flen = strlen(function);
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf);
-	va_end(args);
-}
-
-/* Dump a runlist. Caller has to provide synchronisation for @rl. */
-void ntfs_debug_dump_runlist(const runlist_element *rl)
-{
-	int i;
-	const char *lcn_str[5] = { "LCN_HOLE         ", "LCN_RL_NOT_MAPPED",
-				   "LCN_ENOENT       ", "LCN_unknown      " };
-
-	if (!debug_msgs)
-		return;
-	pr_debug("Dumping runlist (values in hex):\n");
-	if (!rl) {
-		pr_debug("Run list not present.\n");
-		return;
-	}
-	pr_debug("VCN              LCN               Run length\n");
-	for (i = 0; ; i++) {
-		LCN lcn = (rl + i)->lcn;
-
-		if (lcn < (LCN)0) {
-			int index = -lcn - 1;
-
-			if (index > -LCN_ENOENT - 1)
-				index = 3;
-			pr_debug("%-16Lx %s %-16Lx%s\n",
-					(long long)(rl + i)->vcn, lcn_str[index],
-					(long long)(rl + i)->length,
-					(rl + i)->length ? "" :
-						" (runlist end)");
-		} else
-			pr_debug("%-16Lx %-16Lx  %-16Lx%s\n",
-					(long long)(rl + i)->vcn,
-					(long long)(rl + i)->lcn,
-					(long long)(rl + i)->length,
-					(rl + i)->length ? "" :
-						" (runlist end)");
-		if (!(rl + i)->length)
-			break;
-	}
-}
-
-#endif
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
deleted file mode 100644
index 6fdef388f129..000000000000
--- a/fs/ntfs/debug.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_DEBUG_H
-#define _LINUX_NTFS_DEBUG_H
-
-#include <linux/fs.h>
-
-#include "runlist.h"
-
-#ifdef DEBUG
-
-extern int debug_msgs;
-
-extern __printf(4, 5)
-void __ntfs_debug(const char *file, int line, const char *function,
-		  const char *format, ...);
-/**
- * ntfs_debug - write a debug level message to syslog
- * @f:		a printf format string containing the message
- * @...:	the variables to substitute into @f
- *
- * ntfs_debug() writes a DEBUG level message to the syslog but only if the
- * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP.
- */
-#define ntfs_debug(f, a...)						\
-	__ntfs_debug(__FILE__, __LINE__, __func__, f, ##a)
-
-extern void ntfs_debug_dump_runlist(const runlist_element *rl);
-
-#else	/* !DEBUG */
-
-#define ntfs_debug(fmt, ...)						\
-do {									\
-	if (0)								\
-		no_printk(fmt, ##__VA_ARGS__);				\
-} while (0)
-
-#define ntfs_debug_dump_runlist(rl)	do {} while (0)
-
-#endif	/* !DEBUG */
-
-extern  __printf(3, 4)
-void __ntfs_warning(const char *function, const struct super_block *sb,
-		    const char *fmt, ...);
-#define ntfs_warning(sb, f, a...)	__ntfs_warning(__func__, sb, f, ##a)
-
-extern  __printf(3, 4)
-void __ntfs_error(const char *function, const struct super_block *sb,
-		  const char *fmt, ...);
-#define ntfs_error(sb, f, a...)		__ntfs_error(__func__, sb, f, ##a)
-
-#endif /* _LINUX_NTFS_DEBUG_H */
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
deleted file mode 100644
index 629723a8d712..000000000000
--- a/fs/ntfs/dir.c
+++ /dev/null
@@ -1,1540 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/buffer_head.h>
-#include <linux/slab.h>
-#include <linux/blkdev.h>
-
-#include "dir.h"
-#include "aops.h"
-#include "attrib.h"
-#include "mft.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/*
- * The little endian Unicode string $I30 as a global constant.
- */
-ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
-		cpu_to_le16('3'),	cpu_to_le16('0'), 0 };
-
-/**
- * ntfs_lookup_inode_by_name - find an inode in a directory given its name
- * @dir_ni:	ntfs inode of the directory in which to search for the name
- * @uname:	Unicode name for which to search in the directory
- * @uname_len:	length of the name @uname in Unicode characters
- * @res:	return the found file name if necessary (see below)
- *
- * Look for an inode with name @uname in the directory with inode @dir_ni.
- * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
- * the Unicode name. If the name is found in the directory, the corresponding
- * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
- * is a 64-bit number containing the sequence number.
- *
- * On error, a negative value is returned corresponding to the error code. In
- * particular if the inode is not found -ENOENT is returned. Note that you
- * can't just check the return value for being negative, you have to check the
- * inode number for being negative which you can extract using MREC(return
- * value).
- *
- * Note, @uname_len does not include the (optional) terminating NULL character.
- *
- * Note, we look for a case sensitive match first but we also look for a case
- * insensitive match at the same time. If we find a case insensitive match, we
- * save that for the case that we don't find an exact match, where we return
- * the case insensitive match and setup @res (which we allocate!) with the mft
- * reference, the file name type, length and with a copy of the little endian
- * Unicode file name itself. If we match a file name which is in the DOS name
- * space, we only return the mft reference and file name type in @res.
- * ntfs_lookup() then uses this to find the long file name in the inode itself.
- * This is to avoid polluting the dcache with short file names. We want them to
- * work but we don't care for how quickly one can access them. This also fixes
- * the dcache aliasing issues.
- *
- * Locking:  - Caller must hold i_mutex on the directory.
- *	     - Each page cache page in the index allocation mapping must be
- *	       locked whilst being accessed otherwise we may find a corrupt
- *	       page due to it being under ->writepage at the moment which
- *	       applies the mst protection fixups before writing out and then
- *	       removes them again after the write is complete after which it 
- *	       unlocks the page.
- */
-MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
-		const int uname_len, ntfs_name **res)
-{
-	ntfs_volume *vol = dir_ni->vol;
-	struct super_block *sb = vol->sb;
-	MFT_RECORD *m;
-	INDEX_ROOT *ir;
-	INDEX_ENTRY *ie;
-	INDEX_ALLOCATION *ia;
-	u8 *index_end;
-	u64 mref;
-	ntfs_attr_search_ctx *ctx;
-	int err, rc;
-	VCN vcn, old_vcn;
-	struct address_space *ia_mapping;
-	struct page *page;
-	u8 *kaddr;
-	ntfs_name *name = NULL;
-
-	BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode));
-	BUG_ON(NInoAttr(dir_ni));
-	/* Get hold of the mft record for the directory. */
-	m = map_mft_record(dir_ni);
-	if (IS_ERR(m)) {
-		ntfs_error(sb, "map_mft_record() failed with error code %ld.",
-				-PTR_ERR(m));
-		return ERR_MREF(PTR_ERR(m));
-	}
-	ctx = ntfs_attr_get_search_ctx(dir_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	/* Find the index root attribute in the mft record. */
-	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
-			0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT) {
-			ntfs_error(sb, "Index root attribute missing in "
-					"directory inode 0x%lx.",
-					dir_ni->mft_no);
-			err = -EIO;
-		}
-		goto err_out;
-	}
-	/* Get to the index root value (it's been verified in read_inode). */
-	ir = (INDEX_ROOT*)((u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset));
-	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ir->index +
-			le32_to_cpu(ir->index.entries_offset));
-	/*
-	 * Loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry.
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		/* Bounds checks. */
-		if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->key_length) >
-				index_end)
-			goto dir_err_out;
-		/*
-		 * The last entry cannot contain a name. It can however contain
-		 * a pointer to a child node in the B+tree so we just break out.
-		 */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/*
-		 * We perform a case sensitive comparison and if that matches
-		 * we are done and return the mft reference of the inode (i.e.
-		 * the inode number together with the sequence number for
-		 * consistency checking). We convert it to cpu format before
-		 * returning.
-		 */
-		if (ntfs_are_names_equal(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length,
-				CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
-found_it:
-			/*
-			 * We have a perfect match, so we don't need to care
-			 * about having matched imperfectly before, so we can
-			 * free name and set *res to NULL.
-			 * However, if the perfect match is a short file name,
-			 * we need to signal this through *res, so that
-			 * ntfs_lookup() can fix dcache aliasing issues.
-			 * As an optimization we just reuse an existing
-			 * allocation of *res.
-			 */
-			if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
-				if (!name) {
-					name = kmalloc(sizeof(ntfs_name),
-							GFP_NOFS);
-					if (!name) {
-						err = -ENOMEM;
-						goto err_out;
-					}
-				}
-				name->mref = le64_to_cpu(
-						ie->data.dir.indexed_file);
-				name->type = FILE_NAME_DOS;
-				name->len = 0;
-				*res = name;
-			} else {
-				kfree(name);
-				*res = NULL;
-			}
-			mref = le64_to_cpu(ie->data.dir.indexed_file);
-			ntfs_attr_put_search_ctx(ctx);
-			unmap_mft_record(dir_ni);
-			return mref;
-		}
-		/*
-		 * For a case insensitive mount, we also perform a case
-		 * insensitive comparison (provided the file name is not in the
-		 * POSIX namespace). If the comparison matches, and the name is
-		 * in the WIN32 namespace, we cache the filename in *res so
-		 * that the caller, ntfs_lookup(), can work on it. If the
-		 * comparison matches, and the name is in the DOS namespace, we
-		 * only cache the mft reference and the file name type (we set
-		 * the name length to zero for simplicity).
-		 */
-		if (!NVolCaseSensitive(vol) &&
-				ie->key.file_name.file_name_type &&
-				ntfs_are_names_equal(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length,
-				IGNORE_CASE, vol->upcase, vol->upcase_len)) {
-			int name_size = sizeof(ntfs_name);
-			u8 type = ie->key.file_name.file_name_type;
-			u8 len = ie->key.file_name.file_name_length;
-
-			/* Only one case insensitive matching name allowed. */
-			if (name) {
-				ntfs_error(sb, "Found already allocated name "
-						"in phase 1. Please run chkdsk "
-						"and if that doesn't find any "
-						"errors please report you saw "
-						"this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net.");
-				goto dir_err_out;
-			}
-
-			if (type != FILE_NAME_DOS)
-				name_size += len * sizeof(ntfschar);
-			name = kmalloc(name_size, GFP_NOFS);
-			if (!name) {
-				err = -ENOMEM;
-				goto err_out;
-			}
-			name->mref = le64_to_cpu(ie->data.dir.indexed_file);
-			name->type = type;
-			if (type != FILE_NAME_DOS) {
-				name->len = len;
-				memcpy(name->name, ie->key.file_name.file_name,
-						len * sizeof(ntfschar));
-			} else
-				name->len = 0;
-			*res = name;
-		}
-		/*
-		 * Not a perfect match, need to do full blown collation so we
-		 * know which way in the B+tree we have to go.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				IGNORE_CASE, vol->upcase, vol->upcase_len);
-		/*
-		 * If uname collates before the name of the current entry, there
-		 * is definitely no such name in this index but we might need to
-		 * descend into the B+tree so we just break out of the loop.
-		 */
-		if (rc == -1)
-			break;
-		/* The names are not equal, continue the search. */
-		if (rc)
-			continue;
-		/*
-		 * Names match with case insensitive comparison, now try the
-		 * case sensitive comparison, which is required for proper
-		 * collation.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
-		if (rc == -1)
-			break;
-		if (rc)
-			continue;
-		/*
-		 * Perfect match, this will never happen as the
-		 * ntfs_are_names_equal() call will have gotten a match but we
-		 * still treat it correctly.
-		 */
-		goto found_it;
-	}
-	/*
-	 * We have finished with this index without success. Check for the
-	 * presence of a child node and if not present return -ENOENT, unless
-	 * we have got a matching name cached in name in which case return the
-	 * mft reference associated with it.
-	 */
-	if (!(ie->flags & INDEX_ENTRY_NODE)) {
-		if (name) {
-			ntfs_attr_put_search_ctx(ctx);
-			unmap_mft_record(dir_ni);
-			return name->mref;
-		}
-		ntfs_debug("Entry not found.");
-		err = -ENOENT;
-		goto err_out;
-	} /* Child node present, descend into it. */
-	/* Consistency check: Verify that an index allocation exists. */
-	if (!NInoIndexAllocPresent(dir_ni)) {
-		ntfs_error(sb, "No index allocation attribute but index entry "
-				"requires one. Directory inode 0x%lx is "
-				"corrupt or driver bug.", dir_ni->mft_no);
-		goto err_out;
-	}
-	/* Get the starting vcn of the index_block holding the child node. */
-	vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
-	ia_mapping = VFS_I(dir_ni)->i_mapping;
-	/*
-	 * We are done with the index root and the mft record. Release them,
-	 * otherwise we deadlock with ntfs_map_page().
-	 */
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(dir_ni);
-	m = NULL;
-	ctx = NULL;
-descend_into_child_node:
-	/*
-	 * Convert vcn to index into the index allocation attribute in units
-	 * of PAGE_SIZE and map the page cache page, reading it from
-	 * disk if necessary.
-	 */
-	page = ntfs_map_page(ia_mapping, vcn <<
-			dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
-	if (IS_ERR(page)) {
-		ntfs_error(sb, "Failed to map directory index page, error %ld.",
-				-PTR_ERR(page));
-		err = PTR_ERR(page);
-		goto err_out;
-	}
-	lock_page(page);
-	kaddr = (u8*)page_address(page);
-fast_descend_into_child_node:
-	/* Get to the index allocation block. */
-	ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
-			dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
-	/* Bounds checks. */
-	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
-		ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
-				"inode 0x%lx or driver bug.", dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* Catch multi sector transfer fixup errors. */
-	if (unlikely(!ntfs_is_indx_record(ia->magic))) {
-		ntfs_error(sb, "Directory index record with vcn 0x%llx is "
-				"corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
-				(unsigned long long)vcn, dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
-		ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
-				"different from expected VCN (0x%llx). "
-				"Directory inode 0x%lx is corrupt or driver "
-				"bug.", (unsigned long long)
-				sle64_to_cpu(ia->index_block_vcn),
-				(unsigned long long)vcn, dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
-			dir_ni->itype.index.block_size) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
-				"0x%lx has a size (%u) differing from the "
-				"directory specified size (%u). Directory "
-				"inode is corrupt or driver bug.",
-				(unsigned long long)vcn, dir_ni->mft_no,
-				le32_to_cpu(ia->index.allocated_size) + 0x18,
-				dir_ni->itype.index.block_size);
-		goto unm_err_out;
-	}
-	index_end = (u8*)ia + dir_ni->itype.index.block_size;
-	if (index_end > kaddr + PAGE_SIZE) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
-				"0x%lx crosses page boundary. Impossible! "
-				"Cannot access! This is probably a bug in the "
-				"driver.", (unsigned long long)vcn,
-				dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
-	if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
-		ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
-				"inode 0x%lx exceeds maximum size.",
-				(unsigned long long)vcn, dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ia->index +
-			le32_to_cpu(ia->index.entries_offset));
-	/*
-	 * Iterate similar to above big loop but applied to index buffer, thus
-	 * loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry.
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		/* Bounds check. */
-		if ((u8*)ie < (u8*)ia || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->key_length) >
-				index_end) {
-			ntfs_error(sb, "Index entry out of bounds in "
-					"directory inode 0x%lx.",
-					dir_ni->mft_no);
-			goto unm_err_out;
-		}
-		/*
-		 * The last entry cannot contain a name. It can however contain
-		 * a pointer to a child node in the B+tree so we just break out.
-		 */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/*
-		 * We perform a case sensitive comparison and if that matches
-		 * we are done and return the mft reference of the inode (i.e.
-		 * the inode number together with the sequence number for
-		 * consistency checking). We convert it to cpu format before
-		 * returning.
-		 */
-		if (ntfs_are_names_equal(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length,
-				CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
-found_it2:
-			/*
-			 * We have a perfect match, so we don't need to care
-			 * about having matched imperfectly before, so we can
-			 * free name and set *res to NULL.
-			 * However, if the perfect match is a short file name,
-			 * we need to signal this through *res, so that
-			 * ntfs_lookup() can fix dcache aliasing issues.
-			 * As an optimization we just reuse an existing
-			 * allocation of *res.
-			 */
-			if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
-				if (!name) {
-					name = kmalloc(sizeof(ntfs_name),
-							GFP_NOFS);
-					if (!name) {
-						err = -ENOMEM;
-						goto unm_err_out;
-					}
-				}
-				name->mref = le64_to_cpu(
-						ie->data.dir.indexed_file);
-				name->type = FILE_NAME_DOS;
-				name->len = 0;
-				*res = name;
-			} else {
-				kfree(name);
-				*res = NULL;
-			}
-			mref = le64_to_cpu(ie->data.dir.indexed_file);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			return mref;
-		}
-		/*
-		 * For a case insensitive mount, we also perform a case
-		 * insensitive comparison (provided the file name is not in the
-		 * POSIX namespace). If the comparison matches, and the name is
-		 * in the WIN32 namespace, we cache the filename in *res so
-		 * that the caller, ntfs_lookup(), can work on it. If the
-		 * comparison matches, and the name is in the DOS namespace, we
-		 * only cache the mft reference and the file name type (we set
-		 * the name length to zero for simplicity).
-		 */
-		if (!NVolCaseSensitive(vol) &&
-				ie->key.file_name.file_name_type &&
-				ntfs_are_names_equal(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length,
-				IGNORE_CASE, vol->upcase, vol->upcase_len)) {
-			int name_size = sizeof(ntfs_name);
-			u8 type = ie->key.file_name.file_name_type;
-			u8 len = ie->key.file_name.file_name_length;
-
-			/* Only one case insensitive matching name allowed. */
-			if (name) {
-				ntfs_error(sb, "Found already allocated name "
-						"in phase 2. Please run chkdsk "
-						"and if that doesn't find any "
-						"errors please report you saw "
-						"this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net.");
-				unlock_page(page);
-				ntfs_unmap_page(page);
-				goto dir_err_out;
-			}
-
-			if (type != FILE_NAME_DOS)
-				name_size += len * sizeof(ntfschar);
-			name = kmalloc(name_size, GFP_NOFS);
-			if (!name) {
-				err = -ENOMEM;
-				goto unm_err_out;
-			}
-			name->mref = le64_to_cpu(ie->data.dir.indexed_file);
-			name->type = type;
-			if (type != FILE_NAME_DOS) {
-				name->len = len;
-				memcpy(name->name, ie->key.file_name.file_name,
-						len * sizeof(ntfschar));
-			} else
-				name->len = 0;
-			*res = name;
-		}
-		/*
-		 * Not a perfect match, need to do full blown collation so we
-		 * know which way in the B+tree we have to go.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				IGNORE_CASE, vol->upcase, vol->upcase_len);
-		/*
-		 * If uname collates before the name of the current entry, there
-		 * is definitely no such name in this index but we might need to
-		 * descend into the B+tree so we just break out of the loop.
-		 */
-		if (rc == -1)
-			break;
-		/* The names are not equal, continue the search. */
-		if (rc)
-			continue;
-		/*
-		 * Names match with case insensitive comparison, now try the
-		 * case sensitive comparison, which is required for proper
-		 * collation.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
-		if (rc == -1)
-			break;
-		if (rc)
-			continue;
-		/*
-		 * Perfect match, this will never happen as the
-		 * ntfs_are_names_equal() call will have gotten a match but we
-		 * still treat it correctly.
-		 */
-		goto found_it2;
-	}
-	/*
-	 * We have finished with this index buffer without success. Check for
-	 * the presence of a child node.
-	 */
-	if (ie->flags & INDEX_ENTRY_NODE) {
-		if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
-			ntfs_error(sb, "Index entry with child node found in "
-					"a leaf node in directory inode 0x%lx.",
-					dir_ni->mft_no);
-			goto unm_err_out;
-		}
-		/* Child node present, descend into it. */
-		old_vcn = vcn;
-		vcn = sle64_to_cpup((sle64*)((u8*)ie +
-				le16_to_cpu(ie->length) - 8));
-		if (vcn >= 0) {
-			/* If vcn is in the same page cache page as old_vcn we
-			 * recycle the mapped page. */
-			if (old_vcn << vol->cluster_size_bits >>
-					PAGE_SHIFT == vcn <<
-					vol->cluster_size_bits >>
-					PAGE_SHIFT)
-				goto fast_descend_into_child_node;
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			goto descend_into_child_node;
-		}
-		ntfs_error(sb, "Negative child node vcn in directory inode "
-				"0x%lx.", dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	/*
-	 * No child node present, return -ENOENT, unless we have got a matching
-	 * name cached in name in which case return the mft reference
-	 * associated with it.
-	 */
-	if (name) {
-		unlock_page(page);
-		ntfs_unmap_page(page);
-		return name->mref;
-	}
-	ntfs_debug("Entry not found.");
-	err = -ENOENT;
-unm_err_out:
-	unlock_page(page);
-	ntfs_unmap_page(page);
-err_out:
-	if (!err)
-		err = -EIO;
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(dir_ni);
-	if (name) {
-		kfree(name);
-		*res = NULL;
-	}
-	return ERR_MREF(err);
-dir_err_out:
-	ntfs_error(sb, "Corrupt directory.  Aborting lookup.");
-	goto err_out;
-}
-
-#if 0
-
-// TODO: (AIA)
-// The algorithm embedded in this code will be required for the time when we
-// want to support adding of entries to directories, where we require correct
-// collation of file names in order not to cause corruption of the filesystem.
-
-/**
- * ntfs_lookup_inode_by_name - find an inode in a directory given its name
- * @dir_ni:	ntfs inode of the directory in which to search for the name
- * @uname:	Unicode name for which to search in the directory
- * @uname_len:	length of the name @uname in Unicode characters
- *
- * Look for an inode with name @uname in the directory with inode @dir_ni.
- * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
- * the Unicode name. If the name is found in the directory, the corresponding
- * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
- * is a 64-bit number containing the sequence number.
- *
- * On error, a negative value is returned corresponding to the error code. In
- * particular if the inode is not found -ENOENT is returned. Note that you
- * can't just check the return value for being negative, you have to check the
- * inode number for being negative which you can extract using MREC(return
- * value).
- *
- * Note, @uname_len does not include the (optional) terminating NULL character.
- */
-u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
-		const int uname_len)
-{
-	ntfs_volume *vol = dir_ni->vol;
-	struct super_block *sb = vol->sb;
-	MFT_RECORD *m;
-	INDEX_ROOT *ir;
-	INDEX_ENTRY *ie;
-	INDEX_ALLOCATION *ia;
-	u8 *index_end;
-	u64 mref;
-	ntfs_attr_search_ctx *ctx;
-	int err, rc;
-	IGNORE_CASE_BOOL ic;
-	VCN vcn, old_vcn;
-	struct address_space *ia_mapping;
-	struct page *page;
-	u8 *kaddr;
-
-	/* Get hold of the mft record for the directory. */
-	m = map_mft_record(dir_ni);
-	if (IS_ERR(m)) {
-		ntfs_error(sb, "map_mft_record() failed with error code %ld.",
-				-PTR_ERR(m));
-		return ERR_MREF(PTR_ERR(m));
-	}
-	ctx = ntfs_attr_get_search_ctx(dir_ni, m);
-	if (!ctx) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	/* Find the index root attribute in the mft record. */
-	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
-			0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT) {
-			ntfs_error(sb, "Index root attribute missing in "
-					"directory inode 0x%lx.",
-					dir_ni->mft_no);
-			err = -EIO;
-		}
-		goto err_out;
-	}
-	/* Get to the index root value (it's been verified in read_inode). */
-	ir = (INDEX_ROOT*)((u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset));
-	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ir->index +
-			le32_to_cpu(ir->index.entries_offset));
-	/*
-	 * Loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry.
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		/* Bounds checks. */
-		if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->key_length) >
-				index_end)
-			goto dir_err_out;
-		/*
-		 * The last entry cannot contain a name. It can however contain
-		 * a pointer to a child node in the B+tree so we just break out.
-		 */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/*
-		 * If the current entry has a name type of POSIX, the name is
-		 * case sensitive and not otherwise. This has the effect of us
-		 * not being able to access any POSIX file names which collate
-		 * after the non-POSIX one when they only differ in case, but
-		 * anyone doing screwy stuff like that deserves to burn in
-		 * hell... Doing that kind of stuff on NT4 actually causes
-		 * corruption on the partition even when using SP6a and Linux
-		 * is not involved at all.
-		 */
-		ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
-				CASE_SENSITIVE;
-		/*
-		 * If the names match perfectly, we are done and return the
-		 * mft reference of the inode (i.e. the inode number together
-		 * with the sequence number for consistency checking. We
-		 * convert it to cpu format before returning.
-		 */
-		if (ntfs_are_names_equal(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, ic,
-				vol->upcase, vol->upcase_len)) {
-found_it:
-			mref = le64_to_cpu(ie->data.dir.indexed_file);
-			ntfs_attr_put_search_ctx(ctx);
-			unmap_mft_record(dir_ni);
-			return mref;
-		}
-		/*
-		 * Not a perfect match, need to do full blown collation so we
-		 * know which way in the B+tree we have to go.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				IGNORE_CASE, vol->upcase, vol->upcase_len);
-		/*
-		 * If uname collates before the name of the current entry, there
-		 * is definitely no such name in this index but we might need to
-		 * descend into the B+tree so we just break out of the loop.
-		 */
-		if (rc == -1)
-			break;
-		/* The names are not equal, continue the search. */
-		if (rc)
-			continue;
-		/*
-		 * Names match with case insensitive comparison, now try the
-		 * case sensitive comparison, which is required for proper
-		 * collation.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
-		if (rc == -1)
-			break;
-		if (rc)
-			continue;
-		/*
-		 * Perfect match, this will never happen as the
-		 * ntfs_are_names_equal() call will have gotten a match but we
-		 * still treat it correctly.
-		 */
-		goto found_it;
-	}
-	/*
-	 * We have finished with this index without success. Check for the
-	 * presence of a child node.
-	 */
-	if (!(ie->flags & INDEX_ENTRY_NODE)) {
-		/* No child node, return -ENOENT. */
-		err = -ENOENT;
-		goto err_out;
-	} /* Child node present, descend into it. */
-	/* Consistency check: Verify that an index allocation exists. */
-	if (!NInoIndexAllocPresent(dir_ni)) {
-		ntfs_error(sb, "No index allocation attribute but index entry "
-				"requires one. Directory inode 0x%lx is "
-				"corrupt or driver bug.", dir_ni->mft_no);
-		goto err_out;
-	}
-	/* Get the starting vcn of the index_block holding the child node. */
-	vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
-	ia_mapping = VFS_I(dir_ni)->i_mapping;
-	/*
-	 * We are done with the index root and the mft record. Release them,
-	 * otherwise we deadlock with ntfs_map_page().
-	 */
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(dir_ni);
-	m = NULL;
-	ctx = NULL;
-descend_into_child_node:
-	/*
-	 * Convert vcn to index into the index allocation attribute in units
-	 * of PAGE_SIZE and map the page cache page, reading it from
-	 * disk if necessary.
-	 */
-	page = ntfs_map_page(ia_mapping, vcn <<
-			dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
-	if (IS_ERR(page)) {
-		ntfs_error(sb, "Failed to map directory index page, error %ld.",
-				-PTR_ERR(page));
-		err = PTR_ERR(page);
-		goto err_out;
-	}
-	lock_page(page);
-	kaddr = (u8*)page_address(page);
-fast_descend_into_child_node:
-	/* Get to the index allocation block. */
-	ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
-			dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
-	/* Bounds checks. */
-	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
-		ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
-				"inode 0x%lx or driver bug.", dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* Catch multi sector transfer fixup errors. */
-	if (unlikely(!ntfs_is_indx_record(ia->magic))) {
-		ntfs_error(sb, "Directory index record with vcn 0x%llx is "
-				"corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
-				(unsigned long long)vcn, dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
-		ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
-				"different from expected VCN (0x%llx). "
-				"Directory inode 0x%lx is corrupt or driver "
-				"bug.", (unsigned long long)
-				sle64_to_cpu(ia->index_block_vcn),
-				(unsigned long long)vcn, dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
-			dir_ni->itype.index.block_size) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
-				"0x%lx has a size (%u) differing from the "
-				"directory specified size (%u). Directory "
-				"inode is corrupt or driver bug.",
-				(unsigned long long)vcn, dir_ni->mft_no,
-				le32_to_cpu(ia->index.allocated_size) + 0x18,
-				dir_ni->itype.index.block_size);
-		goto unm_err_out;
-	}
-	index_end = (u8*)ia + dir_ni->itype.index.block_size;
-	if (index_end > kaddr + PAGE_SIZE) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
-				"0x%lx crosses page boundary. Impossible! "
-				"Cannot access! This is probably a bug in the "
-				"driver.", (unsigned long long)vcn,
-				dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
-	if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
-		ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
-				"inode 0x%lx exceeds maximum size.",
-				(unsigned long long)vcn, dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ia->index +
-			le32_to_cpu(ia->index.entries_offset));
-	/*
-	 * Iterate similar to above big loop but applied to index buffer, thus
-	 * loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry.
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		/* Bounds check. */
-		if ((u8*)ie < (u8*)ia || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->key_length) >
-				index_end) {
-			ntfs_error(sb, "Index entry out of bounds in "
-					"directory inode 0x%lx.",
-					dir_ni->mft_no);
-			goto unm_err_out;
-		}
-		/*
-		 * The last entry cannot contain a name. It can however contain
-		 * a pointer to a child node in the B+tree so we just break out.
-		 */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/*
-		 * If the current entry has a name type of POSIX, the name is
-		 * case sensitive and not otherwise. This has the effect of us
-		 * not being able to access any POSIX file names which collate
-		 * after the non-POSIX one when they only differ in case, but
-		 * anyone doing screwy stuff like that deserves to burn in
-		 * hell... Doing that kind of stuff on NT4 actually causes
-		 * corruption on the partition even when using SP6a and Linux
-		 * is not involved at all.
-		 */
-		ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
-				CASE_SENSITIVE;
-		/*
-		 * If the names match perfectly, we are done and return the
-		 * mft reference of the inode (i.e. the inode number together
-		 * with the sequence number for consistency checking. We
-		 * convert it to cpu format before returning.
-		 */
-		if (ntfs_are_names_equal(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, ic,
-				vol->upcase, vol->upcase_len)) {
-found_it2:
-			mref = le64_to_cpu(ie->data.dir.indexed_file);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			return mref;
-		}
-		/*
-		 * Not a perfect match, need to do full blown collation so we
-		 * know which way in the B+tree we have to go.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				IGNORE_CASE, vol->upcase, vol->upcase_len);
-		/*
-		 * If uname collates before the name of the current entry, there
-		 * is definitely no such name in this index but we might need to
-		 * descend into the B+tree so we just break out of the loop.
-		 */
-		if (rc == -1)
-			break;
-		/* The names are not equal, continue the search. */
-		if (rc)
-			continue;
-		/*
-		 * Names match with case insensitive comparison, now try the
-		 * case sensitive comparison, which is required for proper
-		 * collation.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
-		if (rc == -1)
-			break;
-		if (rc)
-			continue;
-		/*
-		 * Perfect match, this will never happen as the
-		 * ntfs_are_names_equal() call will have gotten a match but we
-		 * still treat it correctly.
-		 */
-		goto found_it2;
-	}
-	/*
-	 * We have finished with this index buffer without success. Check for
-	 * the presence of a child node.
-	 */
-	if (ie->flags & INDEX_ENTRY_NODE) {
-		if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
-			ntfs_error(sb, "Index entry with child node found in "
-					"a leaf node in directory inode 0x%lx.",
-					dir_ni->mft_no);
-			goto unm_err_out;
-		}
-		/* Child node present, descend into it. */
-		old_vcn = vcn;
-		vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
-		if (vcn >= 0) {
-			/* If vcn is in the same page cache page as old_vcn we
-			 * recycle the mapped page. */
-			if (old_vcn << vol->cluster_size_bits >>
-					PAGE_SHIFT == vcn <<
-					vol->cluster_size_bits >>
-					PAGE_SHIFT)
-				goto fast_descend_into_child_node;
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			goto descend_into_child_node;
-		}
-		ntfs_error(sb, "Negative child node vcn in directory inode "
-				"0x%lx.", dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* No child node, return -ENOENT. */
-	ntfs_debug("Entry not found.");
-	err = -ENOENT;
-unm_err_out:
-	unlock_page(page);
-	ntfs_unmap_page(page);
-err_out:
-	if (!err)
-		err = -EIO;
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(dir_ni);
-	return ERR_MREF(err);
-dir_err_out:
-	ntfs_error(sb, "Corrupt directory. Aborting lookup.");
-	goto err_out;
-}
-
-#endif
-
-/**
- * ntfs_filldir - ntfs specific filldir method
- * @vol:	current ntfs volume
- * @ndir:	ntfs inode of current directory
- * @ia_page:	page in which the index allocation buffer @ie is in resides
- * @ie:		current index entry
- * @name:	buffer to use for the converted name
- * @actor:	what to feed the entries to
- *
- * Convert the Unicode @name to the loaded NLS and pass it to the @filldir
- * callback.
- *
- * If @ia_page is not NULL it is the locked page containing the index
- * allocation block containing the index entry @ie.
- *
- * Note, we drop (and then reacquire) the page lock on @ia_page across the
- * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup
- * since ntfs_lookup() will lock the same page.  As an optimization, we do not
- * retake the lock if we are returning a non-zero value as ntfs_readdir()
- * would need to drop the lock immediately anyway.
- */
-static inline int ntfs_filldir(ntfs_volume *vol,
-		ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
-		u8 *name, struct dir_context *actor)
-{
-	unsigned long mref;
-	int name_len;
-	unsigned dt_type;
-	FILE_NAME_TYPE_FLAGS name_type;
-
-	name_type = ie->key.file_name.file_name_type;
-	if (name_type == FILE_NAME_DOS) {
-		ntfs_debug("Skipping DOS name space entry.");
-		return 0;
-	}
-	if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) {
-		ntfs_debug("Skipping root directory self reference entry.");
-		return 0;
-	}
-	if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user &&
-			!NVolShowSystemFiles(vol)) {
-		ntfs_debug("Skipping system file.");
-		return 0;
-	}
-	name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name,
-			ie->key.file_name.file_name_length, &name,
-			NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
-	if (name_len <= 0) {
-		ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.",
-				(long long)MREF_LE(ie->data.dir.indexed_file));
-		return 0;
-	}
-	if (ie->key.file_name.file_attributes &
-			FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT)
-		dt_type = DT_DIR;
-	else
-		dt_type = DT_REG;
-	mref = MREF_LE(ie->data.dir.indexed_file);
-	/*
-	 * Drop the page lock otherwise we deadlock with NFS when it calls
-	 * ->lookup since ntfs_lookup() will lock the same page.
-	 */
-	if (ia_page)
-		unlock_page(ia_page);
-	ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
-			"0x%lx, DT_%s.", name, name_len, actor->pos, mref,
-			dt_type == DT_DIR ? "DIR" : "REG");
-	if (!dir_emit(actor, name, name_len, mref, dt_type))
-		return 1;
-	/* Relock the page but not if we are aborting ->readdir. */
-	if (ia_page)
-		lock_page(ia_page);
-	return 0;
-}
-
-/*
- * We use the same basic approach as the old NTFS driver, i.e. we parse the
- * index root entries and then the index allocation entries that are marked
- * as in use in the index bitmap.
- *
- * While this will return the names in random order this doesn't matter for
- * ->readdir but OTOH results in a faster ->readdir.
- *
- * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS
- * parts (e.g. ->f_pos and ->i_size, and it also protects against directory
- * modifications).
- *
- * Locking:  - Caller must hold i_mutex on the directory.
- *	     - Each page cache page in the index allocation mapping must be
- *	       locked whilst being accessed otherwise we may find a corrupt
- *	       page due to it being under ->writepage at the moment which
- *	       applies the mst protection fixups before writing out and then
- *	       removes them again after the write is complete after which it 
- *	       unlocks the page.
- */
-static int ntfs_readdir(struct file *file, struct dir_context *actor)
-{
-	s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
-	loff_t i_size;
-	struct inode *bmp_vi, *vdir = file_inode(file);
-	struct super_block *sb = vdir->i_sb;
-	ntfs_inode *ndir = NTFS_I(vdir);
-	ntfs_volume *vol = NTFS_SB(sb);
-	MFT_RECORD *m;
-	INDEX_ROOT *ir = NULL;
-	INDEX_ENTRY *ie;
-	INDEX_ALLOCATION *ia;
-	u8 *name = NULL;
-	int rc, err, ir_pos, cur_bmp_pos;
-	struct address_space *ia_mapping, *bmp_mapping;
-	struct page *bmp_page = NULL, *ia_page = NULL;
-	u8 *kaddr, *bmp, *index_end;
-	ntfs_attr_search_ctx *ctx;
-
-	ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
-			vdir->i_ino, actor->pos);
-	rc = err = 0;
-	/* Are we at end of dir yet? */
-	i_size = i_size_read(vdir);
-	if (actor->pos >= i_size + vol->mft_record_size)
-		return 0;
-	/* Emulate . and .. for all directories. */
-	if (!dir_emit_dots(file, actor))
-		return 0;
-	m = NULL;
-	ctx = NULL;
-	/*
-	 * Allocate a buffer to store the current name being processed
-	 * converted to format determined by current NLS.
-	 */
-	name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS);
-	if (unlikely(!name)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	/* Are we jumping straight into the index allocation attribute? */
-	if (actor->pos >= vol->mft_record_size)
-		goto skip_index_root;
-	/* Get hold of the mft record for the directory. */
-	m = map_mft_record(ndir);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(ndir, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	/* Get the offset into the index root attribute. */
-	ir_pos = (s64)actor->pos;
-	/* Find the index root attribute in the mft record. */
-	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
-			0, ctx);
-	if (unlikely(err)) {
-		ntfs_error(sb, "Index root attribute missing in directory "
-				"inode 0x%lx.", vdir->i_ino);
-		goto err_out;
-	}
-	/*
-	 * Copy the index root attribute value to a buffer so that we can put
-	 * the search context and unmap the mft record before calling the
-	 * filldir() callback.  We need to do this because of NFSd which calls
-	 * ->lookup() from its filldir callback() and this causes NTFS to
-	 * deadlock as ntfs_lookup() maps the mft record of the directory and
-	 * we have got it mapped here already.  The only solution is for us to
-	 * unmap the mft record here so that a call to ntfs_lookup() is able to
-	 * map the mft record without deadlocking.
-	 */
-	rc = le32_to_cpu(ctx->attr->data.resident.value_length);
-	ir = kmalloc(rc, GFP_NOFS);
-	if (unlikely(!ir)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	/* Copy the index root value (it has been verified in read_inode). */
-	memcpy(ir, (u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset), rc);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(ndir);
-	ctx = NULL;
-	m = NULL;
-	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ir->index +
-			le32_to_cpu(ir->index.entries_offset));
-	/*
-	 * Loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry or until filldir tells us it has had enough
-	 * or signals an error (both covered by the rc test).
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir);
-		/* Bounds checks. */
-		if (unlikely((u8*)ie < (u8*)ir || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->key_length) >
-				index_end))
-			goto err_out;
-		/* The last entry cannot contain a name. */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/* Skip index root entry if continuing previous readdir. */
-		if (ir_pos > (u8*)ie - (u8*)ir)
-			continue;
-		/* Advance the position even if going to skip the entry. */
-		actor->pos = (u8*)ie - (u8*)ir;
-		/* Submit the name to the filldir callback. */
-		rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
-		if (rc) {
-			kfree(ir);
-			goto abort;
-		}
-	}
-	/* We are done with the index root and can free the buffer. */
-	kfree(ir);
-	ir = NULL;
-	/* If there is no index allocation attribute we are finished. */
-	if (!NInoIndexAllocPresent(ndir))
-		goto EOD;
-	/* Advance fpos to the beginning of the index allocation. */
-	actor->pos = vol->mft_record_size;
-skip_index_root:
-	kaddr = NULL;
-	prev_ia_pos = -1LL;
-	/* Get the offset into the index allocation attribute. */
-	ia_pos = (s64)actor->pos - vol->mft_record_size;
-	ia_mapping = vdir->i_mapping;
-	ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
-	bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
-	if (IS_ERR(bmp_vi)) {
-		ntfs_error(sb, "Failed to get bitmap attribute.");
-		err = PTR_ERR(bmp_vi);
-		goto err_out;
-	}
-	bmp_mapping = bmp_vi->i_mapping;
-	/* Get the starting bitmap bit position and sanity check it. */
-	bmp_pos = ia_pos >> ndir->itype.index.block_size_bits;
-	if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) {
-		ntfs_error(sb, "Current index allocation position exceeds "
-				"index bitmap size.");
-		goto iput_err_out;
-	}
-	/* Get the starting bit position in the current bitmap page. */
-	cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1);
-	bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1);
-get_next_bmp_page:
-	ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx",
-			(unsigned long long)bmp_pos >> (3 + PAGE_SHIFT),
-			(unsigned long long)bmp_pos &
-			(unsigned long long)((PAGE_SIZE * 8) - 1));
-	bmp_page = ntfs_map_page(bmp_mapping,
-			bmp_pos >> (3 + PAGE_SHIFT));
-	if (IS_ERR(bmp_page)) {
-		ntfs_error(sb, "Reading index bitmap failed.");
-		err = PTR_ERR(bmp_page);
-		bmp_page = NULL;
-		goto iput_err_out;
-	}
-	bmp = (u8*)page_address(bmp_page);
-	/* Find next index block in use. */
-	while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) {
-find_next_index_buffer:
-		cur_bmp_pos++;
-		/*
-		 * If we have reached the end of the bitmap page, get the next
-		 * page, and put away the old one.
-		 */
-		if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) {
-			ntfs_unmap_page(bmp_page);
-			bmp_pos += PAGE_SIZE * 8;
-			cur_bmp_pos = 0;
-			goto get_next_bmp_page;
-		}
-		/* If we have reached the end of the bitmap, we are done. */
-		if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size))
-			goto unm_EOD;
-		ia_pos = (bmp_pos + cur_bmp_pos) <<
-				ndir->itype.index.block_size_bits;
-	}
-	ntfs_debug("Handling index buffer 0x%llx.",
-			(unsigned long long)bmp_pos + cur_bmp_pos);
-	/* If the current index buffer is in the same page we reuse the page. */
-	if ((prev_ia_pos & (s64)PAGE_MASK) !=
-			(ia_pos & (s64)PAGE_MASK)) {
-		prev_ia_pos = ia_pos;
-		if (likely(ia_page != NULL)) {
-			unlock_page(ia_page);
-			ntfs_unmap_page(ia_page);
-		}
-		/*
-		 * Map the page cache page containing the current ia_pos,
-		 * reading it from disk if necessary.
-		 */
-		ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT);
-		if (IS_ERR(ia_page)) {
-			ntfs_error(sb, "Reading index allocation data failed.");
-			err = PTR_ERR(ia_page);
-			ia_page = NULL;
-			goto err_out;
-		}
-		lock_page(ia_page);
-		kaddr = (u8*)page_address(ia_page);
-	}
-	/* Get the current index buffer. */
-	ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK &
-					  ~(s64)(ndir->itype.index.block_size - 1)));
-	/* Bounds checks. */
-	if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) {
-		ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
-				"inode 0x%lx or driver bug.", vdir->i_ino);
-		goto err_out;
-	}
-	/* Catch multi sector transfer fixup errors. */
-	if (unlikely(!ntfs_is_indx_record(ia->magic))) {
-		ntfs_error(sb, "Directory index record with vcn 0x%llx is "
-				"corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
-				(unsigned long long)ia_pos >>
-				ndir->itype.index.vcn_size_bits, vdir->i_ino);
-		goto err_out;
-	}
-	if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos &
-			~(s64)(ndir->itype.index.block_size - 1)) >>
-			ndir->itype.index.vcn_size_bits)) {
-		ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
-				"different from expected VCN (0x%llx). "
-				"Directory inode 0x%lx is corrupt or driver "
-				"bug. ", (unsigned long long)
-				sle64_to_cpu(ia->index_block_vcn),
-				(unsigned long long)ia_pos >>
-				ndir->itype.index.vcn_size_bits, vdir->i_ino);
-		goto err_out;
-	}
-	if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 !=
-			ndir->itype.index.block_size)) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
-				"0x%lx has a size (%u) differing from the "
-				"directory specified size (%u). Directory "
-				"inode is corrupt or driver bug.",
-				(unsigned long long)ia_pos >>
-				ndir->itype.index.vcn_size_bits, vdir->i_ino,
-				le32_to_cpu(ia->index.allocated_size) + 0x18,
-				ndir->itype.index.block_size);
-		goto err_out;
-	}
-	index_end = (u8*)ia + ndir->itype.index.block_size;
-	if (unlikely(index_end > kaddr + PAGE_SIZE)) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
-				"0x%lx crosses page boundary. Impossible! "
-				"Cannot access! This is probably a bug in the "
-				"driver.", (unsigned long long)ia_pos >>
-				ndir->itype.index.vcn_size_bits, vdir->i_ino);
-		goto err_out;
-	}
-	ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1);
-	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
-	if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) {
-		ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
-				"inode 0x%lx exceeds maximum size.",
-				(unsigned long long)ia_pos >>
-				ndir->itype.index.vcn_size_bits, vdir->i_ino);
-		goto err_out;
-	}
-	/* The first index entry in this index buffer. */
-	ie = (INDEX_ENTRY*)((u8*)&ia->index +
-			le32_to_cpu(ia->index.entries_offset));
-	/*
-	 * Loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry or until filldir tells us it has had enough
-	 * or signals an error (both covered by the rc test).
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		ntfs_debug("In index allocation, offset 0x%llx.",
-				(unsigned long long)ia_start +
-				(unsigned long long)((u8*)ie - (u8*)ia));
-		/* Bounds checks. */
-		if (unlikely((u8*)ie < (u8*)ia || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->key_length) >
-				index_end))
-			goto err_out;
-		/* The last entry cannot contain a name. */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/* Skip index block entry if continuing previous readdir. */
-		if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
-			continue;
-		/* Advance the position even if going to skip the entry. */
-		actor->pos = (u8*)ie - (u8*)ia +
-				(sle64_to_cpu(ia->index_block_vcn) <<
-				ndir->itype.index.vcn_size_bits) +
-				vol->mft_record_size;
-		/*
-		 * Submit the name to the @filldir callback.  Note,
-		 * ntfs_filldir() drops the lock on @ia_page but it retakes it
-		 * before returning, unless a non-zero value is returned in
-		 * which case the page is left unlocked.
-		 */
-		rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
-		if (rc) {
-			/* @ia_page is already unlocked in this case. */
-			ntfs_unmap_page(ia_page);
-			ntfs_unmap_page(bmp_page);
-			iput(bmp_vi);
-			goto abort;
-		}
-	}
-	goto find_next_index_buffer;
-unm_EOD:
-	if (ia_page) {
-		unlock_page(ia_page);
-		ntfs_unmap_page(ia_page);
-	}
-	ntfs_unmap_page(bmp_page);
-	iput(bmp_vi);
-EOD:
-	/* We are finished, set fpos to EOD. */
-	actor->pos = i_size + vol->mft_record_size;
-abort:
-	kfree(name);
-	return 0;
-err_out:
-	if (bmp_page) {
-		ntfs_unmap_page(bmp_page);
-iput_err_out:
-		iput(bmp_vi);
-	}
-	if (ia_page) {
-		unlock_page(ia_page);
-		ntfs_unmap_page(ia_page);
-	}
-	kfree(ir);
-	kfree(name);
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(ndir);
-	if (!err)
-		err = -EIO;
-	ntfs_debug("Failed. Returning error code %i.", -err);
-	return err;
-}
-
-/**
- * ntfs_dir_open - called when an inode is about to be opened
- * @vi:		inode to be opened
- * @filp:	file structure describing the inode
- *
- * Limit directory size to the page cache limit on architectures where unsigned
- * long is 32-bits. This is the most we can do for now without overflowing the
- * page cache page index. Doing it this way means we don't run into problems
- * because of existing too large directories. It would be better to allow the
- * user to read the accessible part of the directory but I doubt very much
- * anyone is going to hit this check on a 32-bit architecture, so there is no
- * point in adding the extra complexity required to support this.
- *
- * On 64-bit architectures, the check is hopefully optimized away by the
- * compiler.
- */
-static int ntfs_dir_open(struct inode *vi, struct file *filp)
-{
-	if (sizeof(unsigned long) < 8) {
-		if (i_size_read(vi) > MAX_LFS_FILESIZE)
-			return -EFBIG;
-	}
-	return 0;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_dir_fsync - sync a directory to disk
- * @filp:	directory to be synced
- * @start:	offset in bytes of the beginning of data range to sync
- * @end:	offset in bytes of the end of data range (inclusive)
- * @datasync:	if non-zero only flush user data and not metadata
- *
- * Data integrity sync of a directory to disk.  Used for fsync, fdatasync, and
- * msync system calls.  This function is based on file.c::ntfs_file_fsync().
- *
- * Write the mft record and all associated extent mft records as well as the
- * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device.
- *
- * If @datasync is true, we do not wait on the inode(s) to be written out
- * but we always wait on the page cache pages to be written out.
- *
- * Note: In the past @filp could be NULL so we ignore it as we don't need it
- * anyway.
- *
- * Locking: Caller must hold i_mutex on the inode.
- *
- * TODO: We should probably also write all attribute/index inodes associated
- * with this inode but since we have no simple way of getting to them we ignore
- * this problem for now.  We do write the $BITMAP attribute if it is present
- * which is the important one for a directory so things are not too bad.
- */
-static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
-			  int datasync)
-{
-	struct inode *bmp_vi, *vi = filp->f_mapping->host;
-	int err, ret;
-	ntfs_attr na;
-
-	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
-
-	err = file_write_and_wait_range(filp, start, end);
-	if (err)
-		return err;
-	inode_lock(vi);
-
-	BUG_ON(!S_ISDIR(vi->i_mode));
-	/* If the bitmap attribute inode is in memory sync it, too. */
-	na.mft_no = vi->i_ino;
-	na.type = AT_BITMAP;
-	na.name = I30;
-	na.name_len = 4;
-	bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na);
-	if (bmp_vi) {
- 		write_inode_now(bmp_vi, !datasync);
-		iput(bmp_vi);
-	}
-	ret = __ntfs_write_inode(vi, 1);
-	write_inode_now(vi, !datasync);
-	err = sync_blockdev(vi->i_sb->s_bdev);
-	if (unlikely(err && !ret))
-		ret = err;
-	if (likely(!ret))
-		ntfs_debug("Done.");
-	else
-		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
-				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
-	inode_unlock(vi);
-	return ret;
-}
-
-#endif /* NTFS_RW */
-
-WRAP_DIR_ITER(ntfs_readdir) // FIXME!
-const struct file_operations ntfs_dir_ops = {
-	.llseek		= generic_file_llseek,	/* Seek inside directory. */
-	.read		= generic_read_dir,	/* Return -EISDIR. */
-	.iterate_shared	= shared_ntfs_readdir,	/* Read directory contents. */
-#ifdef NTFS_RW
-	.fsync		= ntfs_dir_fsync,	/* Sync a directory to disk. */
-#endif /* NTFS_RW */
-	/*.ioctl	= ,*/			/* Perform function on the
-						   mounted filesystem. */
-	.open		= ntfs_dir_open,	/* Open directory. */
-};
diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h
deleted file mode 100644
index 0e326753df40..000000000000
--- a/fs/ntfs/dir.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of
- *	   the Linux-NTFS project.
- *
- * Copyright (c) 2002-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_DIR_H
-#define _LINUX_NTFS_DIR_H
-
-#include "layout.h"
-#include "inode.h"
-#include "types.h"
-
-/*
- * ntfs_name is used to return the file name to the caller of
- * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup())
- * to be able to deal with dcache aliasing issues.
- */
-typedef struct {
-	MFT_REF mref;
-	FILE_NAME_TYPE_FLAGS type;
-	u8 len;
-	ntfschar name[0];
-} __attribute__ ((__packed__)) ntfs_name;
-
-/* The little endian Unicode string $I30 as a global constant. */
-extern ntfschar I30[5];
-
-extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni,
-		const ntfschar *uname, const int uname_len, ntfs_name **res);
-
-#endif /* _LINUX_NTFS_FS_DIR_H */
diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h
deleted file mode 100644
index f30c139bf9ae..000000000000
--- a/fs/ntfs/endian.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * endian.h - Defines for endianness handling in NTFS Linux kernel driver.
- *	      Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_ENDIAN_H
-#define _LINUX_NTFS_ENDIAN_H
-
-#include <asm/byteorder.h>
-#include "types.h"
-
-/*
- * Signed endianness conversion functions.
- */
-
-static inline s16 sle16_to_cpu(sle16 x)
-{
-	return le16_to_cpu((__force le16)x);
-}
-
-static inline s32 sle32_to_cpu(sle32 x)
-{
-	return le32_to_cpu((__force le32)x);
-}
-
-static inline s64 sle64_to_cpu(sle64 x)
-{
-	return le64_to_cpu((__force le64)x);
-}
-
-static inline s16 sle16_to_cpup(sle16 *x)
-{
-	return le16_to_cpu(*(__force le16*)x);
-}
-
-static inline s32 sle32_to_cpup(sle32 *x)
-{
-	return le32_to_cpu(*(__force le32*)x);
-}
-
-static inline s64 sle64_to_cpup(sle64 *x)
-{
-	return le64_to_cpu(*(__force le64*)x);
-}
-
-static inline sle16 cpu_to_sle16(s16 x)
-{
-	return (__force sle16)cpu_to_le16(x);
-}
-
-static inline sle32 cpu_to_sle32(s32 x)
-{
-	return (__force sle32)cpu_to_le32(x);
-}
-
-static inline sle64 cpu_to_sle64(s64 x)
-{
-	return (__force sle64)cpu_to_le64(x);
-}
-
-static inline sle16 cpu_to_sle16p(s16 *x)
-{
-	return (__force sle16)cpu_to_le16(*x);
-}
-
-static inline sle32 cpu_to_sle32p(s32 *x)
-{
-	return (__force sle32)cpu_to_le32(*x);
-}
-
-static inline sle64 cpu_to_sle64p(s64 *x)
-{
-	return (__force sle64)cpu_to_le64(*x);
-}
-
-#endif /* _LINUX_NTFS_ENDIAN_H */
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
deleted file mode 100644
index 297c0b9db621..000000000000
--- a/fs/ntfs/file.c
+++ /dev/null
@@ -1,1997 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
- */
-
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/pagevec.h>
-#include <linux/sched/signal.h>
-#include <linux/swap.h>
-#include <linux/uio.h>
-#include <linux/writeback.h>
-
-#include <asm/page.h>
-#include <linux/uaccess.h>
-
-#include "attrib.h"
-#include "bitmap.h"
-#include "inode.h"
-#include "debug.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "ntfs.h"
-
-/**
- * ntfs_file_open - called when an inode is about to be opened
- * @vi:		inode to be opened
- * @filp:	file structure describing the inode
- *
- * Limit file size to the page cache limit on architectures where unsigned long
- * is 32-bits. This is the most we can do for now without overflowing the page
- * cache page index. Doing it this way means we don't run into problems because
- * of existing too large files. It would be better to allow the user to read
- * the beginning of the file but I doubt very much anyone is going to hit this
- * check on a 32-bit architecture, so there is no point in adding the extra
- * complexity required to support this.
- *
- * On 64-bit architectures, the check is hopefully optimized away by the
- * compiler.
- *
- * After the check passes, just call generic_file_open() to do its work.
- */
-static int ntfs_file_open(struct inode *vi, struct file *filp)
-{
-	if (sizeof(unsigned long) < 8) {
-		if (i_size_read(vi) > MAX_LFS_FILESIZE)
-			return -EOVERFLOW;
-	}
-	return generic_file_open(vi, filp);
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_attr_extend_initialized - extend the initialized size of an attribute
- * @ni:			ntfs inode of the attribute to extend
- * @new_init_size:	requested new initialized size in bytes
- *
- * Extend the initialized size of an attribute described by the ntfs inode @ni
- * to @new_init_size bytes.  This involves zeroing any non-sparse space between
- * the old initialized size and @new_init_size both in the page cache and on
- * disk (if relevant complete pages are already uptodate in the page cache then
- * these are simply marked dirty).
- *
- * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
- * in the resident attribute case, it is tied to the initialized size and, in
- * the non-resident attribute case, it may not fall below the initialized size.
- *
- * Note that if the attribute is resident, we do not need to touch the page
- * cache at all.  This is because if the page cache page is not uptodate we
- * bring it uptodate later, when doing the write to the mft record since we
- * then already have the page mapped.  And if the page is uptodate, the
- * non-initialized region will already have been zeroed when the page was
- * brought uptodate and the region may in fact already have been overwritten
- * with new data via mmap() based writes, so we cannot just zero it.  And since
- * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
- * is unspecified, we choose not to do zeroing and thus we do not need to touch
- * the page at all.  For a more detailed explanation see ntfs_truncate() in
- * fs/ntfs/inode.c.
- *
- * Return 0 on success and -errno on error.  In the case that an error is
- * encountered it is possible that the initialized size will already have been
- * incremented some way towards @new_init_size but it is guaranteed that if
- * this is the case, the necessary zeroing will also have happened and that all
- * metadata is self-consistent.
- *
- * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
- *	    held by the caller.
- */
-static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
-{
-	s64 old_init_size;
-	loff_t old_i_size;
-	pgoff_t index, end_index;
-	unsigned long flags;
-	struct inode *vi = VFS_I(ni);
-	ntfs_inode *base_ni;
-	MFT_RECORD *m = NULL;
-	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx = NULL;
-	struct address_space *mapping;
-	struct page *page = NULL;
-	u8 *kattr;
-	int err;
-	u32 attr_len;
-
-	read_lock_irqsave(&ni->size_lock, flags);
-	old_init_size = ni->initialized_size;
-	old_i_size = i_size_read(vi);
-	BUG_ON(new_init_size > ni->allocated_size);
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
-			"old_initialized_size 0x%llx, "
-			"new_initialized_size 0x%llx, i_size 0x%llx.",
-			vi->i_ino, (unsigned)le32_to_cpu(ni->type),
-			(unsigned long long)old_init_size,
-			(unsigned long long)new_init_size, old_i_size);
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	/* Use goto to reduce indentation and we need the label below anyway. */
-	if (NInoNonResident(ni))
-		goto do_non_resident_extend;
-	BUG_ON(old_init_size != old_i_size);
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	BUG_ON(a->non_resident);
-	/* The total length of the attribute value. */
-	attr_len = le32_to_cpu(a->data.resident.value_length);
-	BUG_ON(old_i_size != (loff_t)attr_len);
-	/*
-	 * Do the zeroing in the mft record and update the attribute size in
-	 * the mft record.
-	 */
-	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
-	memset(kattr + attr_len, 0, new_init_size - attr_len);
-	a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
-	/* Finally, update the sizes in the vfs and ntfs inodes. */
-	write_lock_irqsave(&ni->size_lock, flags);
-	i_size_write(vi, new_init_size);
-	ni->initialized_size = new_init_size;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-	goto done;
-do_non_resident_extend:
-	/*
-	 * If the new initialized size @new_init_size exceeds the current file
-	 * size (vfs inode->i_size), we need to extend the file size to the
-	 * new initialized size.
-	 */
-	if (new_init_size > old_i_size) {
-		m = map_mft_record(base_ni);
-		if (IS_ERR(m)) {
-			err = PTR_ERR(m);
-			m = NULL;
-			goto err_out;
-		}
-		ctx = ntfs_attr_get_search_ctx(base_ni, m);
-		if (unlikely(!ctx)) {
-			err = -ENOMEM;
-			goto err_out;
-		}
-		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-				CASE_SENSITIVE, 0, NULL, 0, ctx);
-		if (unlikely(err)) {
-			if (err == -ENOENT)
-				err = -EIO;
-			goto err_out;
-		}
-		m = ctx->mrec;
-		a = ctx->attr;
-		BUG_ON(!a->non_resident);
-		BUG_ON(old_i_size != (loff_t)
-				sle64_to_cpu(a->data.non_resident.data_size));
-		a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		/* Update the file size in the vfs inode. */
-		i_size_write(vi, new_init_size);
-		ntfs_attr_put_search_ctx(ctx);
-		ctx = NULL;
-		unmap_mft_record(base_ni);
-		m = NULL;
-	}
-	mapping = vi->i_mapping;
-	index = old_init_size >> PAGE_SHIFT;
-	end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	do {
-		/*
-		 * Read the page.  If the page is not present, this will zero
-		 * the uninitialized regions for us.
-		 */
-		page = read_mapping_page(mapping, index, NULL);
-		if (IS_ERR(page)) {
-			err = PTR_ERR(page);
-			goto init_err_out;
-		}
-		/*
-		 * Update the initialized size in the ntfs inode.  This is
-		 * enough to make ntfs_writepage() work.
-		 */
-		write_lock_irqsave(&ni->size_lock, flags);
-		ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT;
-		if (ni->initialized_size > new_init_size)
-			ni->initialized_size = new_init_size;
-		write_unlock_irqrestore(&ni->size_lock, flags);
-		/* Set the page dirty so it gets written out. */
-		set_page_dirty(page);
-		put_page(page);
-		/*
-		 * Play nice with the vm and the rest of the system.  This is
-		 * very much needed as we can potentially be modifying the
-		 * initialised size from a very small value to a really huge
-		 * value, e.g.
-		 *	f = open(somefile, O_TRUNC);
-		 *	truncate(f, 10GiB);
-		 *	seek(f, 10GiB);
-		 *	write(f, 1);
-		 * And this would mean we would be marking dirty hundreds of
-		 * thousands of pages or as in the above example more than
-		 * two and a half million pages!
-		 *
-		 * TODO: For sparse pages could optimize this workload by using
-		 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit.  This
-		 * would be set in read_folio for sparse pages and here we would
-		 * not need to mark dirty any pages which have this bit set.
-		 * The only caveat is that we have to clear the bit everywhere
-		 * where we allocate any clusters that lie in the page or that
-		 * contain the page.
-		 *
-		 * TODO: An even greater optimization would be for us to only
-		 * call read_folio() on pages which are not in sparse regions as
-		 * determined from the runlist.  This would greatly reduce the
-		 * number of pages we read and make dirty in the case of sparse
-		 * files.
-		 */
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-	} while (++index < end_index);
-	read_lock_irqsave(&ni->size_lock, flags);
-	BUG_ON(ni->initialized_size != new_init_size);
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	/* Now bring in sync the initialized_size in the mft record. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		goto init_err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto init_err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto init_err_out;
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	BUG_ON(!a->non_resident);
-	a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
-done:
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
-			(unsigned long long)new_init_size, i_size_read(vi));
-	return 0;
-init_err_out:
-	write_lock_irqsave(&ni->size_lock, flags);
-	ni->initialized_size = old_init_size;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	ntfs_debug("Failed.  Returning error code %i.", err);
-	return err;
-}
-
-static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
-		struct iov_iter *from)
-{
-	loff_t pos;
-	s64 end, ll;
-	ssize_t err;
-	unsigned long flags;
-	struct file *file = iocb->ki_filp;
-	struct inode *vi = file_inode(file);
-	ntfs_inode *ni = NTFS_I(vi);
-	ntfs_volume *vol = ni->vol;
-
-	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
-			"0x%llx, count 0x%zx.", vi->i_ino,
-			(unsigned)le32_to_cpu(ni->type),
-			(unsigned long long)iocb->ki_pos,
-			iov_iter_count(from));
-	err = generic_write_checks(iocb, from);
-	if (unlikely(err <= 0))
-		goto out;
-	/*
-	 * All checks have passed.  Before we start doing any writing we want
-	 * to abort any totally illegal writes.
-	 */
-	BUG_ON(NInoMstProtected(ni));
-	BUG_ON(ni->type != AT_DATA);
-	/* If file is encrypted, deny access, just like NT4. */
-	if (NInoEncrypted(ni)) {
-		/* Only $DATA attributes can be encrypted. */
-		/*
-		 * Reminder for later: Encrypted files are _always_
-		 * non-resident so that the content can always be encrypted.
-		 */
-		ntfs_debug("Denying write access to encrypted file.");
-		err = -EACCES;
-		goto out;
-	}
-	if (NInoCompressed(ni)) {
-		/* Only unnamed $DATA attribute can be compressed. */
-		BUG_ON(ni->name_len);
-		/*
-		 * Reminder for later: If resident, the data is not actually
-		 * compressed.  Only on the switch to non-resident does
-		 * compression kick in.  This is in contrast to encrypted files
-		 * (see above).
-		 */
-		ntfs_error(vi->i_sb, "Writing to compressed files is not "
-				"implemented yet.  Sorry.");
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-	err = file_remove_privs(file);
-	if (unlikely(err))
-		goto out;
-	/*
-	 * Our ->update_time method always succeeds thus file_update_time()
-	 * cannot fail either so there is no need to check the return code.
-	 */
-	file_update_time(file);
-	pos = iocb->ki_pos;
-	/* The first byte after the last cluster being written to. */
-	end = (pos + iov_iter_count(from) + vol->cluster_size_mask) &
-			~(u64)vol->cluster_size_mask;
-	/*
-	 * If the write goes beyond the allocated size, extend the allocation
-	 * to cover the whole of the write, rounded up to the nearest cluster.
-	 */
-	read_lock_irqsave(&ni->size_lock, flags);
-	ll = ni->allocated_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	if (end > ll) {
-		/*
-		 * Extend the allocation without changing the data size.
-		 *
-		 * Note we ensure the allocation is big enough to at least
-		 * write some data but we do not require the allocation to be
-		 * complete, i.e. it may be partial.
-		 */
-		ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
-		if (likely(ll >= 0)) {
-			BUG_ON(pos >= ll);
-			/* If the extension was partial truncate the write. */
-			if (end > ll) {
-				ntfs_debug("Truncating write to inode 0x%lx, "
-						"attribute type 0x%x, because "
-						"the allocation was only "
-						"partially extended.",
-						vi->i_ino, (unsigned)
-						le32_to_cpu(ni->type));
-				iov_iter_truncate(from, ll - pos);
-			}
-		} else {
-			err = ll;
-			read_lock_irqsave(&ni->size_lock, flags);
-			ll = ni->allocated_size;
-			read_unlock_irqrestore(&ni->size_lock, flags);
-			/* Perform a partial write if possible or fail. */
-			if (pos < ll) {
-				ntfs_debug("Truncating write to inode 0x%lx "
-						"attribute type 0x%x, because "
-						"extending the allocation "
-						"failed (error %d).",
-						vi->i_ino, (unsigned)
-						le32_to_cpu(ni->type),
-						(int)-err);
-				iov_iter_truncate(from, ll - pos);
-			} else {
-				if (err != -ENOSPC)
-					ntfs_error(vi->i_sb, "Cannot perform "
-							"write to inode "
-							"0x%lx, attribute "
-							"type 0x%x, because "
-							"extending the "
-							"allocation failed "
-							"(error %ld).",
-							vi->i_ino, (unsigned)
-							le32_to_cpu(ni->type),
-							(long)-err);
-				else
-					ntfs_debug("Cannot perform write to "
-							"inode 0x%lx, "
-							"attribute type 0x%x, "
-							"because there is not "
-							"space left.",
-							vi->i_ino, (unsigned)
-							le32_to_cpu(ni->type));
-				goto out;
-			}
-		}
-	}
-	/*
-	 * If the write starts beyond the initialized size, extend it up to the
-	 * beginning of the write and initialize all non-sparse space between
-	 * the old initialized size and the new one.  This automatically also
-	 * increments the vfs inode->i_size to keep it above or equal to the
-	 * initialized_size.
-	 */
-	read_lock_irqsave(&ni->size_lock, flags);
-	ll = ni->initialized_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	if (pos > ll) {
-		/*
-		 * Wait for ongoing direct i/o to complete before proceeding.
-		 * New direct i/o cannot start as we hold i_mutex.
-		 */
-		inode_dio_wait(vi);
-		err = ntfs_attr_extend_initialized(ni, pos);
-		if (unlikely(err < 0))
-			ntfs_error(vi->i_sb, "Cannot perform write to inode "
-					"0x%lx, attribute type 0x%x, because "
-					"extending the initialized size "
-					"failed (error %d).", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type),
-					(int)-err);
-	}
-out:
-	return err;
-}
-
-/**
- * __ntfs_grab_cache_pages - obtain a number of locked pages
- * @mapping:	address space mapping from which to obtain page cache pages
- * @index:	starting index in @mapping at which to begin obtaining pages
- * @nr_pages:	number of page cache pages to obtain
- * @pages:	array of pages in which to return the obtained page cache pages
- * @cached_page: allocated but as yet unused page
- *
- * Obtain @nr_pages locked page cache pages from the mapping @mapping and
- * starting at index @index.
- *
- * If a page is newly created, add it to lru list
- *
- * Note, the page locks are obtained in ascending page index order.
- */
-static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
-		pgoff_t index, const unsigned nr_pages, struct page **pages,
-		struct page **cached_page)
-{
-	int err, nr;
-
-	BUG_ON(!nr_pages);
-	err = nr = 0;
-	do {
-		pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
-				FGP_ACCESSED);
-		if (!pages[nr]) {
-			if (!*cached_page) {
-				*cached_page = page_cache_alloc(mapping);
-				if (unlikely(!*cached_page)) {
-					err = -ENOMEM;
-					goto err_out;
-				}
-			}
-			err = add_to_page_cache_lru(*cached_page, mapping,
-				   index,
-				   mapping_gfp_constraint(mapping, GFP_KERNEL));
-			if (unlikely(err)) {
-				if (err == -EEXIST)
-					continue;
-				goto err_out;
-			}
-			pages[nr] = *cached_page;
-			*cached_page = NULL;
-		}
-		index++;
-		nr++;
-	} while (nr < nr_pages);
-out:
-	return err;
-err_out:
-	while (nr > 0) {
-		unlock_page(pages[--nr]);
-		put_page(pages[nr]);
-	}
-	goto out;
-}
-
-static inline void ntfs_submit_bh_for_read(struct buffer_head *bh)
-{
-	lock_buffer(bh);
-	get_bh(bh);
-	bh->b_end_io = end_buffer_read_sync;
-	submit_bh(REQ_OP_READ, bh);
-}
-
-/**
- * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
- * @pages:	array of destination pages
- * @nr_pages:	number of pages in @pages
- * @pos:	byte position in file at which the write begins
- * @bytes:	number of bytes to be written
- *
- * This is called for non-resident attributes from ntfs_file_buffered_write()
- * with i_mutex held on the inode (@pages[0]->mapping->host).  There are
- * @nr_pages pages in @pages which are locked but not kmap()ped.  The source
- * data has not yet been copied into the @pages.
- * 
- * Need to fill any holes with actual clusters, allocate buffers if necessary,
- * ensure all the buffers are mapped, and bring uptodate any buffers that are
- * only partially being written to.
- *
- * If @nr_pages is greater than one, we are guaranteed that the cluster size is
- * greater than PAGE_SIZE, that all pages in @pages are entirely inside
- * the same cluster and that they are the entirety of that cluster, and that
- * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
- *
- * i_size is not to be modified yet.
- *
- * Return 0 on success or -errno on error.
- */
-static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
-		unsigned nr_pages, s64 pos, size_t bytes)
-{
-	VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
-	LCN lcn;
-	s64 bh_pos, vcn_len, end, initialized_size;
-	sector_t lcn_block;
-	struct folio *folio;
-	struct inode *vi;
-	ntfs_inode *ni, *base_ni = NULL;
-	ntfs_volume *vol;
-	runlist_element *rl, *rl2;
-	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
-	ntfs_attr_search_ctx *ctx = NULL;
-	MFT_RECORD *m = NULL;
-	ATTR_RECORD *a = NULL;
-	unsigned long flags;
-	u32 attr_rec_len = 0;
-	unsigned blocksize, u;
-	int err, mp_size;
-	bool rl_write_locked, was_hole, is_retry;
-	unsigned char blocksize_bits;
-	struct {
-		u8 runlist_merged:1;
-		u8 mft_attr_mapped:1;
-		u8 mp_rebuilt:1;
-		u8 attr_switched:1;
-	} status = { 0, 0, 0, 0 };
-
-	BUG_ON(!nr_pages);
-	BUG_ON(!pages);
-	BUG_ON(!*pages);
-	vi = pages[0]->mapping->host;
-	ni = NTFS_I(vi);
-	vol = ni->vol;
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
-			"index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
-			vi->i_ino, ni->type, pages[0]->index, nr_pages,
-			(long long)pos, bytes);
-	blocksize = vol->sb->s_blocksize;
-	blocksize_bits = vol->sb->s_blocksize_bits;
-	rl_write_locked = false;
-	rl = NULL;
-	err = 0;
-	vcn = lcn = -1;
-	vcn_len = 0;
-	lcn_block = -1;
-	was_hole = false;
-	cpos = pos >> vol->cluster_size_bits;
-	end = pos + bytes;
-	cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
-	/*
-	 * Loop over each buffer in each folio.  Use goto to
-	 * reduce indentation.
-	 */
-	u = 0;
-do_next_folio:
-	folio = page_folio(pages[u]);
-	bh_pos = folio_pos(folio);
-	head = folio_buffers(folio);
-	if (!head)
-		/*
-		 * create_empty_buffers() will create uptodate/dirty
-		 * buffers if the folio is uptodate/dirty.
-		 */
-		head = create_empty_buffers(folio, blocksize, 0);
-	bh = head;
-	do {
-		VCN cdelta;
-		s64 bh_end;
-		unsigned bh_cofs;
-
-		/* Clear buffer_new on all buffers to reinitialise state. */
-		if (buffer_new(bh))
-			clear_buffer_new(bh);
-		bh_end = bh_pos + blocksize;
-		bh_cpos = bh_pos >> vol->cluster_size_bits;
-		bh_cofs = bh_pos & vol->cluster_size_mask;
-		if (buffer_mapped(bh)) {
-			/*
-			 * The buffer is already mapped.  If it is uptodate,
-			 * ignore it.
-			 */
-			if (buffer_uptodate(bh))
-				continue;
-			/*
-			 * The buffer is not uptodate.  If the folio is uptodate
-			 * set the buffer uptodate and otherwise ignore it.
-			 */
-			if (folio_test_uptodate(folio)) {
-				set_buffer_uptodate(bh);
-				continue;
-			}
-			/*
-			 * Neither the folio nor the buffer are uptodate.  If
-			 * the buffer is only partially being written to, we
-			 * need to read it in before the write, i.e. now.
-			 */
-			if ((bh_pos < pos && bh_end > pos) ||
-					(bh_pos < end && bh_end > end)) {
-				/*
-				 * If the buffer is fully or partially within
-				 * the initialized size, do an actual read.
-				 * Otherwise, simply zero the buffer.
-				 */
-				read_lock_irqsave(&ni->size_lock, flags);
-				initialized_size = ni->initialized_size;
-				read_unlock_irqrestore(&ni->size_lock, flags);
-				if (bh_pos < initialized_size) {
-					ntfs_submit_bh_for_read(bh);
-					*wait_bh++ = bh;
-				} else {
-					folio_zero_range(folio, bh_offset(bh),
-							blocksize);
-					set_buffer_uptodate(bh);
-				}
-			}
-			continue;
-		}
-		/* Unmapped buffer.  Need to map it. */
-		bh->b_bdev = vol->sb->s_bdev;
-		/*
-		 * If the current buffer is in the same clusters as the map
-		 * cache, there is no need to check the runlist again.  The
-		 * map cache is made up of @vcn, which is the first cached file
-		 * cluster, @vcn_len which is the number of cached file
-		 * clusters, @lcn is the device cluster corresponding to @vcn,
-		 * and @lcn_block is the block number corresponding to @lcn.
-		 */
-		cdelta = bh_cpos - vcn;
-		if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
-map_buffer_cached:
-			BUG_ON(lcn < 0);
-			bh->b_blocknr = lcn_block +
-					(cdelta << (vol->cluster_size_bits -
-					blocksize_bits)) +
-					(bh_cofs >> blocksize_bits);
-			set_buffer_mapped(bh);
-			/*
-			 * If the folio is uptodate so is the buffer.  If the
-			 * buffer is fully outside the write, we ignore it if
-			 * it was already allocated and we mark it dirty so it
-			 * gets written out if we allocated it.  On the other
-			 * hand, if we allocated the buffer but we are not
-			 * marking it dirty we set buffer_new so we can do
-			 * error recovery.
-			 */
-			if (folio_test_uptodate(folio)) {
-				if (!buffer_uptodate(bh))
-					set_buffer_uptodate(bh);
-				if (unlikely(was_hole)) {
-					/* We allocated the buffer. */
-					clean_bdev_bh_alias(bh);
-					if (bh_end <= pos || bh_pos >= end)
-						mark_buffer_dirty(bh);
-					else
-						set_buffer_new(bh);
-				}
-				continue;
-			}
-			/* Page is _not_ uptodate. */
-			if (likely(!was_hole)) {
-				/*
-				 * Buffer was already allocated.  If it is not
-				 * uptodate and is only partially being written
-				 * to, we need to read it in before the write,
-				 * i.e. now.
-				 */
-				if (!buffer_uptodate(bh) && bh_pos < end &&
-						bh_end > pos &&
-						(bh_pos < pos ||
-						bh_end > end)) {
-					/*
-					 * If the buffer is fully or partially
-					 * within the initialized size, do an
-					 * actual read.  Otherwise, simply zero
-					 * the buffer.
-					 */
-					read_lock_irqsave(&ni->size_lock,
-							flags);
-					initialized_size = ni->initialized_size;
-					read_unlock_irqrestore(&ni->size_lock,
-							flags);
-					if (bh_pos < initialized_size) {
-						ntfs_submit_bh_for_read(bh);
-						*wait_bh++ = bh;
-					} else {
-						folio_zero_range(folio,
-								bh_offset(bh),
-								blocksize);
-						set_buffer_uptodate(bh);
-					}
-				}
-				continue;
-			}
-			/* We allocated the buffer. */
-			clean_bdev_bh_alias(bh);
-			/*
-			 * If the buffer is fully outside the write, zero it,
-			 * set it uptodate, and mark it dirty so it gets
-			 * written out.  If it is partially being written to,
-			 * zero region surrounding the write but leave it to
-			 * commit write to do anything else.  Finally, if the
-			 * buffer is fully being overwritten, do nothing.
-			 */
-			if (bh_end <= pos || bh_pos >= end) {
-				if (!buffer_uptodate(bh)) {
-					folio_zero_range(folio, bh_offset(bh),
-							blocksize);
-					set_buffer_uptodate(bh);
-				}
-				mark_buffer_dirty(bh);
-				continue;
-			}
-			set_buffer_new(bh);
-			if (!buffer_uptodate(bh) &&
-					(bh_pos < pos || bh_end > end)) {
-				u8 *kaddr;
-				unsigned pofs;
-					
-				kaddr = kmap_local_folio(folio, 0);
-				if (bh_pos < pos) {
-					pofs = bh_pos & ~PAGE_MASK;
-					memset(kaddr + pofs, 0, pos - bh_pos);
-				}
-				if (bh_end > end) {
-					pofs = end & ~PAGE_MASK;
-					memset(kaddr + pofs, 0, bh_end - end);
-				}
-				kunmap_local(kaddr);
-				flush_dcache_folio(folio);
-			}
-			continue;
-		}
-		/*
-		 * Slow path: this is the first buffer in the cluster.  If it
-		 * is outside allocated size and is not uptodate, zero it and
-		 * set it uptodate.
-		 */
-		read_lock_irqsave(&ni->size_lock, flags);
-		initialized_size = ni->allocated_size;
-		read_unlock_irqrestore(&ni->size_lock, flags);
-		if (bh_pos > initialized_size) {
-			if (folio_test_uptodate(folio)) {
-				if (!buffer_uptodate(bh))
-					set_buffer_uptodate(bh);
-			} else if (!buffer_uptodate(bh)) {
-				folio_zero_range(folio, bh_offset(bh),
-						blocksize);
-				set_buffer_uptodate(bh);
-			}
-			continue;
-		}
-		is_retry = false;
-		if (!rl) {
-			down_read(&ni->runlist.lock);
-retry_remap:
-			rl = ni->runlist.rl;
-		}
-		if (likely(rl != NULL)) {
-			/* Seek to element containing target cluster. */
-			while (rl->length && rl[1].vcn <= bh_cpos)
-				rl++;
-			lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
-			if (likely(lcn >= 0)) {
-				/*
-				 * Successful remap, setup the map cache and
-				 * use that to deal with the buffer.
-				 */
-				was_hole = false;
-				vcn = bh_cpos;
-				vcn_len = rl[1].vcn - vcn;
-				lcn_block = lcn << (vol->cluster_size_bits -
-						blocksize_bits);
-				cdelta = 0;
-				/*
-				 * If the number of remaining clusters touched
-				 * by the write is smaller or equal to the
-				 * number of cached clusters, unlock the
-				 * runlist as the map cache will be used from
-				 * now on.
-				 */
-				if (likely(vcn + vcn_len >= cend)) {
-					if (rl_write_locked) {
-						up_write(&ni->runlist.lock);
-						rl_write_locked = false;
-					} else
-						up_read(&ni->runlist.lock);
-					rl = NULL;
-				}
-				goto map_buffer_cached;
-			}
-		} else
-			lcn = LCN_RL_NOT_MAPPED;
-		/*
-		 * If it is not a hole and not out of bounds, the runlist is
-		 * probably unmapped so try to map it now.
-		 */
-		if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
-			if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
-				/* Attempt to map runlist. */
-				if (!rl_write_locked) {
-					/*
-					 * We need the runlist locked for
-					 * writing, so if it is locked for
-					 * reading relock it now and retry in
-					 * case it changed whilst we dropped
-					 * the lock.
-					 */
-					up_read(&ni->runlist.lock);
-					down_write(&ni->runlist.lock);
-					rl_write_locked = true;
-					goto retry_remap;
-				}
-				err = ntfs_map_runlist_nolock(ni, bh_cpos,
-						NULL);
-				if (likely(!err)) {
-					is_retry = true;
-					goto retry_remap;
-				}
-				/*
-				 * If @vcn is out of bounds, pretend @lcn is
-				 * LCN_ENOENT.  As long as the buffer is out
-				 * of bounds this will work fine.
-				 */
-				if (err == -ENOENT) {
-					lcn = LCN_ENOENT;
-					err = 0;
-					goto rl_not_mapped_enoent;
-				}
-			} else
-				err = -EIO;
-			/* Failed to map the buffer, even after retrying. */
-			bh->b_blocknr = -1;
-			ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
-					"attribute type 0x%x, vcn 0x%llx, "
-					"vcn offset 0x%x, because its "
-					"location on disk could not be "
-					"determined%s (error code %i).",
-					ni->mft_no, ni->type,
-					(unsigned long long)bh_cpos,
-					(unsigned)bh_pos &
-					vol->cluster_size_mask,
-					is_retry ? " even after retrying" : "",
-					err);
-			break;
-		}
-rl_not_mapped_enoent:
-		/*
-		 * The buffer is in a hole or out of bounds.  We need to fill
-		 * the hole, unless the buffer is in a cluster which is not
-		 * touched by the write, in which case we just leave the buffer
-		 * unmapped.  This can only happen when the cluster size is
-		 * less than the page cache size.
-		 */
-		if (unlikely(vol->cluster_size < PAGE_SIZE)) {
-			bh_cend = (bh_end + vol->cluster_size - 1) >>
-					vol->cluster_size_bits;
-			if ((bh_cend <= cpos || bh_cpos >= cend)) {
-				bh->b_blocknr = -1;
-				/*
-				 * If the buffer is uptodate we skip it.  If it
-				 * is not but the folio is uptodate, we can set
-				 * the buffer uptodate.  If the folio is not
-				 * uptodate, we can clear the buffer and set it
-				 * uptodate.  Whether this is worthwhile is
-				 * debatable and this could be removed.
-				 */
-				if (folio_test_uptodate(folio)) {
-					if (!buffer_uptodate(bh))
-						set_buffer_uptodate(bh);
-				} else if (!buffer_uptodate(bh)) {
-					folio_zero_range(folio, bh_offset(bh),
-						blocksize);
-					set_buffer_uptodate(bh);
-				}
-				continue;
-			}
-		}
-		/*
-		 * Out of bounds buffer is invalid if it was not really out of
-		 * bounds.
-		 */
-		BUG_ON(lcn != LCN_HOLE);
-		/*
-		 * We need the runlist locked for writing, so if it is locked
-		 * for reading relock it now and retry in case it changed
-		 * whilst we dropped the lock.
-		 */
-		BUG_ON(!rl);
-		if (!rl_write_locked) {
-			up_read(&ni->runlist.lock);
-			down_write(&ni->runlist.lock);
-			rl_write_locked = true;
-			goto retry_remap;
-		}
-		/* Find the previous last allocated cluster. */
-		BUG_ON(rl->lcn != LCN_HOLE);
-		lcn = -1;
-		rl2 = rl;
-		while (--rl2 >= ni->runlist.rl) {
-			if (rl2->lcn >= 0) {
-				lcn = rl2->lcn + rl2->length;
-				break;
-			}
-		}
-		rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
-				false);
-		if (IS_ERR(rl2)) {
-			err = PTR_ERR(rl2);
-			ntfs_debug("Failed to allocate cluster, error code %i.",
-					err);
-			break;
-		}
-		lcn = rl2->lcn;
-		rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
-		if (IS_ERR(rl)) {
-			err = PTR_ERR(rl);
-			if (err != -ENOMEM)
-				err = -EIO;
-			if (ntfs_cluster_free_from_rl(vol, rl2)) {
-				ntfs_error(vol->sb, "Failed to release "
-						"allocated cluster in error "
-						"code path.  Run chkdsk to "
-						"recover the lost cluster.");
-				NVolSetErrors(vol);
-			}
-			ntfs_free(rl2);
-			break;
-		}
-		ni->runlist.rl = rl;
-		status.runlist_merged = 1;
-		ntfs_debug("Allocated cluster, lcn 0x%llx.",
-				(unsigned long long)lcn);
-		/* Map and lock the mft record and get the attribute record. */
-		if (!NInoAttr(ni))
-			base_ni = ni;
-		else
-			base_ni = ni->ext.base_ntfs_ino;
-		m = map_mft_record(base_ni);
-		if (IS_ERR(m)) {
-			err = PTR_ERR(m);
-			break;
-		}
-		ctx = ntfs_attr_get_search_ctx(base_ni, m);
-		if (unlikely(!ctx)) {
-			err = -ENOMEM;
-			unmap_mft_record(base_ni);
-			break;
-		}
-		status.mft_attr_mapped = 1;
-		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-				CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
-		if (unlikely(err)) {
-			if (err == -ENOENT)
-				err = -EIO;
-			break;
-		}
-		m = ctx->mrec;
-		a = ctx->attr;
-		/*
-		 * Find the runlist element with which the attribute extent
-		 * starts.  Note, we cannot use the _attr_ version because we
-		 * have mapped the mft record.  That is ok because we know the
-		 * runlist fragment must be mapped already to have ever gotten
-		 * here, so we can just use the _rl_ version.
-		 */
-		vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
-		rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
-		BUG_ON(!rl2);
-		BUG_ON(!rl2->length);
-		BUG_ON(rl2->lcn < LCN_HOLE);
-		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
-		/*
-		 * If @highest_vcn is zero, calculate the real highest_vcn
-		 * (which can really be zero).
-		 */
-		if (!highest_vcn)
-			highest_vcn = (sle64_to_cpu(
-					a->data.non_resident.allocated_size) >>
-					vol->cluster_size_bits) - 1;
-		/*
-		 * Determine the size of the mapping pairs array for the new
-		 * extent, i.e. the old extent with the hole filled.
-		 */
-		mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
-				highest_vcn);
-		if (unlikely(mp_size <= 0)) {
-			if (!(err = mp_size))
-				err = -EIO;
-			ntfs_debug("Failed to get size for mapping pairs "
-					"array, error code %i.", err);
-			break;
-		}
-		/*
-		 * Resize the attribute record to fit the new mapping pairs
-		 * array.
-		 */
-		attr_rec_len = le32_to_cpu(a->length);
-		err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset));
-		if (unlikely(err)) {
-			BUG_ON(err != -ENOSPC);
-			// TODO: Deal with this by using the current attribute
-			// and fill it with as much of the mapping pairs
-			// array as possible.  Then loop over each attribute
-			// extent rewriting the mapping pairs arrays as we go
-			// along and if when we reach the end we have not
-			// enough space, try to resize the last attribute
-			// extent and if even that fails, add a new attribute
-			// extent.
-			// We could also try to resize at each step in the hope
-			// that we will not need to rewrite every single extent.
-			// Note, we may need to decompress some extents to fill
-			// the runlist as we are walking the extents...
-			ntfs_error(vol->sb, "Not enough space in the mft "
-					"record for the extended attribute "
-					"record.  This case is not "
-					"implemented yet.");
-			err = -EOPNOTSUPP;
-			break ;
-		}
-		status.mp_rebuilt = 1;
-		/*
-		 * Generate the mapping pairs array directly into the attribute
-		 * record.
-		 */
-		err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset),
-				mp_size, rl2, vcn, highest_vcn, NULL);
-		if (unlikely(err)) {
-			ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
-					"attribute type 0x%x, because building "
-					"the mapping pairs failed with error "
-					"code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-			err = -EIO;
-			break;
-		}
-		/* Update the highest_vcn but only if it was not set. */
-		if (unlikely(!a->data.non_resident.highest_vcn))
-			a->data.non_resident.highest_vcn =
-					cpu_to_sle64(highest_vcn);
-		/*
-		 * If the attribute is sparse/compressed, update the compressed
-		 * size in the ntfs_inode structure and the attribute record.
-		 */
-		if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
-			/*
-			 * If we are not in the first attribute extent, switch
-			 * to it, but first ensure the changes will make it to
-			 * disk later.
-			 */
-			if (a->data.non_resident.lowest_vcn) {
-				flush_dcache_mft_record_page(ctx->ntfs_ino);
-				mark_mft_record_dirty(ctx->ntfs_ino);
-				ntfs_attr_reinit_search_ctx(ctx);
-				err = ntfs_attr_lookup(ni->type, ni->name,
-						ni->name_len, CASE_SENSITIVE,
-						0, NULL, 0, ctx);
-				if (unlikely(err)) {
-					status.attr_switched = 1;
-					break;
-				}
-				/* @m is not used any more so do not set it. */
-				a = ctx->attr;
-			}
-			write_lock_irqsave(&ni->size_lock, flags);
-			ni->itype.compressed.size += vol->cluster_size;
-			a->data.non_resident.compressed_size =
-					cpu_to_sle64(ni->itype.compressed.size);
-			write_unlock_irqrestore(&ni->size_lock, flags);
-		}
-		/* Ensure the changes make it to disk. */
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(base_ni);
-		/* Successfully filled the hole. */
-		status.runlist_merged = 0;
-		status.mft_attr_mapped = 0;
-		status.mp_rebuilt = 0;
-		/* Setup the map cache and use that to deal with the buffer. */
-		was_hole = true;
-		vcn = bh_cpos;
-		vcn_len = 1;
-		lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
-		cdelta = 0;
-		/*
-		 * If the number of remaining clusters in the @pages is smaller
-		 * or equal to the number of cached clusters, unlock the
-		 * runlist as the map cache will be used from now on.
-		 */
-		if (likely(vcn + vcn_len >= cend)) {
-			up_write(&ni->runlist.lock);
-			rl_write_locked = false;
-			rl = NULL;
-		}
-		goto map_buffer_cached;
-	} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
-	/* If there are no errors, do the next page. */
-	if (likely(!err && ++u < nr_pages))
-		goto do_next_folio;
-	/* If there are no errors, release the runlist lock if we took it. */
-	if (likely(!err)) {
-		if (unlikely(rl_write_locked)) {
-			up_write(&ni->runlist.lock);
-			rl_write_locked = false;
-		} else if (unlikely(rl))
-			up_read(&ni->runlist.lock);
-		rl = NULL;
-	}
-	/* If we issued read requests, let them complete. */
-	read_lock_irqsave(&ni->size_lock, flags);
-	initialized_size = ni->initialized_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	while (wait_bh > wait) {
-		bh = *--wait_bh;
-		wait_on_buffer(bh);
-		if (likely(buffer_uptodate(bh))) {
-			folio = bh->b_folio;
-			bh_pos = folio_pos(folio) + bh_offset(bh);
-			/*
-			 * If the buffer overflows the initialized size, need
-			 * to zero the overflowing region.
-			 */
-			if (unlikely(bh_pos + blocksize > initialized_size)) {
-				int ofs = 0;
-
-				if (likely(bh_pos < initialized_size))
-					ofs = initialized_size - bh_pos;
-				folio_zero_segment(folio, bh_offset(bh) + ofs,
-						blocksize);
-			}
-		} else /* if (unlikely(!buffer_uptodate(bh))) */
-			err = -EIO;
-	}
-	if (likely(!err)) {
-		/* Clear buffer_new on all buffers. */
-		u = 0;
-		do {
-			bh = head = page_buffers(pages[u]);
-			do {
-				if (buffer_new(bh))
-					clear_buffer_new(bh);
-			} while ((bh = bh->b_this_page) != head);
-		} while (++u < nr_pages);
-		ntfs_debug("Done.");
-		return err;
-	}
-	if (status.attr_switched) {
-		/* Get back to the attribute extent we modified. */
-		ntfs_attr_reinit_search_ctx(ctx);
-		if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-				CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
-			ntfs_error(vol->sb, "Failed to find required "
-					"attribute extent of attribute in "
-					"error code path.  Run chkdsk to "
-					"recover.");
-			write_lock_irqsave(&ni->size_lock, flags);
-			ni->itype.compressed.size += vol->cluster_size;
-			write_unlock_irqrestore(&ni->size_lock, flags);
-			flush_dcache_mft_record_page(ctx->ntfs_ino);
-			mark_mft_record_dirty(ctx->ntfs_ino);
-			/*
-			 * The only thing that is now wrong is the compressed
-			 * size of the base attribute extent which chkdsk
-			 * should be able to fix.
-			 */
-			NVolSetErrors(vol);
-		} else {
-			m = ctx->mrec;
-			a = ctx->attr;
-			status.attr_switched = 0;
-		}
-	}
-	/*
-	 * If the runlist has been modified, need to restore it by punching a
-	 * hole into it and we then need to deallocate the on-disk cluster as
-	 * well.  Note, we only modify the runlist if we are able to generate a
-	 * new mapping pairs array, i.e. only when the mapped attribute extent
-	 * is not switched.
-	 */
-	if (status.runlist_merged && !status.attr_switched) {
-		BUG_ON(!rl_write_locked);
-		/* Make the file cluster we allocated sparse in the runlist. */
-		if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
-			ntfs_error(vol->sb, "Failed to punch hole into "
-					"attribute runlist in error code "
-					"path.  Run chkdsk to recover the "
-					"lost cluster.");
-			NVolSetErrors(vol);
-		} else /* if (success) */ {
-			status.runlist_merged = 0;
-			/*
-			 * Deallocate the on-disk cluster we allocated but only
-			 * if we succeeded in punching its vcn out of the
-			 * runlist.
-			 */
-			down_write(&vol->lcnbmp_lock);
-			if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
-				ntfs_error(vol->sb, "Failed to release "
-						"allocated cluster in error "
-						"code path.  Run chkdsk to "
-						"recover the lost cluster.");
-				NVolSetErrors(vol);
-			}
-			up_write(&vol->lcnbmp_lock);
-		}
-	}
-	/*
-	 * Resize the attribute record to its old size and rebuild the mapping
-	 * pairs array.  Note, we only can do this if the runlist has been
-	 * restored to its old state which also implies that the mapped
-	 * attribute extent is not switched.
-	 */
-	if (status.mp_rebuilt && !status.runlist_merged) {
-		if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
-			ntfs_error(vol->sb, "Failed to restore attribute "
-					"record in error code path.  Run "
-					"chkdsk to recover.");
-			NVolSetErrors(vol);
-		} else /* if (success) */ {
-			if (ntfs_mapping_pairs_build(vol, (u8*)a +
-					le16_to_cpu(a->data.non_resident.
-					mapping_pairs_offset), attr_rec_len -
-					le16_to_cpu(a->data.non_resident.
-					mapping_pairs_offset), ni->runlist.rl,
-					vcn, highest_vcn, NULL)) {
-				ntfs_error(vol->sb, "Failed to restore "
-						"mapping pairs array in error "
-						"code path.  Run chkdsk to "
-						"recover.");
-				NVolSetErrors(vol);
-			}
-			flush_dcache_mft_record_page(ctx->ntfs_ino);
-			mark_mft_record_dirty(ctx->ntfs_ino);
-		}
-	}
-	/* Release the mft record and the attribute. */
-	if (status.mft_attr_mapped) {
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(base_ni);
-	}
-	/* Release the runlist lock. */
-	if (rl_write_locked)
-		up_write(&ni->runlist.lock);
-	else if (rl)
-		up_read(&ni->runlist.lock);
-	/*
-	 * Zero out any newly allocated blocks to avoid exposing stale data.
-	 * If BH_New is set, we know that the block was newly allocated above
-	 * and that it has not been fully zeroed and marked dirty yet.
-	 */
-	nr_pages = u;
-	u = 0;
-	end = bh_cpos << vol->cluster_size_bits;
-	do {
-		folio = page_folio(pages[u]);
-		bh = head = folio_buffers(folio);
-		do {
-			if (u == nr_pages &&
-			    folio_pos(folio) + bh_offset(bh) >= end)
-				break;
-			if (!buffer_new(bh))
-				continue;
-			clear_buffer_new(bh);
-			if (!buffer_uptodate(bh)) {
-				if (folio_test_uptodate(folio))
-					set_buffer_uptodate(bh);
-				else {
-					folio_zero_range(folio, bh_offset(bh),
-							blocksize);
-					set_buffer_uptodate(bh);
-				}
-			}
-			mark_buffer_dirty(bh);
-		} while ((bh = bh->b_this_page) != head);
-	} while (++u <= nr_pages);
-	ntfs_error(vol->sb, "Failed.  Returning error code %i.", err);
-	return err;
-}
-
-static inline void ntfs_flush_dcache_pages(struct page **pages,
-		unsigned nr_pages)
-{
-	BUG_ON(!nr_pages);
-	/*
-	 * Warning: Do not do the decrement at the same time as the call to
-	 * flush_dcache_page() because it is a NULL macro on i386 and hence the
-	 * decrement never happens so the loop never terminates.
-	 */
-	do {
-		--nr_pages;
-		flush_dcache_page(pages[nr_pages]);
-	} while (nr_pages > 0);
-}
-
-/**
- * ntfs_commit_pages_after_non_resident_write - commit the received data
- * @pages:	array of destination pages
- * @nr_pages:	number of pages in @pages
- * @pos:	byte position in file at which the write begins
- * @bytes:	number of bytes to be written
- *
- * See description of ntfs_commit_pages_after_write(), below.
- */
-static inline int ntfs_commit_pages_after_non_resident_write(
-		struct page **pages, const unsigned nr_pages,
-		s64 pos, size_t bytes)
-{
-	s64 end, initialized_size;
-	struct inode *vi;
-	ntfs_inode *ni, *base_ni;
-	struct buffer_head *bh, *head;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	unsigned long flags;
-	unsigned blocksize, u;
-	int err;
-
-	vi = pages[0]->mapping->host;
-	ni = NTFS_I(vi);
-	blocksize = vi->i_sb->s_blocksize;
-	end = pos + bytes;
-	u = 0;
-	do {
-		s64 bh_pos;
-		struct page *page;
-		bool partial;
-
-		page = pages[u];
-		bh_pos = (s64)page->index << PAGE_SHIFT;
-		bh = head = page_buffers(page);
-		partial = false;
-		do {
-			s64 bh_end;
-
-			bh_end = bh_pos + blocksize;
-			if (bh_end <= pos || bh_pos >= end) {
-				if (!buffer_uptodate(bh))
-					partial = true;
-			} else {
-				set_buffer_uptodate(bh);
-				mark_buffer_dirty(bh);
-			}
-		} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
-		/*
-		 * If all buffers are now uptodate but the page is not, set the
-		 * page uptodate.
-		 */
-		if (!partial && !PageUptodate(page))
-			SetPageUptodate(page);
-	} while (++u < nr_pages);
-	/*
-	 * Finally, if we do not need to update initialized_size or i_size we
-	 * are finished.
-	 */
-	read_lock_irqsave(&ni->size_lock, flags);
-	initialized_size = ni->initialized_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	if (end <= initialized_size) {
-		ntfs_debug("Done.");
-		return 0;
-	}
-	/*
-	 * Update initialized_size/i_size as appropriate, both in the inode and
-	 * the mft record.
-	 */
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	/* Map, pin, and lock the mft record. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	BUG_ON(!NInoNonResident(ni));
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	a = ctx->attr;
-	BUG_ON(!a->non_resident);
-	write_lock_irqsave(&ni->size_lock, flags);
-	BUG_ON(end > ni->allocated_size);
-	ni->initialized_size = end;
-	a->data.non_resident.initialized_size = cpu_to_sle64(end);
-	if (end > i_size_read(vi)) {
-		i_size_write(vi, end);
-		a->data.non_resident.data_size =
-				a->data.non_resident.initialized_size;
-	}
-	write_unlock_irqrestore(&ni->size_lock, flags);
-	/* Mark the mft record dirty, so it gets written back. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	ntfs_debug("Done.");
-	return 0;
-err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
-			"code %i).", err);
-	if (err != -ENOMEM)
-		NVolSetErrors(ni->vol);
-	return err;
-}
-
-/**
- * ntfs_commit_pages_after_write - commit the received data
- * @pages:	array of destination pages
- * @nr_pages:	number of pages in @pages
- * @pos:	byte position in file at which the write begins
- * @bytes:	number of bytes to be written
- *
- * This is called from ntfs_file_buffered_write() with i_mutex held on the inode
- * (@pages[0]->mapping->host).  There are @nr_pages pages in @pages which are
- * locked but not kmap()ped.  The source data has already been copied into the
- * @page.  ntfs_prepare_pages_for_non_resident_write() has been called before
- * the data was copied (for non-resident attributes only) and it returned
- * success.
- *
- * Need to set uptodate and mark dirty all buffers within the boundary of the
- * write.  If all buffers in a page are uptodate we set the page uptodate, too.
- *
- * Setting the buffers dirty ensures that they get written out later when
- * ntfs_writepage() is invoked by the VM.
- *
- * Finally, we need to update i_size and initialized_size as appropriate both
- * in the inode and the mft record.
- *
- * This is modelled after fs/buffer.c::generic_commit_write(), which marks
- * buffers uptodate and dirty, sets the page uptodate if all buffers in the
- * page are uptodate, and updates i_size if the end of io is beyond i_size.  In
- * that case, it also marks the inode dirty.
- *
- * If things have gone as outlined in
- * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
- * content modifications here for non-resident attributes.  For resident
- * attributes we need to do the uptodate bringing here which we combine with
- * the copying into the mft record which means we save one atomic kmap.
- *
- * Return 0 on success or -errno on error.
- */
-static int ntfs_commit_pages_after_write(struct page **pages,
-		const unsigned nr_pages, s64 pos, size_t bytes)
-{
-	s64 end, initialized_size;
-	loff_t i_size;
-	struct inode *vi;
-	ntfs_inode *ni, *base_ni;
-	struct page *page;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	char *kattr, *kaddr;
-	unsigned long flags;
-	u32 attr_len;
-	int err;
-
-	BUG_ON(!nr_pages);
-	BUG_ON(!pages);
-	page = pages[0];
-	BUG_ON(!page);
-	vi = page->mapping->host;
-	ni = NTFS_I(vi);
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
-			"index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
-			vi->i_ino, ni->type, page->index, nr_pages,
-			(long long)pos, bytes);
-	if (NInoNonResident(ni))
-		return ntfs_commit_pages_after_non_resident_write(pages,
-				nr_pages, pos, bytes);
-	BUG_ON(nr_pages > 1);
-	/*
-	 * Attribute is resident, implying it is not compressed, encrypted, or
-	 * sparse.
-	 */
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	BUG_ON(NInoNonResident(ni));
-	/* Map, pin, and lock the mft record. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	a = ctx->attr;
-	BUG_ON(a->non_resident);
-	/* The total length of the attribute value. */
-	attr_len = le32_to_cpu(a->data.resident.value_length);
-	i_size = i_size_read(vi);
-	BUG_ON(attr_len != i_size);
-	BUG_ON(pos > attr_len);
-	end = pos + bytes;
-	BUG_ON(end > le32_to_cpu(a->length) -
-			le16_to_cpu(a->data.resident.value_offset));
-	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
-	kaddr = kmap_atomic(page);
-	/* Copy the received data from the page to the mft record. */
-	memcpy(kattr + pos, kaddr + pos, bytes);
-	/* Update the attribute length if necessary. */
-	if (end > attr_len) {
-		attr_len = end;
-		a->data.resident.value_length = cpu_to_le32(attr_len);
-	}
-	/*
-	 * If the page is not uptodate, bring the out of bounds area(s)
-	 * uptodate by copying data from the mft record to the page.
-	 */
-	if (!PageUptodate(page)) {
-		if (pos > 0)
-			memcpy(kaddr, kattr, pos);
-		if (end < attr_len)
-			memcpy(kaddr + end, kattr + end, attr_len - end);
-		/* Zero the region outside the end of the attribute value. */
-		memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
-	}
-	kunmap_atomic(kaddr);
-	/* Update initialized_size/i_size if necessary. */
-	read_lock_irqsave(&ni->size_lock, flags);
-	initialized_size = ni->initialized_size;
-	BUG_ON(end > ni->allocated_size);
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	BUG_ON(initialized_size != i_size);
-	if (end > initialized_size) {
-		write_lock_irqsave(&ni->size_lock, flags);
-		ni->initialized_size = end;
-		i_size_write(vi, end);
-		write_unlock_irqrestore(&ni->size_lock, flags);
-	}
-	/* Mark the mft record dirty, so it gets written back. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	ntfs_debug("Done.");
-	return 0;
-err_out:
-	if (err == -ENOMEM) {
-		ntfs_warning(vi->i_sb, "Error allocating memory required to "
-				"commit the write.");
-		if (PageUptodate(page)) {
-			ntfs_warning(vi->i_sb, "Page is uptodate, setting "
-					"dirty so the write will be retried "
-					"later on by the VM.");
-			/*
-			 * Put the page on mapping->dirty_pages, but leave its
-			 * buffers' dirty state as-is.
-			 */
-			__set_page_dirty_nobuffers(page);
-			err = 0;
-		} else
-			ntfs_error(vi->i_sb, "Page is not uptodate.  Written "
-					"data has been lost.");
-	} else {
-		ntfs_error(vi->i_sb, "Resident attribute commit write failed "
-				"with error %i.", err);
-		NVolSetErrors(ni->vol);
-	}
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	return err;
-}
-
-/*
- * Copy as much as we can into the pages and return the number of bytes which
- * were successfully copied.  If a fault is encountered then clear the pages
- * out to (ofs + bytes) and return the number of bytes which were copied.
- */
-static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
-		unsigned ofs, struct iov_iter *i, size_t bytes)
-{
-	struct page **last_page = pages + nr_pages;
-	size_t total = 0;
-	unsigned len, copied;
-
-	do {
-		len = PAGE_SIZE - ofs;
-		if (len > bytes)
-			len = bytes;
-		copied = copy_page_from_iter_atomic(*pages, ofs, len, i);
-		total += copied;
-		bytes -= copied;
-		if (!bytes)
-			break;
-		if (copied < len)
-			goto err;
-		ofs = 0;
-	} while (++pages < last_page);
-out:
-	return total;
-err:
-	/* Zero the rest of the target like __copy_from_user(). */
-	len = PAGE_SIZE - copied;
-	do {
-		if (len > bytes)
-			len = bytes;
-		zero_user(*pages, copied, len);
-		bytes -= len;
-		copied = 0;
-		len = PAGE_SIZE;
-	} while (++pages < last_page);
-	goto out;
-}
-
-/**
- * ntfs_perform_write - perform buffered write to a file
- * @file:	file to write to
- * @i:		iov_iter with data to write
- * @pos:	byte offset in file at which to begin writing to
- */
-static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
-		loff_t pos)
-{
-	struct address_space *mapping = file->f_mapping;
-	struct inode *vi = mapping->host;
-	ntfs_inode *ni = NTFS_I(vi);
-	ntfs_volume *vol = ni->vol;
-	struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
-	struct page *cached_page = NULL;
-	VCN last_vcn;
-	LCN lcn;
-	size_t bytes;
-	ssize_t status, written = 0;
-	unsigned nr_pages;
-
-	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
-			"0x%llx, count 0x%lx.", vi->i_ino,
-			(unsigned)le32_to_cpu(ni->type),
-			(unsigned long long)pos,
-			(unsigned long)iov_iter_count(i));
-	/*
-	 * If a previous ntfs_truncate() failed, repeat it and abort if it
-	 * fails again.
-	 */
-	if (unlikely(NInoTruncateFailed(ni))) {
-		int err;
-
-		inode_dio_wait(vi);
-		err = ntfs_truncate(vi);
-		if (err || NInoTruncateFailed(ni)) {
-			if (!err)
-				err = -EIO;
-			ntfs_error(vol->sb, "Cannot perform write to inode "
-					"0x%lx, attribute type 0x%x, because "
-					"ntfs_truncate() failed (error code "
-					"%i).", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-			return err;
-		}
-	}
-	/*
-	 * Determine the number of pages per cluster for non-resident
-	 * attributes.
-	 */
-	nr_pages = 1;
-	if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni))
-		nr_pages = vol->cluster_size >> PAGE_SHIFT;
-	last_vcn = -1;
-	do {
-		VCN vcn;
-		pgoff_t start_idx;
-		unsigned ofs, do_pages, u;
-		size_t copied;
-
-		start_idx = pos >> PAGE_SHIFT;
-		ofs = pos & ~PAGE_MASK;
-		bytes = PAGE_SIZE - ofs;
-		do_pages = 1;
-		if (nr_pages > 1) {
-			vcn = pos >> vol->cluster_size_bits;
-			if (vcn != last_vcn) {
-				last_vcn = vcn;
-				/*
-				 * Get the lcn of the vcn the write is in.  If
-				 * it is a hole, need to lock down all pages in
-				 * the cluster.
-				 */
-				down_read(&ni->runlist.lock);
-				lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
-						vol->cluster_size_bits, false);
-				up_read(&ni->runlist.lock);
-				if (unlikely(lcn < LCN_HOLE)) {
-					if (lcn == LCN_ENOMEM)
-						status = -ENOMEM;
-					else {
-						status = -EIO;
-						ntfs_error(vol->sb, "Cannot "
-							"perform write to "
-							"inode 0x%lx, "
-							"attribute type 0x%x, "
-							"because the attribute "
-							"is corrupt.",
-							vi->i_ino, (unsigned)
-							le32_to_cpu(ni->type));
-					}
-					break;
-				}
-				if (lcn == LCN_HOLE) {
-					start_idx = (pos & ~(s64)
-							vol->cluster_size_mask)
-							>> PAGE_SHIFT;
-					bytes = vol->cluster_size - (pos &
-							vol->cluster_size_mask);
-					do_pages = nr_pages;
-				}
-			}
-		}
-		if (bytes > iov_iter_count(i))
-			bytes = iov_iter_count(i);
-again:
-		/*
-		 * Bring in the user page(s) that we will copy from _first_.
-		 * Otherwise there is a nasty deadlock on copying from the same
-		 * page(s) as we are writing to, without it/them being marked
-		 * up-to-date.  Note, at present there is nothing to stop the
-		 * pages being swapped out between us bringing them into memory
-		 * and doing the actual copying.
-		 */
-		if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
-			status = -EFAULT;
-			break;
-		}
-		/* Get and lock @do_pages starting at index @start_idx. */
-		status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
-				pages, &cached_page);
-		if (unlikely(status))
-			break;
-		/*
-		 * For non-resident attributes, we need to fill any holes with
-		 * actual clusters and ensure all bufferes are mapped.  We also
-		 * need to bring uptodate any buffers that are only partially
-		 * being written to.
-		 */
-		if (NInoNonResident(ni)) {
-			status = ntfs_prepare_pages_for_non_resident_write(
-					pages, do_pages, pos, bytes);
-			if (unlikely(status)) {
-				do {
-					unlock_page(pages[--do_pages]);
-					put_page(pages[do_pages]);
-				} while (do_pages);
-				break;
-			}
-		}
-		u = (pos >> PAGE_SHIFT) - pages[0]->index;
-		copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
-					i, bytes);
-		ntfs_flush_dcache_pages(pages + u, do_pages - u);
-		status = 0;
-		if (likely(copied == bytes)) {
-			status = ntfs_commit_pages_after_write(pages, do_pages,
-					pos, bytes);
-		}
-		do {
-			unlock_page(pages[--do_pages]);
-			put_page(pages[do_pages]);
-		} while (do_pages);
-		if (unlikely(status < 0)) {
-			iov_iter_revert(i, copied);
-			break;
-		}
-		cond_resched();
-		if (unlikely(copied < bytes)) {
-			iov_iter_revert(i, copied);
-			if (copied)
-				bytes = copied;
-			else if (bytes > PAGE_SIZE - ofs)
-				bytes = PAGE_SIZE - ofs;
-			goto again;
-		}
-		pos += copied;
-		written += copied;
-		balance_dirty_pages_ratelimited(mapping);
-		if (fatal_signal_pending(current)) {
-			status = -EINTR;
-			break;
-		}
-	} while (iov_iter_count(i));
-	if (cached_page)
-		put_page(cached_page);
-	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
-			written ? "written" : "status", (unsigned long)written,
-			(long)status);
-	return written ? written : status;
-}
-
-/**
- * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
- * @iocb:	IO state structure
- * @from:	iov_iter with data to write
- *
- * Basically the same as generic_file_write_iter() except that it ends up
- * up calling ntfs_perform_write() instead of generic_perform_write() and that
- * O_DIRECT is not implemented.
- */
-static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *vi = file_inode(file);
-	ssize_t written = 0;
-	ssize_t err;
-
-	inode_lock(vi);
-	/* We can write back this queue in page reclaim. */
-	err = ntfs_prepare_file_for_write(iocb, from);
-	if (iov_iter_count(from) && !err)
-		written = ntfs_perform_write(file, from, iocb->ki_pos);
-	inode_unlock(vi);
-	iocb->ki_pos += written;
-	if (likely(written > 0))
-		written = generic_write_sync(iocb, written);
-	return written ? written : err;
-}
-
-/**
- * ntfs_file_fsync - sync a file to disk
- * @filp:	file to be synced
- * @datasync:	if non-zero only flush user data and not metadata
- *
- * Data integrity sync of a file to disk.  Used for fsync, fdatasync, and msync
- * system calls.  This function is inspired by fs/buffer.c::file_fsync().
- *
- * If @datasync is false, write the mft record and all associated extent mft
- * records as well as the $DATA attribute and then sync the block device.
- *
- * If @datasync is true and the attribute is non-resident, we skip the writing
- * of the mft record and all associated extent mft records (this might still
- * happen due to the write_inode_now() call).
- *
- * Also, if @datasync is true, we do not wait on the inode to be written out
- * but we always wait on the page cache pages to be written out.
- *
- * Locking: Caller must hold i_mutex on the inode.
- *
- * TODO: We should probably also write all attribute/index inodes associated
- * with this inode but since we have no simple way of getting to them we ignore
- * this problem for now.
- */
-static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
-			   int datasync)
-{
-	struct inode *vi = filp->f_mapping->host;
-	int err, ret = 0;
-
-	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
-
-	err = file_write_and_wait_range(filp, start, end);
-	if (err)
-		return err;
-	inode_lock(vi);
-
-	BUG_ON(S_ISDIR(vi->i_mode));
-	if (!datasync || !NInoNonResident(NTFS_I(vi)))
-		ret = __ntfs_write_inode(vi, 1);
-	write_inode_now(vi, !datasync);
-	/*
-	 * NOTE: If we were to use mapping->private_list (see ext2 and
-	 * fs/buffer.c) for dirty blocks then we could optimize the below to be
-	 * sync_mapping_buffers(vi->i_mapping).
-	 */
-	err = sync_blockdev(vi->i_sb->s_bdev);
-	if (unlikely(err && !ret))
-		ret = err;
-	if (likely(!ret))
-		ntfs_debug("Done.");
-	else
-		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
-				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
-	inode_unlock(vi);
-	return ret;
-}
-
-#endif /* NTFS_RW */
-
-const struct file_operations ntfs_file_ops = {
-	.llseek		= generic_file_llseek,
-	.read_iter	= generic_file_read_iter,
-#ifdef NTFS_RW
-	.write_iter	= ntfs_file_write_iter,
-	.fsync		= ntfs_file_fsync,
-#endif /* NTFS_RW */
-	.mmap		= generic_file_mmap,
-	.open		= ntfs_file_open,
-	.splice_read	= filemap_splice_read,
-};
-
-const struct inode_operations ntfs_file_inode_ops = {
-#ifdef NTFS_RW
-	.setattr	= ntfs_setattr,
-#endif /* NTFS_RW */
-};
-
-const struct file_operations ntfs_empty_file_ops = {};
-
-const struct inode_operations ntfs_empty_inode_ops = {};
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
deleted file mode 100644
index d46c2c03a032..000000000000
--- a/fs/ntfs/index.c
+++ /dev/null
@@ -1,440 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * index.c - NTFS kernel index handling.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#include <linux/slab.h>
-
-#include "aops.h"
-#include "collate.h"
-#include "debug.h"
-#include "index.h"
-#include "ntfs.h"
-
-/**
- * ntfs_index_ctx_get - allocate and initialize a new index context
- * @idx_ni:	ntfs index inode with which to initialize the context
- *
- * Allocate a new index context, initialize it with @idx_ni and return it.
- * Return NULL if allocation failed.
- *
- * Locking:  Caller must hold i_mutex on the index inode.
- */
-ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
-{
-	ntfs_index_context *ictx;
-
-	ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS);
-	if (ictx)
-		*ictx = (ntfs_index_context){ .idx_ni = idx_ni };
-	return ictx;
-}
-
-/**
- * ntfs_index_ctx_put - release an index context
- * @ictx:	index context to free
- *
- * Release the index context @ictx, releasing all associated resources.
- *
- * Locking:  Caller must hold i_mutex on the index inode.
- */
-void ntfs_index_ctx_put(ntfs_index_context *ictx)
-{
-	if (ictx->entry) {
-		if (ictx->is_in_root) {
-			if (ictx->actx)
-				ntfs_attr_put_search_ctx(ictx->actx);
-			if (ictx->base_ni)
-				unmap_mft_record(ictx->base_ni);
-		} else {
-			struct page *page = ictx->page;
-			if (page) {
-				BUG_ON(!PageLocked(page));
-				unlock_page(page);
-				ntfs_unmap_page(page);
-			}
-		}
-	}
-	kmem_cache_free(ntfs_index_ctx_cache, ictx);
-	return;
-}
-
-/**
- * ntfs_index_lookup - find a key in an index and return its index entry
- * @key:	[IN] key for which to search in the index
- * @key_len:	[IN] length of @key in bytes
- * @ictx:	[IN/OUT] context describing the index and the returned entry
- *
- * Before calling ntfs_index_lookup(), @ictx must have been obtained from a
- * call to ntfs_index_ctx_get().
- *
- * Look for the @key in the index specified by the index lookup context @ictx.
- * ntfs_index_lookup() walks the contents of the index looking for the @key.
- *
- * If the @key is found in the index, 0 is returned and @ictx is setup to
- * describe the index entry containing the matching @key.  @ictx->entry is the
- * index entry and @ictx->data and @ictx->data_len are the index entry data and
- * its length in bytes, respectively.
- *
- * If the @key is not found in the index, -ENOENT is returned and @ictx is
- * setup to describe the index entry whose key collates immediately after the
- * search @key, i.e. this is the position in the index at which an index entry
- * with a key of @key would need to be inserted.
- *
- * If an error occurs return the negative error code and @ictx is left
- * untouched.
- *
- * When finished with the entry and its data, call ntfs_index_ctx_put() to free
- * the context and other associated resources.
- *
- * If the index entry was modified, call flush_dcache_index_entry_page()
- * immediately after the modification and either ntfs_index_entry_mark_dirty()
- * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
- * ensure that the changes are written to disk.
- *
- * Locking:  - Caller must hold i_mutex on the index inode.
- *	     - Each page cache page in the index allocation mapping must be
- *	       locked whilst being accessed otherwise we may find a corrupt
- *	       page due to it being under ->writepage at the moment which
- *	       applies the mst protection fixups before writing out and then
- *	       removes them again after the write is complete after which it 
- *	       unlocks the page.
- */
-int ntfs_index_lookup(const void *key, const int key_len,
-		ntfs_index_context *ictx)
-{
-	VCN vcn, old_vcn;
-	ntfs_inode *idx_ni = ictx->idx_ni;
-	ntfs_volume *vol = idx_ni->vol;
-	struct super_block *sb = vol->sb;
-	ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino;
-	MFT_RECORD *m;
-	INDEX_ROOT *ir;
-	INDEX_ENTRY *ie;
-	INDEX_ALLOCATION *ia;
-	u8 *index_end, *kaddr;
-	ntfs_attr_search_ctx *actx;
-	struct address_space *ia_mapping;
-	struct page *page;
-	int rc, err = 0;
-
-	ntfs_debug("Entering.");
-	BUG_ON(!NInoAttr(idx_ni));
-	BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION);
-	BUG_ON(idx_ni->nr_extents != -1);
-	BUG_ON(!base_ni);
-	BUG_ON(!key);
-	BUG_ON(key_len <= 0);
-	if (!ntfs_is_collation_rule_supported(
-			idx_ni->itype.index.collation_rule)) {
-		ntfs_error(sb, "Index uses unsupported collation rule 0x%x.  "
-				"Aborting lookup.", le32_to_cpu(
-				idx_ni->itype.index.collation_rule));
-		return -EOPNOTSUPP;
-	}
-	/* Get hold of the mft record for the index inode. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		ntfs_error(sb, "map_mft_record() failed with error code %ld.",
-				-PTR_ERR(m));
-		return PTR_ERR(m);
-	}
-	actx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!actx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	/* Find the index root attribute in the mft record. */
-	err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, actx);
-	if (unlikely(err)) {
-		if (err == -ENOENT) {
-			ntfs_error(sb, "Index root attribute missing in inode "
-					"0x%lx.", idx_ni->mft_no);
-			err = -EIO;
-		}
-		goto err_out;
-	}
-	/* Get to the index root value (it has been verified in read_inode). */
-	ir = (INDEX_ROOT*)((u8*)actx->attr +
-			le16_to_cpu(actx->attr->data.resident.value_offset));
-	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ir->index +
-			le32_to_cpu(ir->index.entries_offset));
-	/*
-	 * Loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry.
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		/* Bounds checks. */
-		if ((u8*)ie < (u8*)actx->mrec || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->length) > index_end)
-			goto idx_err_out;
-		/*
-		 * The last entry cannot contain a key.  It can however contain
-		 * a pointer to a child node in the B+tree so we just break out.
-		 */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/* Further bounds checks. */
-		if ((u32)sizeof(INDEX_ENTRY_HEADER) +
-				le16_to_cpu(ie->key_length) >
-				le16_to_cpu(ie->data.vi.data_offset) ||
-				(u32)le16_to_cpu(ie->data.vi.data_offset) +
-				le16_to_cpu(ie->data.vi.data_length) >
-				le16_to_cpu(ie->length))
-			goto idx_err_out;
-		/* If the keys match perfectly, we setup @ictx and return 0. */
-		if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
-				&ie->key, key_len)) {
-ir_done:
-			ictx->is_in_root = true;
-			ictx->ir = ir;
-			ictx->actx = actx;
-			ictx->base_ni = base_ni;
-			ictx->ia = NULL;
-			ictx->page = NULL;
-done:
-			ictx->entry = ie;
-			ictx->data = (u8*)ie +
-					le16_to_cpu(ie->data.vi.data_offset);
-			ictx->data_len = le16_to_cpu(ie->data.vi.data_length);
-			ntfs_debug("Done.");
-			return err;
-		}
-		/*
-		 * Not a perfect match, need to do full blown collation so we
-		 * know which way in the B+tree we have to go.
-		 */
-		rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
-				key_len, &ie->key, le16_to_cpu(ie->key_length));
-		/*
-		 * If @key collates before the key of the current entry, there
-		 * is definitely no such key in this index but we might need to
-		 * descend into the B+tree so we just break out of the loop.
-		 */
-		if (rc == -1)
-			break;
-		/*
-		 * A match should never happen as the memcmp() call should have
-		 * cought it, but we still treat it correctly.
-		 */
-		if (!rc)
-			goto ir_done;
-		/* The keys are not equal, continue the search. */
-	}
-	/*
-	 * We have finished with this index without success.  Check for the
-	 * presence of a child node and if not present setup @ictx and return
-	 * -ENOENT.
-	 */
-	if (!(ie->flags & INDEX_ENTRY_NODE)) {
-		ntfs_debug("Entry not found.");
-		err = -ENOENT;
-		goto ir_done;
-	} /* Child node present, descend into it. */
-	/* Consistency check: Verify that an index allocation exists. */
-	if (!NInoIndexAllocPresent(idx_ni)) {
-		ntfs_error(sb, "No index allocation attribute but index entry "
-				"requires one.  Inode 0x%lx is corrupt or "
-				"driver bug.", idx_ni->mft_no);
-		goto err_out;
-	}
-	/* Get the starting vcn of the index_block holding the child node. */
-	vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
-	ia_mapping = VFS_I(idx_ni)->i_mapping;
-	/*
-	 * We are done with the index root and the mft record.  Release them,
-	 * otherwise we deadlock with ntfs_map_page().
-	 */
-	ntfs_attr_put_search_ctx(actx);
-	unmap_mft_record(base_ni);
-	m = NULL;
-	actx = NULL;
-descend_into_child_node:
-	/*
-	 * Convert vcn to index into the index allocation attribute in units
-	 * of PAGE_SIZE and map the page cache page, reading it from
-	 * disk if necessary.
-	 */
-	page = ntfs_map_page(ia_mapping, vcn <<
-			idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
-	if (IS_ERR(page)) {
-		ntfs_error(sb, "Failed to map index page, error %ld.",
-				-PTR_ERR(page));
-		err = PTR_ERR(page);
-		goto err_out;
-	}
-	lock_page(page);
-	kaddr = (u8*)page_address(page);
-fast_descend_into_child_node:
-	/* Get to the index allocation block. */
-	ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
-			idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
-	/* Bounds checks. */
-	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
-		ntfs_error(sb, "Out of bounds check failed.  Corrupt inode "
-				"0x%lx or driver bug.", idx_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* Catch multi sector transfer fixup errors. */
-	if (unlikely(!ntfs_is_indx_record(ia->magic))) {
-		ntfs_error(sb, "Index record with vcn 0x%llx is corrupt.  "
-				"Corrupt inode 0x%lx.  Run chkdsk.",
-				(long long)vcn, idx_ni->mft_no);
-		goto unm_err_out;
-	}
-	if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
-		ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
-				"different from expected VCN (0x%llx).  Inode "
-				"0x%lx is corrupt or driver bug.",
-				(unsigned long long)
-				sle64_to_cpu(ia->index_block_vcn),
-				(unsigned long long)vcn, idx_ni->mft_no);
-		goto unm_err_out;
-	}
-	if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
-			idx_ni->itype.index.block_size) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has "
-				"a size (%u) differing from the index "
-				"specified size (%u).  Inode is corrupt or "
-				"driver bug.", (unsigned long long)vcn,
-				idx_ni->mft_no,
-				le32_to_cpu(ia->index.allocated_size) + 0x18,
-				idx_ni->itype.index.block_size);
-		goto unm_err_out;
-	}
-	index_end = (u8*)ia + idx_ni->itype.index.block_size;
-	if (index_end > kaddr + PAGE_SIZE) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx "
-				"crosses page boundary.  Impossible!  Cannot "
-				"access!  This is probably a bug in the "
-				"driver.", (unsigned long long)vcn,
-				idx_ni->mft_no);
-		goto unm_err_out;
-	}
-	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
-	if (index_end > (u8*)ia + idx_ni->itype.index.block_size) {
-		ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode "
-				"0x%lx exceeds maximum size.",
-				(unsigned long long)vcn, idx_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ia->index +
-			le32_to_cpu(ia->index.entries_offset));
-	/*
-	 * Iterate similar to above big loop but applied to index buffer, thus
-	 * loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry.
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		/* Bounds checks. */
-		if ((u8*)ie < (u8*)ia || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->length) > index_end) {
-			ntfs_error(sb, "Index entry out of bounds in inode "
-					"0x%lx.", idx_ni->mft_no);
-			goto unm_err_out;
-		}
-		/*
-		 * The last entry cannot contain a key.  It can however contain
-		 * a pointer to a child node in the B+tree so we just break out.
-		 */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/* Further bounds checks. */
-		if ((u32)sizeof(INDEX_ENTRY_HEADER) +
-				le16_to_cpu(ie->key_length) >
-				le16_to_cpu(ie->data.vi.data_offset) ||
-				(u32)le16_to_cpu(ie->data.vi.data_offset) +
-				le16_to_cpu(ie->data.vi.data_length) >
-				le16_to_cpu(ie->length)) {
-			ntfs_error(sb, "Index entry out of bounds in inode "
-					"0x%lx.", idx_ni->mft_no);
-			goto unm_err_out;
-		}
-		/* If the keys match perfectly, we setup @ictx and return 0. */
-		if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
-				&ie->key, key_len)) {
-ia_done:
-			ictx->is_in_root = false;
-			ictx->actx = NULL;
-			ictx->base_ni = NULL;
-			ictx->ia = ia;
-			ictx->page = page;
-			goto done;
-		}
-		/*
-		 * Not a perfect match, need to do full blown collation so we
-		 * know which way in the B+tree we have to go.
-		 */
-		rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
-				key_len, &ie->key, le16_to_cpu(ie->key_length));
-		/*
-		 * If @key collates before the key of the current entry, there
-		 * is definitely no such key in this index but we might need to
-		 * descend into the B+tree so we just break out of the loop.
-		 */
-		if (rc == -1)
-			break;
-		/*
-		 * A match should never happen as the memcmp() call should have
-		 * cought it, but we still treat it correctly.
-		 */
-		if (!rc)
-			goto ia_done;
-		/* The keys are not equal, continue the search. */
-	}
-	/*
-	 * We have finished with this index buffer without success.  Check for
-	 * the presence of a child node and if not present return -ENOENT.
-	 */
-	if (!(ie->flags & INDEX_ENTRY_NODE)) {
-		ntfs_debug("Entry not found.");
-		err = -ENOENT;
-		goto ia_done;
-	}
-	if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
-		ntfs_error(sb, "Index entry with child node found in a leaf "
-				"node in inode 0x%lx.", idx_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* Child node present, descend into it. */
-	old_vcn = vcn;
-	vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
-	if (vcn >= 0) {
-		/*
-		 * If vcn is in the same page cache page as old_vcn we recycle
-		 * the mapped page.
-		 */
-		if (old_vcn << vol->cluster_size_bits >>
-				PAGE_SHIFT == vcn <<
-				vol->cluster_size_bits >>
-				PAGE_SHIFT)
-			goto fast_descend_into_child_node;
-		unlock_page(page);
-		ntfs_unmap_page(page);
-		goto descend_into_child_node;
-	}
-	ntfs_error(sb, "Negative child node vcn in inode 0x%lx.",
-			idx_ni->mft_no);
-unm_err_out:
-	unlock_page(page);
-	ntfs_unmap_page(page);
-err_out:
-	if (!err)
-		err = -EIO;
-	if (actx)
-		ntfs_attr_put_search_ctx(actx);
-	if (m)
-		unmap_mft_record(base_ni);
-	return err;
-idx_err_out:
-	ntfs_error(sb, "Corrupt index.  Aborting lookup.");
-	goto err_out;
-}
diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h
deleted file mode 100644
index bb3c3ae55138..000000000000
--- a/fs/ntfs/index.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * index.h - Defines for NTFS kernel index handling.  Part of the Linux-NTFS
- *	     project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_INDEX_H
-#define _LINUX_NTFS_INDEX_H
-
-#include <linux/fs.h>
-
-#include "types.h"
-#include "layout.h"
-#include "inode.h"
-#include "attrib.h"
-#include "mft.h"
-#include "aops.h"
-
-/**
- * @idx_ni:	index inode containing the @entry described by this context
- * @entry:	index entry (points into @ir or @ia)
- * @data:	index entry data (points into @entry)
- * @data_len:	length in bytes of @data
- * @is_in_root:	'true' if @entry is in @ir and 'false' if it is in @ia
- * @ir:		index root if @is_in_root and NULL otherwise
- * @actx:	attribute search context if @is_in_root and NULL otherwise
- * @base_ni:	base inode if @is_in_root and NULL otherwise
- * @ia:		index block if @is_in_root is 'false' and NULL otherwise
- * @page:	page if @is_in_root is 'false' and NULL otherwise
- *
- * @idx_ni is the index inode this context belongs to.
- *
- * @entry is the index entry described by this context.  @data and @data_len
- * are the index entry data and its length in bytes, respectively.  @data
- * simply points into @entry.  This is probably what the user is interested in.
- *
- * If @is_in_root is 'true', @entry is in the index root attribute @ir described
- * by the attribute search context @actx and the base inode @base_ni.  @ia and
- * @page are NULL in this case.
- *
- * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia
- * and @page point to the index allocation block and the mapped, locked page it
- * is in, respectively.  @ir, @actx and @base_ni are NULL in this case.
- *
- * To obtain a context call ntfs_index_ctx_get().
- *
- * We use this context to allow ntfs_index_lookup() to return the found index
- * @entry and its @data without having to allocate a buffer and copy the @entry
- * and/or its @data into it.
- *
- * When finished with the @entry and its @data, call ntfs_index_ctx_put() to
- * free the context and other associated resources.
- *
- * If the index entry was modified, call flush_dcache_index_entry_page()
- * immediately after the modification and either ntfs_index_entry_mark_dirty()
- * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
- * ensure that the changes are written to disk.
- */
-typedef struct {
-	ntfs_inode *idx_ni;
-	INDEX_ENTRY *entry;
-	void *data;
-	u16 data_len;
-	bool is_in_root;
-	INDEX_ROOT *ir;
-	ntfs_attr_search_ctx *actx;
-	ntfs_inode *base_ni;
-	INDEX_ALLOCATION *ia;
-	struct page *page;
-} ntfs_index_context;
-
-extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni);
-extern void ntfs_index_ctx_put(ntfs_index_context *ictx);
-
-extern int ntfs_index_lookup(const void *key, const int key_len,
-		ntfs_index_context *ictx);
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries
- * @ictx:	ntfs index context describing the index entry
- *
- * Call flush_dcache_page() for the page in which an index entry resides.
- *
- * This must be called every time an index entry is modified, just after the
- * modification.
- *
- * If the index entry is in the index root attribute, simply flush the page
- * containing the mft record containing the index root attribute.
- *
- * If the index entry is in an index block belonging to the index allocation
- * attribute, simply flush the page cache page containing the index block.
- */
-static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx)
-{
-	if (ictx->is_in_root)
-		flush_dcache_mft_record_page(ictx->actx->ntfs_ino);
-	else
-		flush_dcache_page(ictx->page);
-}
-
-/**
- * ntfs_index_entry_mark_dirty - mark an index entry dirty
- * @ictx:	ntfs index context describing the index entry
- *
- * Mark the index entry described by the index entry context @ictx dirty.
- *
- * If the index entry is in the index root attribute, simply mark the mft
- * record containing the index root attribute dirty.  This ensures the mft
- * record, and hence the index root attribute, will be written out to disk
- * later.
- *
- * If the index entry is in an index block belonging to the index allocation
- * attribute, mark the buffers belonging to the index record as well as the
- * page cache page the index block is in dirty.  This automatically marks the
- * VFS inode of the ntfs index inode to which the index entry belongs dirty,
- * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the
- * dirty index block, will be written out to disk later.
- */
-static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx)
-{
-	if (ictx->is_in_root)
-		mark_mft_record_dirty(ictx->actx->ntfs_ino);
-	else
-		mark_ntfs_record_dirty(ictx->page,
-				(u8*)ictx->ia - (u8*)page_address(ictx->page));
-}
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_INDEX_H */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
deleted file mode 100644
index aba1e22db4e9..000000000000
--- a/fs/ntfs/inode.c
+++ /dev/null
@@ -1,3102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * inode.c - NTFS kernel inode handling.
- *
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
- */
-
-#include <linux/buffer_head.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/mount.h>
-#include <linux/mutex.h>
-#include <linux/pagemap.h>
-#include <linux/quotaops.h>
-#include <linux/slab.h>
-#include <linux/log2.h>
-
-#include "aops.h"
-#include "attrib.h"
-#include "bitmap.h"
-#include "dir.h"
-#include "debug.h"
-#include "inode.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "time.h"
-#include "ntfs.h"
-
-/**
- * ntfs_test_inode - compare two (possibly fake) inodes for equality
- * @vi:		vfs inode which to test
- * @data:	data which is being tested with
- *
- * Compare the ntfs attribute embedded in the ntfs specific part of the vfs
- * inode @vi for equality with the ntfs attribute @data.
- *
- * If searching for the normal file/directory inode, set @na->type to AT_UNUSED.
- * @na->name and @na->name_len are then ignored.
- *
- * Return 1 if the attributes match and 0 if not.
- *
- * NOTE: This function runs with the inode_hash_lock spin lock held so it is not
- * allowed to sleep.
- */
-int ntfs_test_inode(struct inode *vi, void *data)
-{
-	ntfs_attr *na = (ntfs_attr *)data;
-	ntfs_inode *ni;
-
-	if (vi->i_ino != na->mft_no)
-		return 0;
-	ni = NTFS_I(vi);
-	/* If !NInoAttr(ni), @vi is a normal file or directory inode. */
-	if (likely(!NInoAttr(ni))) {
-		/* If not looking for a normal inode this is a mismatch. */
-		if (unlikely(na->type != AT_UNUSED))
-			return 0;
-	} else {
-		/* A fake inode describing an attribute. */
-		if (ni->type != na->type)
-			return 0;
-		if (ni->name_len != na->name_len)
-			return 0;
-		if (na->name_len && memcmp(ni->name, na->name,
-				na->name_len * sizeof(ntfschar)))
-			return 0;
-	}
-	/* Match! */
-	return 1;
-}
-
-/**
- * ntfs_init_locked_inode - initialize an inode
- * @vi:		vfs inode to initialize
- * @data:	data which to initialize @vi to
- *
- * Initialize the vfs inode @vi with the values from the ntfs attribute @data in
- * order to enable ntfs_test_inode() to do its work.
- *
- * If initializing the normal file/directory inode, set @na->type to AT_UNUSED.
- * In that case, @na->name and @na->name_len should be set to NULL and 0,
- * respectively. Although that is not strictly necessary as
- * ntfs_read_locked_inode() will fill them in later.
- *
- * Return 0 on success and -errno on error.
- *
- * NOTE: This function runs with the inode->i_lock spin lock held so it is not
- * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
- */
-static int ntfs_init_locked_inode(struct inode *vi, void *data)
-{
-	ntfs_attr *na = (ntfs_attr *)data;
-	ntfs_inode *ni = NTFS_I(vi);
-
-	vi->i_ino = na->mft_no;
-
-	ni->type = na->type;
-	if (na->type == AT_INDEX_ALLOCATION)
-		NInoSetMstProtected(ni);
-
-	ni->name = na->name;
-	ni->name_len = na->name_len;
-
-	/* If initializing a normal inode, we are done. */
-	if (likely(na->type == AT_UNUSED)) {
-		BUG_ON(na->name);
-		BUG_ON(na->name_len);
-		return 0;
-	}
-
-	/* It is a fake inode. */
-	NInoSetAttr(ni);
-
-	/*
-	 * We have I30 global constant as an optimization as it is the name
-	 * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC
-	 * allocation but that is ok. And most attributes are unnamed anyway,
-	 * thus the fraction of named attributes with name != I30 is actually
-	 * absolutely tiny.
-	 */
-	if (na->name_len && na->name != I30) {
-		unsigned int i;
-
-		BUG_ON(!na->name);
-		i = na->name_len * sizeof(ntfschar);
-		ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC);
-		if (!ni->name)
-			return -ENOMEM;
-		memcpy(ni->name, na->name, i);
-		ni->name[na->name_len] = 0;
-	}
-	return 0;
-}
-
-static int ntfs_read_locked_inode(struct inode *vi);
-static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi);
-static int ntfs_read_locked_index_inode(struct inode *base_vi,
-		struct inode *vi);
-
-/**
- * ntfs_iget - obtain a struct inode corresponding to a specific normal inode
- * @sb:		super block of mounted volume
- * @mft_no:	mft record number / inode number to obtain
- *
- * Obtain the struct inode corresponding to a specific normal inode (i.e. a
- * file or directory).
- *
- * If the inode is in the cache, it is just returned with an increased
- * reference count. Otherwise, a new struct inode is allocated and initialized,
- * and finally ntfs_read_locked_inode() is called to read in the inode and
- * fill in the remainder of the inode structure.
- *
- * Return the struct inode on success. Check the return value with IS_ERR() and
- * if true, the function failed and the error code is obtained from PTR_ERR().
- */
-struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
-{
-	struct inode *vi;
-	int err;
-	ntfs_attr na;
-
-	na.mft_no = mft_no;
-	na.type = AT_UNUSED;
-	na.name = NULL;
-	na.name_len = 0;
-
-	vi = iget5_locked(sb, mft_no, ntfs_test_inode,
-			ntfs_init_locked_inode, &na);
-	if (unlikely(!vi))
-		return ERR_PTR(-ENOMEM);
-
-	err = 0;
-
-	/* If this is a freshly allocated inode, need to read it now. */
-	if (vi->i_state & I_NEW) {
-		err = ntfs_read_locked_inode(vi);
-		unlock_new_inode(vi);
-	}
-	/*
-	 * There is no point in keeping bad inodes around if the failure was
-	 * due to ENOMEM. We want to be able to retry again later.
-	 */
-	if (unlikely(err == -ENOMEM)) {
-		iput(vi);
-		vi = ERR_PTR(err);
-	}
-	return vi;
-}
-
-/**
- * ntfs_attr_iget - obtain a struct inode corresponding to an attribute
- * @base_vi:	vfs base inode containing the attribute
- * @type:	attribute type
- * @name:	Unicode name of the attribute (NULL if unnamed)
- * @name_len:	length of @name in Unicode characters (0 if unnamed)
- *
- * Obtain the (fake) struct inode corresponding to the attribute specified by
- * @type, @name, and @name_len, which is present in the base mft record
- * specified by the vfs inode @base_vi.
- *
- * If the attribute inode is in the cache, it is just returned with an
- * increased reference count. Otherwise, a new struct inode is allocated and
- * initialized, and finally ntfs_read_locked_attr_inode() is called to read the
- * attribute and fill in the inode structure.
- *
- * Note, for index allocation attributes, you need to use ntfs_index_iget()
- * instead of ntfs_attr_iget() as working with indices is a lot more complex.
- *
- * Return the struct inode of the attribute inode on success. Check the return
- * value with IS_ERR() and if true, the function failed and the error code is
- * obtained from PTR_ERR().
- */
-struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
-		ntfschar *name, u32 name_len)
-{
-	struct inode *vi;
-	int err;
-	ntfs_attr na;
-
-	/* Make sure no one calls ntfs_attr_iget() for indices. */
-	BUG_ON(type == AT_INDEX_ALLOCATION);
-
-	na.mft_no = base_vi->i_ino;
-	na.type = type;
-	na.name = name;
-	na.name_len = name_len;
-
-	vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
-			ntfs_init_locked_inode, &na);
-	if (unlikely(!vi))
-		return ERR_PTR(-ENOMEM);
-
-	err = 0;
-
-	/* If this is a freshly allocated inode, need to read it now. */
-	if (vi->i_state & I_NEW) {
-		err = ntfs_read_locked_attr_inode(base_vi, vi);
-		unlock_new_inode(vi);
-	}
-	/*
-	 * There is no point in keeping bad attribute inodes around. This also
-	 * simplifies things in that we never need to check for bad attribute
-	 * inodes elsewhere.
-	 */
-	if (unlikely(err)) {
-		iput(vi);
-		vi = ERR_PTR(err);
-	}
-	return vi;
-}
-
-/**
- * ntfs_index_iget - obtain a struct inode corresponding to an index
- * @base_vi:	vfs base inode containing the index related attributes
- * @name:	Unicode name of the index
- * @name_len:	length of @name in Unicode characters
- *
- * Obtain the (fake) struct inode corresponding to the index specified by @name
- * and @name_len, which is present in the base mft record specified by the vfs
- * inode @base_vi.
- *
- * If the index inode is in the cache, it is just returned with an increased
- * reference count.  Otherwise, a new struct inode is allocated and
- * initialized, and finally ntfs_read_locked_index_inode() is called to read
- * the index related attributes and fill in the inode structure.
- *
- * Return the struct inode of the index inode on success. Check the return
- * value with IS_ERR() and if true, the function failed and the error code is
- * obtained from PTR_ERR().
- */
-struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
-		u32 name_len)
-{
-	struct inode *vi;
-	int err;
-	ntfs_attr na;
-
-	na.mft_no = base_vi->i_ino;
-	na.type = AT_INDEX_ALLOCATION;
-	na.name = name;
-	na.name_len = name_len;
-
-	vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
-			ntfs_init_locked_inode, &na);
-	if (unlikely(!vi))
-		return ERR_PTR(-ENOMEM);
-
-	err = 0;
-
-	/* If this is a freshly allocated inode, need to read it now. */
-	if (vi->i_state & I_NEW) {
-		err = ntfs_read_locked_index_inode(base_vi, vi);
-		unlock_new_inode(vi);
-	}
-	/*
-	 * There is no point in keeping bad index inodes around.  This also
-	 * simplifies things in that we never need to check for bad index
-	 * inodes elsewhere.
-	 */
-	if (unlikely(err)) {
-		iput(vi);
-		vi = ERR_PTR(err);
-	}
-	return vi;
-}
-
-struct inode *ntfs_alloc_big_inode(struct super_block *sb)
-{
-	ntfs_inode *ni;
-
-	ntfs_debug("Entering.");
-	ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS);
-	if (likely(ni != NULL)) {
-		ni->state = 0;
-		return VFS_I(ni);
-	}
-	ntfs_error(sb, "Allocation of NTFS big inode structure failed.");
-	return NULL;
-}
-
-void ntfs_free_big_inode(struct inode *inode)
-{
-	kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
-}
-
-static inline ntfs_inode *ntfs_alloc_extent_inode(void)
-{
-	ntfs_inode *ni;
-
-	ntfs_debug("Entering.");
-	ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS);
-	if (likely(ni != NULL)) {
-		ni->state = 0;
-		return ni;
-	}
-	ntfs_error(NULL, "Allocation of NTFS inode structure failed.");
-	return NULL;
-}
-
-static void ntfs_destroy_extent_inode(ntfs_inode *ni)
-{
-	ntfs_debug("Entering.");
-	BUG_ON(ni->page);
-	if (!atomic_dec_and_test(&ni->count))
-		BUG();
-	kmem_cache_free(ntfs_inode_cache, ni);
-}
-
-/*
- * The attribute runlist lock has separate locking rules from the
- * normal runlist lock, so split the two lock-classes:
- */
-static struct lock_class_key attr_list_rl_lock_class;
-
-/**
- * __ntfs_init_inode - initialize ntfs specific part of an inode
- * @sb:		super block of mounted volume
- * @ni:		freshly allocated ntfs inode which to initialize
- *
- * Initialize an ntfs inode to defaults.
- *
- * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left
- * untouched. Make sure to initialize them elsewhere.
- *
- * Return zero on success and -ENOMEM on error.
- */
-void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
-{
-	ntfs_debug("Entering.");
-	rwlock_init(&ni->size_lock);
-	ni->initialized_size = ni->allocated_size = 0;
-	ni->seq_no = 0;
-	atomic_set(&ni->count, 1);
-	ni->vol = NTFS_SB(sb);
-	ntfs_init_runlist(&ni->runlist);
-	mutex_init(&ni->mrec_lock);
-	ni->page = NULL;
-	ni->page_ofs = 0;
-	ni->attr_list_size = 0;
-	ni->attr_list = NULL;
-	ntfs_init_runlist(&ni->attr_list_rl);
-	lockdep_set_class(&ni->attr_list_rl.lock,
-				&attr_list_rl_lock_class);
-	ni->itype.index.block_size = 0;
-	ni->itype.index.vcn_size = 0;
-	ni->itype.index.collation_rule = 0;
-	ni->itype.index.block_size_bits = 0;
-	ni->itype.index.vcn_size_bits = 0;
-	mutex_init(&ni->extent_lock);
-	ni->nr_extents = 0;
-	ni->ext.base_ntfs_ino = NULL;
-}
-
-/*
- * Extent inodes get MFT-mapped in a nested way, while the base inode
- * is still mapped. Teach this nesting to the lock validator by creating
- * a separate class for nested inode's mrec_lock's:
- */
-static struct lock_class_key extent_inode_mrec_lock_key;
-
-inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
-		unsigned long mft_no)
-{
-	ntfs_inode *ni = ntfs_alloc_extent_inode();
-
-	ntfs_debug("Entering.");
-	if (likely(ni != NULL)) {
-		__ntfs_init_inode(sb, ni);
-		lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key);
-		ni->mft_no = mft_no;
-		ni->type = AT_UNUSED;
-		ni->name = NULL;
-		ni->name_len = 0;
-	}
-	return ni;
-}
-
-/**
- * ntfs_is_extended_system_file - check if a file is in the $Extend directory
- * @ctx:	initialized attribute search context
- *
- * Search all file name attributes in the inode described by the attribute
- * search context @ctx and check if any of the names are in the $Extend system
- * directory.
- *
- * Return values:
- *	   1: file is in $Extend directory
- *	   0: file is not in $Extend directory
- *    -errno: failed to determine if the file is in the $Extend directory
- */
-static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx)
-{
-	int nr_links, err;
-
-	/* Restart search. */
-	ntfs_attr_reinit_search_ctx(ctx);
-
-	/* Get number of hard links. */
-	nr_links = le16_to_cpu(ctx->mrec->link_count);
-
-	/* Loop through all hard links. */
-	while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0,
-			ctx))) {
-		FILE_NAME_ATTR *file_name_attr;
-		ATTR_RECORD *attr = ctx->attr;
-		u8 *p, *p2;
-
-		nr_links--;
-		/*
-		 * Maximum sanity checking as we are called on an inode that
-		 * we suspect might be corrupt.
-		 */
-		p = (u8*)attr + le32_to_cpu(attr->length);
-		if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec +
-				le32_to_cpu(ctx->mrec->bytes_in_use)) {
-err_corrupt_attr:
-			ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name "
-					"attribute. You should run chkdsk.");
-			return -EIO;
-		}
-		if (attr->non_resident) {
-			ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file "
-					"name. You should run chkdsk.");
-			return -EIO;
-		}
-		if (attr->flags) {
-			ntfs_error(ctx->ntfs_ino->vol->sb, "File name with "
-					"invalid flags. You should run "
-					"chkdsk.");
-			return -EIO;
-		}
-		if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) {
-			ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file "
-					"name. You should run chkdsk.");
-			return -EIO;
-		}
-		file_name_attr = (FILE_NAME_ATTR*)((u8*)attr +
-				le16_to_cpu(attr->data.resident.value_offset));
-		p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length);
-		if (p2 < (u8*)attr || p2 > p)
-			goto err_corrupt_attr;
-		/* This attribute is ok, but is it in the $Extend directory? */
-		if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend)
-			return 1;	/* YES, it's an extended system file. */
-	}
-	if (unlikely(err != -ENOENT))
-		return err;
-	if (unlikely(nr_links)) {
-		ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count "
-				"doesn't match number of name attributes. You "
-				"should run chkdsk.");
-		return -EIO;
-	}
-	return 0;	/* NO, it is not an extended system file. */
-}
-
-/**
- * ntfs_read_locked_inode - read an inode from its device
- * @vi:		inode to read
- *
- * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode
- * described by @vi into memory from the device.
- *
- * The only fields in @vi that we need to/can look at when the function is
- * called are i_sb, pointing to the mounted device's super block, and i_ino,
- * the number of the inode to load.
- *
- * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino
- * for reading and sets up the necessary @vi fields as well as initializing
- * the ntfs inode.
- *
- * Q: What locks are held when the function is called?
- * A: i_state has I_NEW set, hence the inode is locked, also
- *    i_count is set to 1, so it is not going to go away
- *    i_flags is set to 0 and we have no business touching it.  Only an ioctl()
- *    is allowed to write to them. We should of course be honouring them but
- *    we need to do that using the IS_* macros defined in include/linux/fs.h.
- *    In any case ntfs_read_locked_inode() has nothing to do with i_flags.
- *
- * Return 0 on success and -errno on error.  In the error case, the inode will
- * have had make_bad_inode() executed on it.
- */
-static int ntfs_read_locked_inode(struct inode *vi)
-{
-	ntfs_volume *vol = NTFS_SB(vi->i_sb);
-	ntfs_inode *ni;
-	struct inode *bvi;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	STANDARD_INFORMATION *si;
-	ntfs_attr_search_ctx *ctx;
-	int err = 0;
-
-	ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
-
-	/* Setup the generic vfs inode parts now. */
-	vi->i_uid = vol->uid;
-	vi->i_gid = vol->gid;
-	vi->i_mode = 0;
-
-	/*
-	 * Initialize the ntfs specific part of @vi special casing
-	 * FILE_MFT which we need to do at mount time.
-	 */
-	if (vi->i_ino != FILE_MFT)
-		ntfs_init_big_inode(vi);
-	ni = NTFS_I(vi);
-
-	m = map_mft_record(ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(ni, m);
-	if (!ctx) {
-		err = -ENOMEM;
-		goto unm_err_out;
-	}
-
-	if (!(m->flags & MFT_RECORD_IN_USE)) {
-		ntfs_error(vi->i_sb, "Inode is not in use!");
-		goto unm_err_out;
-	}
-	if (m->base_mft_record) {
-		ntfs_error(vi->i_sb, "Inode is an extent inode!");
-		goto unm_err_out;
-	}
-
-	/* Transfer information from mft record into vfs and ntfs inodes. */
-	vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
-
-	/*
-	 * FIXME: Keep in mind that link_count is two for files which have both
-	 * a long file name and a short file name as separate entries, so if
-	 * we are hiding short file names this will be too high. Either we need
-	 * to account for the short file names by subtracting them or we need
-	 * to make sure we delete files even though i_nlink is not zero which
-	 * might be tricky due to vfs interactions. Need to think about this
-	 * some more when implementing the unlink command.
-	 */
-	set_nlink(vi, le16_to_cpu(m->link_count));
-	/*
-	 * FIXME: Reparse points can have the directory bit set even though
-	 * they would be S_IFLNK. Need to deal with this further below when we
-	 * implement reparse points / symbolic links but it will do for now.
-	 * Also if not a directory, it could be something else, rather than
-	 * a regular file. But again, will do for now.
-	 */
-	/* Everyone gets all permissions. */
-	vi->i_mode |= S_IRWXUGO;
-	/* If read-only, no one gets write permissions. */
-	if (IS_RDONLY(vi))
-		vi->i_mode &= ~S_IWUGO;
-	if (m->flags & MFT_RECORD_IS_DIRECTORY) {
-		vi->i_mode |= S_IFDIR;
-		/*
-		 * Apply the directory permissions mask set in the mount
-		 * options.
-		 */
-		vi->i_mode &= ~vol->dmask;
-		/* Things break without this kludge! */
-		if (vi->i_nlink > 1)
-			set_nlink(vi, 1);
-	} else {
-		vi->i_mode |= S_IFREG;
-		/* Apply the file permissions mask set in the mount options. */
-		vi->i_mode &= ~vol->fmask;
-	}
-	/*
-	 * Find the standard information attribute in the mft record. At this
-	 * stage we haven't setup the attribute list stuff yet, so this could
-	 * in fact fail if the standard information is in an extent record, but
-	 * I don't think this actually ever happens.
-	 */
-	err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0,
-			ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT) {
-			/*
-			 * TODO: We should be performing a hot fix here (if the
-			 * recover mount option is set) by creating a new
-			 * attribute.
-			 */
-			ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute "
-					"is missing.");
-		}
-		goto unm_err_out;
-	}
-	a = ctx->attr;
-	/* Get the standard information attribute value. */
-	if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset)
-			+ le32_to_cpu(a->data.resident.value_length) >
-			(u8 *)ctx->mrec + vol->mft_record_size) {
-		ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode.");
-		goto unm_err_out;
-	}
-	si = (STANDARD_INFORMATION*)((u8*)a +
-			le16_to_cpu(a->data.resident.value_offset));
-
-	/* Transfer information from the standard information into vi. */
-	/*
-	 * Note: The i_?times do not quite map perfectly onto the NTFS times,
-	 * but they are close enough, and in the end it doesn't really matter
-	 * that much...
-	 */
-	/*
-	 * mtime is the last change of the data within the file. Not changed
-	 * when only metadata is changed, e.g. a rename doesn't affect mtime.
-	 */
-	inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time));
-	/*
-	 * ctime is the last change of the metadata of the file. This obviously
-	 * always changes, when mtime is changed. ctime can be changed on its
-	 * own, mtime is then not changed, e.g. when a file is renamed.
-	 */
-	inode_set_ctime_to_ts(vi, ntfs2utc(si->last_mft_change_time));
-	/*
-	 * Last access to the data within the file. Not changed during a rename
-	 * for example but changed whenever the file is written to.
-	 */
-	inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time));
-
-	/* Find the attribute list attribute if present. */
-	ntfs_attr_reinit_search_ctx(ctx);
-	err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
-	if (err) {
-		if (unlikely(err != -ENOENT)) {
-			ntfs_error(vi->i_sb, "Failed to lookup attribute list "
-					"attribute.");
-			goto unm_err_out;
-		}
-	} else /* if (!err) */ {
-		if (vi->i_ino == FILE_MFT)
-			goto skip_attr_list_load;
-		ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino);
-		NInoSetAttrList(ni);
-		a = ctx->attr;
-		if (a->flags & ATTR_COMPRESSION_MASK) {
-			ntfs_error(vi->i_sb, "Attribute list attribute is "
-					"compressed.");
-			goto unm_err_out;
-		}
-		if (a->flags & ATTR_IS_ENCRYPTED ||
-				a->flags & ATTR_IS_SPARSE) {
-			if (a->non_resident) {
-				ntfs_error(vi->i_sb, "Non-resident attribute "
-						"list attribute is encrypted/"
-						"sparse.");
-				goto unm_err_out;
-			}
-			ntfs_warning(vi->i_sb, "Resident attribute list "
-					"attribute in inode 0x%lx is marked "
-					"encrypted/sparse which is not true.  "
-					"However, Windows allows this and "
-					"chkdsk does not detect or correct it "
-					"so we will just ignore the invalid "
-					"flags and pretend they are not set.",
-					vi->i_ino);
-		}
-		/* Now allocate memory for the attribute list. */
-		ni->attr_list_size = (u32)ntfs_attr_size(a);
-		ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
-		if (!ni->attr_list) {
-			ntfs_error(vi->i_sb, "Not enough memory to allocate "
-					"buffer for attribute list.");
-			err = -ENOMEM;
-			goto unm_err_out;
-		}
-		if (a->non_resident) {
-			NInoSetAttrListNonResident(ni);
-			if (a->data.non_resident.lowest_vcn) {
-				ntfs_error(vi->i_sb, "Attribute list has non "
-						"zero lowest_vcn.");
-				goto unm_err_out;
-			}
-			/*
-			 * Setup the runlist. No need for locking as we have
-			 * exclusive access to the inode at this time.
-			 */
-			ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
-					a, NULL);
-			if (IS_ERR(ni->attr_list_rl.rl)) {
-				err = PTR_ERR(ni->attr_list_rl.rl);
-				ni->attr_list_rl.rl = NULL;
-				ntfs_error(vi->i_sb, "Mapping pairs "
-						"decompression failed.");
-				goto unm_err_out;
-			}
-			/* Now load the attribute list. */
-			if ((err = load_attribute_list(vol, &ni->attr_list_rl,
-					ni->attr_list, ni->attr_list_size,
-					sle64_to_cpu(a->data.non_resident.
-					initialized_size)))) {
-				ntfs_error(vi->i_sb, "Failed to load "
-						"attribute list attribute.");
-				goto unm_err_out;
-			}
-		} else /* if (!a->non_resident) */ {
-			if ((u8*)a + le16_to_cpu(a->data.resident.value_offset)
-					+ le32_to_cpu(
-					a->data.resident.value_length) >
-					(u8*)ctx->mrec + vol->mft_record_size) {
-				ntfs_error(vi->i_sb, "Corrupt attribute list "
-						"in inode.");
-				goto unm_err_out;
-			}
-			/* Now copy the attribute list. */
-			memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
-					a->data.resident.value_offset),
-					le32_to_cpu(
-					a->data.resident.value_length));
-		}
-	}
-skip_attr_list_load:
-	/*
-	 * If an attribute list is present we now have the attribute list value
-	 * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes.
-	 */
-	if (S_ISDIR(vi->i_mode)) {
-		loff_t bvi_size;
-		ntfs_inode *bni;
-		INDEX_ROOT *ir;
-		u8 *ir_end, *index_end;
-
-		/* It is a directory, find index root attribute. */
-		ntfs_attr_reinit_search_ctx(ctx);
-		err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE,
-				0, NULL, 0, ctx);
-		if (unlikely(err)) {
-			if (err == -ENOENT) {
-				// FIXME: File is corrupt! Hot-fix with empty
-				// index root attribute if recovery option is
-				// set.
-				ntfs_error(vi->i_sb, "$INDEX_ROOT attribute "
-						"is missing.");
-			}
-			goto unm_err_out;
-		}
-		a = ctx->attr;
-		/* Set up the state. */
-		if (unlikely(a->non_resident)) {
-			ntfs_error(vol->sb, "$INDEX_ROOT attribute is not "
-					"resident.");
-			goto unm_err_out;
-		}
-		/* Ensure the attribute name is placed before the value. */
-		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
-				le16_to_cpu(a->data.resident.value_offset)))) {
-			ntfs_error(vol->sb, "$INDEX_ROOT attribute name is "
-					"placed after the attribute value.");
-			goto unm_err_out;
-		}
-		/*
-		 * Compressed/encrypted index root just means that the newly
-		 * created files in that directory should be created compressed/
-		 * encrypted. However index root cannot be both compressed and
-		 * encrypted.
-		 */
-		if (a->flags & ATTR_COMPRESSION_MASK)
-			NInoSetCompressed(ni);
-		if (a->flags & ATTR_IS_ENCRYPTED) {
-			if (a->flags & ATTR_COMPRESSION_MASK) {
-				ntfs_error(vi->i_sb, "Found encrypted and "
-						"compressed attribute.");
-				goto unm_err_out;
-			}
-			NInoSetEncrypted(ni);
-		}
-		if (a->flags & ATTR_IS_SPARSE)
-			NInoSetSparse(ni);
-		ir = (INDEX_ROOT*)((u8*)a +
-				le16_to_cpu(a->data.resident.value_offset));
-		ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
-		if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
-			ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
-					"corrupt.");
-			goto unm_err_out;
-		}
-		index_end = (u8*)&ir->index +
-				le32_to_cpu(ir->index.index_length);
-		if (index_end > ir_end) {
-			ntfs_error(vi->i_sb, "Directory index is corrupt.");
-			goto unm_err_out;
-		}
-		if (ir->type != AT_FILE_NAME) {
-			ntfs_error(vi->i_sb, "Indexed attribute is not "
-					"$FILE_NAME.");
-			goto unm_err_out;
-		}
-		if (ir->collation_rule != COLLATION_FILE_NAME) {
-			ntfs_error(vi->i_sb, "Index collation rule is not "
-					"COLLATION_FILE_NAME.");
-			goto unm_err_out;
-		}
-		ni->itype.index.collation_rule = ir->collation_rule;
-		ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
-		if (ni->itype.index.block_size &
-				(ni->itype.index.block_size - 1)) {
-			ntfs_error(vi->i_sb, "Index block size (%u) is not a "
-					"power of two.",
-					ni->itype.index.block_size);
-			goto unm_err_out;
-		}
-		if (ni->itype.index.block_size > PAGE_SIZE) {
-			ntfs_error(vi->i_sb, "Index block size (%u) > "
-					"PAGE_SIZE (%ld) is not "
-					"supported.  Sorry.",
-					ni->itype.index.block_size,
-					PAGE_SIZE);
-			err = -EOPNOTSUPP;
-			goto unm_err_out;
-		}
-		if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
-			ntfs_error(vi->i_sb, "Index block size (%u) < "
-					"NTFS_BLOCK_SIZE (%i) is not "
-					"supported.  Sorry.",
-					ni->itype.index.block_size,
-					NTFS_BLOCK_SIZE);
-			err = -EOPNOTSUPP;
-			goto unm_err_out;
-		}
-		ni->itype.index.block_size_bits =
-				ffs(ni->itype.index.block_size) - 1;
-		/* Determine the size of a vcn in the directory index. */
-		if (vol->cluster_size <= ni->itype.index.block_size) {
-			ni->itype.index.vcn_size = vol->cluster_size;
-			ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
-		} else {
-			ni->itype.index.vcn_size = vol->sector_size;
-			ni->itype.index.vcn_size_bits = vol->sector_size_bits;
-		}
-
-		/* Setup the index allocation attribute, even if not present. */
-		NInoSetMstProtected(ni);
-		ni->type = AT_INDEX_ALLOCATION;
-		ni->name = I30;
-		ni->name_len = 4;
-
-		if (!(ir->index.flags & LARGE_INDEX)) {
-			/* No index allocation. */
-			vi->i_size = ni->initialized_size =
-					ni->allocated_size = 0;
-			/* We are done with the mft record, so we release it. */
-			ntfs_attr_put_search_ctx(ctx);
-			unmap_mft_record(ni);
-			m = NULL;
-			ctx = NULL;
-			goto skip_large_dir_stuff;
-		} /* LARGE_INDEX: Index allocation present. Setup state. */
-		NInoSetIndexAllocPresent(ni);
-		/* Find index allocation attribute. */
-		ntfs_attr_reinit_search_ctx(ctx);
-		err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4,
-				CASE_SENSITIVE, 0, NULL, 0, ctx);
-		if (unlikely(err)) {
-			if (err == -ENOENT)
-				ntfs_error(vi->i_sb, "$INDEX_ALLOCATION "
-						"attribute is not present but "
-						"$INDEX_ROOT indicated it is.");
-			else
-				ntfs_error(vi->i_sb, "Failed to lookup "
-						"$INDEX_ALLOCATION "
-						"attribute.");
-			goto unm_err_out;
-		}
-		a = ctx->attr;
-		if (!a->non_resident) {
-			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
-					"is resident.");
-			goto unm_err_out;
-		}
-		/*
-		 * Ensure the attribute name is placed before the mapping pairs
-		 * array.
-		 */
-		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
-				le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset)))) {
-			ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name "
-					"is placed after the mapping pairs "
-					"array.");
-			goto unm_err_out;
-		}
-		if (a->flags & ATTR_IS_ENCRYPTED) {
-			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
-					"is encrypted.");
-			goto unm_err_out;
-		}
-		if (a->flags & ATTR_IS_SPARSE) {
-			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
-					"is sparse.");
-			goto unm_err_out;
-		}
-		if (a->flags & ATTR_COMPRESSION_MASK) {
-			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
-					"is compressed.");
-			goto unm_err_out;
-		}
-		if (a->data.non_resident.lowest_vcn) {
-			ntfs_error(vi->i_sb, "First extent of "
-					"$INDEX_ALLOCATION attribute has non "
-					"zero lowest_vcn.");
-			goto unm_err_out;
-		}
-		vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
-		ni->initialized_size = sle64_to_cpu(
-				a->data.non_resident.initialized_size);
-		ni->allocated_size = sle64_to_cpu(
-				a->data.non_resident.allocated_size);
-		/*
-		 * We are done with the mft record, so we release it. Otherwise
-		 * we would deadlock in ntfs_attr_iget().
-		 */
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(ni);
-		m = NULL;
-		ctx = NULL;
-		/* Get the index bitmap attribute inode. */
-		bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4);
-		if (IS_ERR(bvi)) {
-			ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
-			err = PTR_ERR(bvi);
-			goto unm_err_out;
-		}
-		bni = NTFS_I(bvi);
-		if (NInoCompressed(bni) || NInoEncrypted(bni) ||
-				NInoSparse(bni)) {
-			ntfs_error(vi->i_sb, "$BITMAP attribute is compressed "
-					"and/or encrypted and/or sparse.");
-			goto iput_unm_err_out;
-		}
-		/* Consistency check bitmap size vs. index allocation size. */
-		bvi_size = i_size_read(bvi);
-		if ((bvi_size << 3) < (vi->i_size >>
-				ni->itype.index.block_size_bits)) {
-			ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) "
-					"for index allocation (0x%llx).",
-					bvi_size << 3, vi->i_size);
-			goto iput_unm_err_out;
-		}
-		/* No longer need the bitmap attribute inode. */
-		iput(bvi);
-skip_large_dir_stuff:
-		/* Setup the operations for this inode. */
-		vi->i_op = &ntfs_dir_inode_ops;
-		vi->i_fop = &ntfs_dir_ops;
-		vi->i_mapping->a_ops = &ntfs_mst_aops;
-	} else {
-		/* It is a file. */
-		ntfs_attr_reinit_search_ctx(ctx);
-
-		/* Setup the data attribute, even if not present. */
-		ni->type = AT_DATA;
-		ni->name = NULL;
-		ni->name_len = 0;
-
-		/* Find first extent of the unnamed data attribute. */
-		err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx);
-		if (unlikely(err)) {
-			vi->i_size = ni->initialized_size =
-					ni->allocated_size = 0;
-			if (err != -ENOENT) {
-				ntfs_error(vi->i_sb, "Failed to lookup $DATA "
-						"attribute.");
-				goto unm_err_out;
-			}
-			/*
-			 * FILE_Secure does not have an unnamed $DATA
-			 * attribute, so we special case it here.
-			 */
-			if (vi->i_ino == FILE_Secure)
-				goto no_data_attr_special_case;
-			/*
-			 * Most if not all the system files in the $Extend
-			 * system directory do not have unnamed data
-			 * attributes so we need to check if the parent
-			 * directory of the file is FILE_Extend and if it is
-			 * ignore this error. To do this we need to get the
-			 * name of this inode from the mft record as the name
-			 * contains the back reference to the parent directory.
-			 */
-			if (ntfs_is_extended_system_file(ctx) > 0)
-				goto no_data_attr_special_case;
-			// FIXME: File is corrupt! Hot-fix with empty data
-			// attribute if recovery option is set.
-			ntfs_error(vi->i_sb, "$DATA attribute is missing.");
-			goto unm_err_out;
-		}
-		a = ctx->attr;
-		/* Setup the state. */
-		if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
-			if (a->flags & ATTR_COMPRESSION_MASK) {
-				NInoSetCompressed(ni);
-				if (vol->cluster_size > 4096) {
-					ntfs_error(vi->i_sb, "Found "
-							"compressed data but "
-							"compression is "
-							"disabled due to "
-							"cluster size (%i) > "
-							"4kiB.",
-							vol->cluster_size);
-					goto unm_err_out;
-				}
-				if ((a->flags & ATTR_COMPRESSION_MASK)
-						!= ATTR_IS_COMPRESSED) {
-					ntfs_error(vi->i_sb, "Found unknown "
-							"compression method "
-							"or corrupt file.");
-					goto unm_err_out;
-				}
-			}
-			if (a->flags & ATTR_IS_SPARSE)
-				NInoSetSparse(ni);
-		}
-		if (a->flags & ATTR_IS_ENCRYPTED) {
-			if (NInoCompressed(ni)) {
-				ntfs_error(vi->i_sb, "Found encrypted and "
-						"compressed data.");
-				goto unm_err_out;
-			}
-			NInoSetEncrypted(ni);
-		}
-		if (a->non_resident) {
-			NInoSetNonResident(ni);
-			if (NInoCompressed(ni) || NInoSparse(ni)) {
-				if (NInoCompressed(ni) && a->data.non_resident.
-						compression_unit != 4) {
-					ntfs_error(vi->i_sb, "Found "
-							"non-standard "
-							"compression unit (%u "
-							"instead of 4).  "
-							"Cannot handle this.",
-							a->data.non_resident.
-							compression_unit);
-					err = -EOPNOTSUPP;
-					goto unm_err_out;
-				}
-				if (a->data.non_resident.compression_unit) {
-					ni->itype.compressed.block_size = 1U <<
-							(a->data.non_resident.
-							compression_unit +
-							vol->cluster_size_bits);
-					ni->itype.compressed.block_size_bits =
-							ffs(ni->itype.
-							compressed.
-							block_size) - 1;
-					ni->itype.compressed.block_clusters =
-							1U << a->data.
-							non_resident.
-							compression_unit;
-				} else {
-					ni->itype.compressed.block_size = 0;
-					ni->itype.compressed.block_size_bits =
-							0;
-					ni->itype.compressed.block_clusters =
-							0;
-				}
-				ni->itype.compressed.size = sle64_to_cpu(
-						a->data.non_resident.
-						compressed_size);
-			}
-			if (a->data.non_resident.lowest_vcn) {
-				ntfs_error(vi->i_sb, "First extent of $DATA "
-						"attribute has non zero "
-						"lowest_vcn.");
-				goto unm_err_out;
-			}
-			vi->i_size = sle64_to_cpu(
-					a->data.non_resident.data_size);
-			ni->initialized_size = sle64_to_cpu(
-					a->data.non_resident.initialized_size);
-			ni->allocated_size = sle64_to_cpu(
-					a->data.non_resident.allocated_size);
-		} else { /* Resident attribute. */
-			vi->i_size = ni->initialized_size = le32_to_cpu(
-					a->data.resident.value_length);
-			ni->allocated_size = le32_to_cpu(a->length) -
-					le16_to_cpu(
-					a->data.resident.value_offset);
-			if (vi->i_size > ni->allocated_size) {
-				ntfs_error(vi->i_sb, "Resident data attribute "
-						"is corrupt (size exceeds "
-						"allocation).");
-				goto unm_err_out;
-			}
-		}
-no_data_attr_special_case:
-		/* We are done with the mft record, so we release it. */
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(ni);
-		m = NULL;
-		ctx = NULL;
-		/* Setup the operations for this inode. */
-		vi->i_op = &ntfs_file_inode_ops;
-		vi->i_fop = &ntfs_file_ops;
-		vi->i_mapping->a_ops = &ntfs_normal_aops;
-		if (NInoMstProtected(ni))
-			vi->i_mapping->a_ops = &ntfs_mst_aops;
-		else if (NInoCompressed(ni))
-			vi->i_mapping->a_ops = &ntfs_compressed_aops;
-	}
-	/*
-	 * The number of 512-byte blocks used on disk (for stat). This is in so
-	 * far inaccurate as it doesn't account for any named streams or other
-	 * special non-resident attributes, but that is how Windows works, too,
-	 * so we are at least consistent with Windows, if not entirely
-	 * consistent with the Linux Way. Doing it the Linux Way would cause a
-	 * significant slowdown as it would involve iterating over all
-	 * attributes in the mft record and adding the allocated/compressed
-	 * sizes of all non-resident attributes present to give us the Linux
-	 * correct size that should go into i_blocks (after division by 512).
-	 */
-	if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni)))
-		vi->i_blocks = ni->itype.compressed.size >> 9;
-	else
-		vi->i_blocks = ni->allocated_size >> 9;
-	ntfs_debug("Done.");
-	return 0;
-iput_unm_err_out:
-	iput(bvi);
-unm_err_out:
-	if (!err)
-		err = -EIO;
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(ni);
-err_out:
-	ntfs_error(vol->sb, "Failed with error code %i.  Marking corrupt "
-			"inode 0x%lx as bad.  Run chkdsk.", err, vi->i_ino);
-	make_bad_inode(vi);
-	if (err != -EOPNOTSUPP && err != -ENOMEM)
-		NVolSetErrors(vol);
-	return err;
-}
-
-/**
- * ntfs_read_locked_attr_inode - read an attribute inode from its base inode
- * @base_vi:	base inode
- * @vi:		attribute inode to read
- *
- * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the
- * attribute inode described by @vi into memory from the base mft record
- * described by @base_ni.
- *
- * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for
- * reading and looks up the attribute described by @vi before setting up the
- * necessary fields in @vi as well as initializing the ntfs inode.
- *
- * Q: What locks are held when the function is called?
- * A: i_state has I_NEW set, hence the inode is locked, also
- *    i_count is set to 1, so it is not going to go away
- *
- * Return 0 on success and -errno on error.  In the error case, the inode will
- * have had make_bad_inode() executed on it.
- *
- * Note this cannot be called for AT_INDEX_ALLOCATION.
- */
-static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
-{
-	ntfs_volume *vol = NTFS_SB(vi->i_sb);
-	ntfs_inode *ni, *base_ni;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx;
-	int err = 0;
-
-	ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
-
-	ntfs_init_big_inode(vi);
-
-	ni	= NTFS_I(vi);
-	base_ni = NTFS_I(base_vi);
-
-	/* Just mirror the values from the base inode. */
-	vi->i_uid	= base_vi->i_uid;
-	vi->i_gid	= base_vi->i_gid;
-	set_nlink(vi, base_vi->i_nlink);
-	inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
-	inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
-	inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
-	vi->i_generation = ni->seq_no = base_ni->seq_no;
-
-	/* Set inode type to zero but preserve permissions. */
-	vi->i_mode	= base_vi->i_mode & ~S_IFMT;
-
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (!ctx) {
-		err = -ENOMEM;
-		goto unm_err_out;
-	}
-	/* Find the attribute. */
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err))
-		goto unm_err_out;
-	a = ctx->attr;
-	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
-		if (a->flags & ATTR_COMPRESSION_MASK) {
-			NInoSetCompressed(ni);
-			if ((ni->type != AT_DATA) || (ni->type == AT_DATA &&
-					ni->name_len)) {
-				ntfs_error(vi->i_sb, "Found compressed "
-						"non-data or named data "
-						"attribute.  Please report "
-						"you saw this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net");
-				goto unm_err_out;
-			}
-			if (vol->cluster_size > 4096) {
-				ntfs_error(vi->i_sb, "Found compressed "
-						"attribute but compression is "
-						"disabled due to cluster size "
-						"(%i) > 4kiB.",
-						vol->cluster_size);
-				goto unm_err_out;
-			}
-			if ((a->flags & ATTR_COMPRESSION_MASK) !=
-					ATTR_IS_COMPRESSED) {
-				ntfs_error(vi->i_sb, "Found unknown "
-						"compression method.");
-				goto unm_err_out;
-			}
-		}
-		/*
-		 * The compressed/sparse flag set in an index root just means
-		 * to compress all files.
-		 */
-		if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
-			ntfs_error(vi->i_sb, "Found mst protected attribute "
-					"but the attribute is %s.  Please "
-					"report you saw this message to "
-					"linux-ntfs-dev@lists.sourceforge.net",
-					NInoCompressed(ni) ? "compressed" :
-					"sparse");
-			goto unm_err_out;
-		}
-		if (a->flags & ATTR_IS_SPARSE)
-			NInoSetSparse(ni);
-	}
-	if (a->flags & ATTR_IS_ENCRYPTED) {
-		if (NInoCompressed(ni)) {
-			ntfs_error(vi->i_sb, "Found encrypted and compressed "
-					"data.");
-			goto unm_err_out;
-		}
-		/*
-		 * The encryption flag set in an index root just means to
-		 * encrypt all files.
-		 */
-		if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
-			ntfs_error(vi->i_sb, "Found mst protected attribute "
-					"but the attribute is encrypted.  "
-					"Please report you saw this message "
-					"to linux-ntfs-dev@lists.sourceforge."
-					"net");
-			goto unm_err_out;
-		}
-		if (ni->type != AT_DATA) {
-			ntfs_error(vi->i_sb, "Found encrypted non-data "
-					"attribute.");
-			goto unm_err_out;
-		}
-		NInoSetEncrypted(ni);
-	}
-	if (!a->non_resident) {
-		/* Ensure the attribute name is placed before the value. */
-		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
-				le16_to_cpu(a->data.resident.value_offset)))) {
-			ntfs_error(vol->sb, "Attribute name is placed after "
-					"the attribute value.");
-			goto unm_err_out;
-		}
-		if (NInoMstProtected(ni)) {
-			ntfs_error(vi->i_sb, "Found mst protected attribute "
-					"but the attribute is resident.  "
-					"Please report you saw this message to "
-					"linux-ntfs-dev@lists.sourceforge.net");
-			goto unm_err_out;
-		}
-		vi->i_size = ni->initialized_size = le32_to_cpu(
-				a->data.resident.value_length);
-		ni->allocated_size = le32_to_cpu(a->length) -
-				le16_to_cpu(a->data.resident.value_offset);
-		if (vi->i_size > ni->allocated_size) {
-			ntfs_error(vi->i_sb, "Resident attribute is corrupt "
-					"(size exceeds allocation).");
-			goto unm_err_out;
-		}
-	} else {
-		NInoSetNonResident(ni);
-		/*
-		 * Ensure the attribute name is placed before the mapping pairs
-		 * array.
-		 */
-		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
-				le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset)))) {
-			ntfs_error(vol->sb, "Attribute name is placed after "
-					"the mapping pairs array.");
-			goto unm_err_out;
-		}
-		if (NInoCompressed(ni) || NInoSparse(ni)) {
-			if (NInoCompressed(ni) && a->data.non_resident.
-					compression_unit != 4) {
-				ntfs_error(vi->i_sb, "Found non-standard "
-						"compression unit (%u instead "
-						"of 4).  Cannot handle this.",
-						a->data.non_resident.
-						compression_unit);
-				err = -EOPNOTSUPP;
-				goto unm_err_out;
-			}
-			if (a->data.non_resident.compression_unit) {
-				ni->itype.compressed.block_size = 1U <<
-						(a->data.non_resident.
-						compression_unit +
-						vol->cluster_size_bits);
-				ni->itype.compressed.block_size_bits =
-						ffs(ni->itype.compressed.
-						block_size) - 1;
-				ni->itype.compressed.block_clusters = 1U <<
-						a->data.non_resident.
-						compression_unit;
-			} else {
-				ni->itype.compressed.block_size = 0;
-				ni->itype.compressed.block_size_bits = 0;
-				ni->itype.compressed.block_clusters = 0;
-			}
-			ni->itype.compressed.size = sle64_to_cpu(
-					a->data.non_resident.compressed_size);
-		}
-		if (a->data.non_resident.lowest_vcn) {
-			ntfs_error(vi->i_sb, "First extent of attribute has "
-					"non-zero lowest_vcn.");
-			goto unm_err_out;
-		}
-		vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
-		ni->initialized_size = sle64_to_cpu(
-				a->data.non_resident.initialized_size);
-		ni->allocated_size = sle64_to_cpu(
-				a->data.non_resident.allocated_size);
-	}
-	vi->i_mapping->a_ops = &ntfs_normal_aops;
-	if (NInoMstProtected(ni))
-		vi->i_mapping->a_ops = &ntfs_mst_aops;
-	else if (NInoCompressed(ni))
-		vi->i_mapping->a_ops = &ntfs_compressed_aops;
-	if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT)
-		vi->i_blocks = ni->itype.compressed.size >> 9;
-	else
-		vi->i_blocks = ni->allocated_size >> 9;
-	/*
-	 * Make sure the base inode does not go away and attach it to the
-	 * attribute inode.
-	 */
-	igrab(base_vi);
-	ni->ext.base_ntfs_ino = base_ni;
-	ni->nr_extents = -1;
-
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-
-	ntfs_debug("Done.");
-	return 0;
-
-unm_err_out:
-	if (!err)
-		err = -EIO;
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-err_out:
-	ntfs_error(vol->sb, "Failed with error code %i while reading attribute "
-			"inode (mft_no 0x%lx, type 0x%x, name_len %i).  "
-			"Marking corrupt inode and base inode 0x%lx as bad.  "
-			"Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len,
-			base_vi->i_ino);
-	make_bad_inode(vi);
-	if (err != -ENOMEM)
-		NVolSetErrors(vol);
-	return err;
-}
-
-/**
- * ntfs_read_locked_index_inode - read an index inode from its base inode
- * @base_vi:	base inode
- * @vi:		index inode to read
- *
- * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the
- * index inode described by @vi into memory from the base mft record described
- * by @base_ni.
- *
- * ntfs_read_locked_index_inode() maps, pins and locks the base inode for
- * reading and looks up the attributes relating to the index described by @vi
- * before setting up the necessary fields in @vi as well as initializing the
- * ntfs inode.
- *
- * Note, index inodes are essentially attribute inodes (NInoAttr() is true)
- * with the attribute type set to AT_INDEX_ALLOCATION.  Apart from that, they
- * are setup like directory inodes since directories are a special case of
- * indices ao they need to be treated in much the same way.  Most importantly,
- * for small indices the index allocation attribute might not actually exist.
- * However, the index root attribute always exists but this does not need to
- * have an inode associated with it and this is why we define a new inode type
- * index.  Also, like for directories, we need to have an attribute inode for
- * the bitmap attribute corresponding to the index allocation attribute and we
- * can store this in the appropriate field of the inode, just like we do for
- * normal directory inodes.
- *
- * Q: What locks are held when the function is called?
- * A: i_state has I_NEW set, hence the inode is locked, also
- *    i_count is set to 1, so it is not going to go away
- *
- * Return 0 on success and -errno on error.  In the error case, the inode will
- * have had make_bad_inode() executed on it.
- */
-static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
-{
-	loff_t bvi_size;
-	ntfs_volume *vol = NTFS_SB(vi->i_sb);
-	ntfs_inode *ni, *base_ni, *bni;
-	struct inode *bvi;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx;
-	INDEX_ROOT *ir;
-	u8 *ir_end, *index_end;
-	int err = 0;
-
-	ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
-	ntfs_init_big_inode(vi);
-	ni	= NTFS_I(vi);
-	base_ni = NTFS_I(base_vi);
-	/* Just mirror the values from the base inode. */
-	vi->i_uid	= base_vi->i_uid;
-	vi->i_gid	= base_vi->i_gid;
-	set_nlink(vi, base_vi->i_nlink);
-	inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
-	inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
-	inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
-	vi->i_generation = ni->seq_no = base_ni->seq_no;
-	/* Set inode type to zero but preserve permissions. */
-	vi->i_mode	= base_vi->i_mode & ~S_IFMT;
-	/* Map the mft record for the base inode. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (!ctx) {
-		err = -ENOMEM;
-		goto unm_err_out;
-	}
-	/* Find the index root attribute. */
-	err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
-					"missing.");
-		goto unm_err_out;
-	}
-	a = ctx->attr;
-	/* Set up the state. */
-	if (unlikely(a->non_resident)) {
-		ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident.");
-		goto unm_err_out;
-	}
-	/* Ensure the attribute name is placed before the value. */
-	if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
-			le16_to_cpu(a->data.resident.value_offset)))) {
-		ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed "
-				"after the attribute value.");
-		goto unm_err_out;
-	}
-	/*
-	 * Compressed/encrypted/sparse index root is not allowed, except for
-	 * directories of course but those are not dealt with here.
-	 */
-	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
-			ATTR_IS_SPARSE)) {
-		ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index "
-				"root attribute.");
-		goto unm_err_out;
-	}
-	ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset));
-	ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
-	if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
-		ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt.");
-		goto unm_err_out;
-	}
-	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
-	if (index_end > ir_end) {
-		ntfs_error(vi->i_sb, "Index is corrupt.");
-		goto unm_err_out;
-	}
-	if (ir->type) {
-		ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).",
-				le32_to_cpu(ir->type));
-		goto unm_err_out;
-	}
-	ni->itype.index.collation_rule = ir->collation_rule;
-	ntfs_debug("Index collation rule is 0x%x.",
-			le32_to_cpu(ir->collation_rule));
-	ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
-	if (!is_power_of_2(ni->itype.index.block_size)) {
-		ntfs_error(vi->i_sb, "Index block size (%u) is not a power of "
-				"two.", ni->itype.index.block_size);
-		goto unm_err_out;
-	}
-	if (ni->itype.index.block_size > PAGE_SIZE) {
-		ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_SIZE "
-				"(%ld) is not supported.  Sorry.",
-				ni->itype.index.block_size, PAGE_SIZE);
-		err = -EOPNOTSUPP;
-		goto unm_err_out;
-	}
-	if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
-		ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE "
-				"(%i) is not supported.  Sorry.",
-				ni->itype.index.block_size, NTFS_BLOCK_SIZE);
-		err = -EOPNOTSUPP;
-		goto unm_err_out;
-	}
-	ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1;
-	/* Determine the size of a vcn in the index. */
-	if (vol->cluster_size <= ni->itype.index.block_size) {
-		ni->itype.index.vcn_size = vol->cluster_size;
-		ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
-	} else {
-		ni->itype.index.vcn_size = vol->sector_size;
-		ni->itype.index.vcn_size_bits = vol->sector_size_bits;
-	}
-	/* Check for presence of index allocation attribute. */
-	if (!(ir->index.flags & LARGE_INDEX)) {
-		/* No index allocation. */
-		vi->i_size = ni->initialized_size = ni->allocated_size = 0;
-		/* We are done with the mft record, so we release it. */
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(base_ni);
-		m = NULL;
-		ctx = NULL;
-		goto skip_large_index_stuff;
-	} /* LARGE_INDEX:  Index allocation present.  Setup state. */
-	NInoSetIndexAllocPresent(ni);
-	/* Find index allocation attribute. */
-	ntfs_attr_reinit_search_ctx(ctx);
-	err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
-					"not present but $INDEX_ROOT "
-					"indicated it is.");
-		else
-			ntfs_error(vi->i_sb, "Failed to lookup "
-					"$INDEX_ALLOCATION attribute.");
-		goto unm_err_out;
-	}
-	a = ctx->attr;
-	if (!a->non_resident) {
-		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
-				"resident.");
-		goto unm_err_out;
-	}
-	/*
-	 * Ensure the attribute name is placed before the mapping pairs array.
-	 */
-	if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
-			le16_to_cpu(
-			a->data.non_resident.mapping_pairs_offset)))) {
-		ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is "
-				"placed after the mapping pairs array.");
-		goto unm_err_out;
-	}
-	if (a->flags & ATTR_IS_ENCRYPTED) {
-		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
-				"encrypted.");
-		goto unm_err_out;
-	}
-	if (a->flags & ATTR_IS_SPARSE) {
-		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse.");
-		goto unm_err_out;
-	}
-	if (a->flags & ATTR_COMPRESSION_MASK) {
-		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
-				"compressed.");
-		goto unm_err_out;
-	}
-	if (a->data.non_resident.lowest_vcn) {
-		ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION "
-				"attribute has non zero lowest_vcn.");
-		goto unm_err_out;
-	}
-	vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
-	ni->initialized_size = sle64_to_cpu(
-			a->data.non_resident.initialized_size);
-	ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size);
-	/*
-	 * We are done with the mft record, so we release it.  Otherwise
-	 * we would deadlock in ntfs_attr_iget().
-	 */
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	m = NULL;
-	ctx = NULL;
-	/* Get the index bitmap attribute inode. */
-	bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len);
-	if (IS_ERR(bvi)) {
-		ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
-		err = PTR_ERR(bvi);
-		goto unm_err_out;
-	}
-	bni = NTFS_I(bvi);
-	if (NInoCompressed(bni) || NInoEncrypted(bni) ||
-			NInoSparse(bni)) {
-		ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or "
-				"encrypted and/or sparse.");
-		goto iput_unm_err_out;
-	}
-	/* Consistency check bitmap size vs. index allocation size. */
-	bvi_size = i_size_read(bvi);
-	if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) {
-		ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for "
-				"index allocation (0x%llx).", bvi_size << 3,
-				vi->i_size);
-		goto iput_unm_err_out;
-	}
-	iput(bvi);
-skip_large_index_stuff:
-	/* Setup the operations for this index inode. */
-	vi->i_mapping->a_ops = &ntfs_mst_aops;
-	vi->i_blocks = ni->allocated_size >> 9;
-	/*
-	 * Make sure the base inode doesn't go away and attach it to the
-	 * index inode.
-	 */
-	igrab(base_vi);
-	ni->ext.base_ntfs_ino = base_ni;
-	ni->nr_extents = -1;
-
-	ntfs_debug("Done.");
-	return 0;
-iput_unm_err_out:
-	iput(bvi);
-unm_err_out:
-	if (!err)
-		err = -EIO;
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-err_out:
-	ntfs_error(vi->i_sb, "Failed with error code %i while reading index "
-			"inode (mft_no 0x%lx, name_len %i.", err, vi->i_ino,
-			ni->name_len);
-	make_bad_inode(vi);
-	if (err != -EOPNOTSUPP && err != -ENOMEM)
-		NVolSetErrors(vol);
-	return err;
-}
-
-/*
- * The MFT inode has special locking, so teach the lock validator
- * about this by splitting off the locking rules of the MFT from
- * the locking rules of other inodes. The MFT inode can never be
- * accessed from the VFS side (or even internally), only by the
- * map_mft functions.
- */
-static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key;
-
-/**
- * ntfs_read_inode_mount - special read_inode for mount time use only
- * @vi:		inode to read
- *
- * Read inode FILE_MFT at mount time, only called with super_block lock
- * held from within the read_super() code path.
- *
- * This function exists because when it is called the page cache for $MFT/$DATA
- * is not initialized and hence we cannot get at the contents of mft records
- * by calling map_mft_record*().
- *
- * Further it needs to cope with the circular references problem, i.e. cannot
- * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because
- * we do not know where the other extent mft records are yet and again, because
- * we cannot call map_mft_record*() yet.  Obviously this applies only when an
- * attribute list is actually present in $MFT inode.
- *
- * We solve these problems by starting with the $DATA attribute before anything
- * else and iterating using ntfs_attr_lookup($DATA) over all extents.  As each
- * extent is found, we ntfs_mapping_pairs_decompress() including the implied
- * ntfs_runlists_merge().  Each step of the iteration necessarily provides
- * sufficient information for the next step to complete.
- *
- * This should work but there are two possible pit falls (see inline comments
- * below), but only time will tell if they are real pits or just smoke...
- */
-int ntfs_read_inode_mount(struct inode *vi)
-{
-	VCN next_vcn, last_vcn, highest_vcn;
-	s64 block;
-	struct super_block *sb = vi->i_sb;
-	ntfs_volume *vol = NTFS_SB(sb);
-	struct buffer_head *bh;
-	ntfs_inode *ni;
-	MFT_RECORD *m = NULL;
-	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx;
-	unsigned int i, nr_blocks;
-	int err;
-
-	ntfs_debug("Entering.");
-
-	/* Initialize the ntfs specific part of @vi. */
-	ntfs_init_big_inode(vi);
-
-	ni = NTFS_I(vi);
-
-	/* Setup the data attribute. It is special as it is mst protected. */
-	NInoSetNonResident(ni);
-	NInoSetMstProtected(ni);
-	NInoSetSparseDisabled(ni);
-	ni->type = AT_DATA;
-	ni->name = NULL;
-	ni->name_len = 0;
-	/*
-	 * This sets up our little cheat allowing us to reuse the async read io
-	 * completion handler for directories.
-	 */
-	ni->itype.index.block_size = vol->mft_record_size;
-	ni->itype.index.block_size_bits = vol->mft_record_size_bits;
-
-	/* Very important! Needed to be able to call map_mft_record*(). */
-	vol->mft_ino = vi;
-
-	/* Allocate enough memory to read the first mft record. */
-	if (vol->mft_record_size > 64 * 1024) {
-		ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).",
-				vol->mft_record_size);
-		goto err_out;
-	}
-	i = vol->mft_record_size;
-	if (i < sb->s_blocksize)
-		i = sb->s_blocksize;
-	m = (MFT_RECORD*)ntfs_malloc_nofs(i);
-	if (!m) {
-		ntfs_error(sb, "Failed to allocate buffer for $MFT record 0.");
-		goto err_out;
-	}
-
-	/* Determine the first block of the $MFT/$DATA attribute. */
-	block = vol->mft_lcn << vol->cluster_size_bits >>
-			sb->s_blocksize_bits;
-	nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits;
-	if (!nr_blocks)
-		nr_blocks = 1;
-
-	/* Load $MFT/$DATA's first mft record. */
-	for (i = 0; i < nr_blocks; i++) {
-		bh = sb_bread(sb, block++);
-		if (!bh) {
-			ntfs_error(sb, "Device read failed.");
-			goto err_out;
-		}
-		memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data,
-				sb->s_blocksize);
-		brelse(bh);
-	}
-
-	if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) {
-		ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.",
-				le32_to_cpu(m->bytes_allocated), vol->mft_record_size);
-		goto err_out;
-	}
-
-	/* Apply the mst fixups. */
-	if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) {
-		/* FIXME: Try to use the $MFTMirr now. */
-		ntfs_error(sb, "MST fixup failed. $MFT is corrupt.");
-		goto err_out;
-	}
-
-	/* Sanity check offset to the first attribute */
-	if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) {
-		ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.",
-			       le16_to_cpu(m->attrs_offset));
-		goto err_out;
-	}
-
-	/* Need this to sanity check attribute list references to $MFT. */
-	vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
-
-	/* Provides read_folio() for map_mft_record(). */
-	vi->i_mapping->a_ops = &ntfs_mst_aops;
-
-	ctx = ntfs_attr_get_search_ctx(ni, m);
-	if (!ctx) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-
-	/* Find the attribute list attribute if present. */
-	err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
-	if (err) {
-		if (unlikely(err != -ENOENT)) {
-			ntfs_error(sb, "Failed to lookup attribute list "
-					"attribute. You should run chkdsk.");
-			goto put_err_out;
-		}
-	} else /* if (!err) */ {
-		ATTR_LIST_ENTRY *al_entry, *next_al_entry;
-		u8 *al_end;
-		static const char *es = "  Not allowed.  $MFT is corrupt.  "
-				"You should run chkdsk.";
-
-		ntfs_debug("Attribute list attribute found in $MFT.");
-		NInoSetAttrList(ni);
-		a = ctx->attr;
-		if (a->flags & ATTR_COMPRESSION_MASK) {
-			ntfs_error(sb, "Attribute list attribute is "
-					"compressed.%s", es);
-			goto put_err_out;
-		}
-		if (a->flags & ATTR_IS_ENCRYPTED ||
-				a->flags & ATTR_IS_SPARSE) {
-			if (a->non_resident) {
-				ntfs_error(sb, "Non-resident attribute list "
-						"attribute is encrypted/"
-						"sparse.%s", es);
-				goto put_err_out;
-			}
-			ntfs_warning(sb, "Resident attribute list attribute "
-					"in $MFT system file is marked "
-					"encrypted/sparse which is not true.  "
-					"However, Windows allows this and "
-					"chkdsk does not detect or correct it "
-					"so we will just ignore the invalid "
-					"flags and pretend they are not set.");
-		}
-		/* Now allocate memory for the attribute list. */
-		ni->attr_list_size = (u32)ntfs_attr_size(a);
-		if (!ni->attr_list_size) {
-			ntfs_error(sb, "Attr_list_size is zero");
-			goto put_err_out;
-		}
-		ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
-		if (!ni->attr_list) {
-			ntfs_error(sb, "Not enough memory to allocate buffer "
-					"for attribute list.");
-			goto put_err_out;
-		}
-		if (a->non_resident) {
-			NInoSetAttrListNonResident(ni);
-			if (a->data.non_resident.lowest_vcn) {
-				ntfs_error(sb, "Attribute list has non zero "
-						"lowest_vcn. $MFT is corrupt. "
-						"You should run chkdsk.");
-				goto put_err_out;
-			}
-			/* Setup the runlist. */
-			ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
-					a, NULL);
-			if (IS_ERR(ni->attr_list_rl.rl)) {
-				err = PTR_ERR(ni->attr_list_rl.rl);
-				ni->attr_list_rl.rl = NULL;
-				ntfs_error(sb, "Mapping pairs decompression "
-						"failed with error code %i.",
-						-err);
-				goto put_err_out;
-			}
-			/* Now load the attribute list. */
-			if ((err = load_attribute_list(vol, &ni->attr_list_rl,
-					ni->attr_list, ni->attr_list_size,
-					sle64_to_cpu(a->data.
-					non_resident.initialized_size)))) {
-				ntfs_error(sb, "Failed to load attribute list "
-						"attribute with error code %i.",
-						-err);
-				goto put_err_out;
-			}
-		} else /* if (!ctx.attr->non_resident) */ {
-			if ((u8*)a + le16_to_cpu(
-					a->data.resident.value_offset) +
-					le32_to_cpu(
-					a->data.resident.value_length) >
-					(u8*)ctx->mrec + vol->mft_record_size) {
-				ntfs_error(sb, "Corrupt attribute list "
-						"attribute.");
-				goto put_err_out;
-			}
-			/* Now copy the attribute list. */
-			memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
-					a->data.resident.value_offset),
-					le32_to_cpu(
-					a->data.resident.value_length));
-		}
-		/* The attribute list is now setup in memory. */
-		/*
-		 * FIXME: I don't know if this case is actually possible.
-		 * According to logic it is not possible but I have seen too
-		 * many weird things in MS software to rely on logic... Thus we
-		 * perform a manual search and make sure the first $MFT/$DATA
-		 * extent is in the base inode. If it is not we abort with an
-		 * error and if we ever see a report of this error we will need
-		 * to do some magic in order to have the necessary mft record
-		 * loaded and in the right place in the page cache. But
-		 * hopefully logic will prevail and this never happens...
-		 */
-		al_entry = (ATTR_LIST_ENTRY*)ni->attr_list;
-		al_end = (u8*)al_entry + ni->attr_list_size;
-		for (;; al_entry = next_al_entry) {
-			/* Out of bounds check. */
-			if ((u8*)al_entry < ni->attr_list ||
-					(u8*)al_entry > al_end)
-				goto em_put_err_out;
-			/* Catch the end of the attribute list. */
-			if ((u8*)al_entry == al_end)
-				goto em_put_err_out;
-			if (!al_entry->length)
-				goto em_put_err_out;
-			if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
-					le16_to_cpu(al_entry->length) > al_end)
-				goto em_put_err_out;
-			next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
-					le16_to_cpu(al_entry->length));
-			if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
-				goto em_put_err_out;
-			if (AT_DATA != al_entry->type)
-				continue;
-			/* We want an unnamed attribute. */
-			if (al_entry->name_length)
-				goto em_put_err_out;
-			/* Want the first entry, i.e. lowest_vcn == 0. */
-			if (al_entry->lowest_vcn)
-				goto em_put_err_out;
-			/* First entry has to be in the base mft record. */
-			if (MREF_LE(al_entry->mft_reference) != vi->i_ino) {
-				/* MFT references do not match, logic fails. */
-				ntfs_error(sb, "BUG: The first $DATA extent "
-						"of $MFT is not in the base "
-						"mft record. Please report "
-						"you saw this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net");
-				goto put_err_out;
-			} else {
-				/* Sequence numbers must match. */
-				if (MSEQNO_LE(al_entry->mft_reference) !=
-						ni->seq_no)
-					goto em_put_err_out;
-				/* Got it. All is ok. We can stop now. */
-				break;
-			}
-		}
-	}
-
-	ntfs_attr_reinit_search_ctx(ctx);
-
-	/* Now load all attribute extents. */
-	a = NULL;
-	next_vcn = last_vcn = highest_vcn = 0;
-	while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0,
-			ctx))) {
-		runlist_element *nrl;
-
-		/* Cache the current attribute. */
-		a = ctx->attr;
-		/* $MFT must be non-resident. */
-		if (!a->non_resident) {
-			ntfs_error(sb, "$MFT must be non-resident but a "
-					"resident extent was found. $MFT is "
-					"corrupt. Run chkdsk.");
-			goto put_err_out;
-		}
-		/* $MFT must be uncompressed and unencrypted. */
-		if (a->flags & ATTR_COMPRESSION_MASK ||
-				a->flags & ATTR_IS_ENCRYPTED ||
-				a->flags & ATTR_IS_SPARSE) {
-			ntfs_error(sb, "$MFT must be uncompressed, "
-					"non-sparse, and unencrypted but a "
-					"compressed/sparse/encrypted extent "
-					"was found. $MFT is corrupt. Run "
-					"chkdsk.");
-			goto put_err_out;
-		}
-		/*
-		 * Decompress the mapping pairs array of this extent and merge
-		 * the result into the existing runlist. No need for locking
-		 * as we have exclusive access to the inode at this time and we
-		 * are a mount in progress task, too.
-		 */
-		nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
-		if (IS_ERR(nrl)) {
-			ntfs_error(sb, "ntfs_mapping_pairs_decompress() "
-					"failed with error code %ld.  $MFT is "
-					"corrupt.", PTR_ERR(nrl));
-			goto put_err_out;
-		}
-		ni->runlist.rl = nrl;
-
-		/* Are we in the first extent? */
-		if (!next_vcn) {
-			if (a->data.non_resident.lowest_vcn) {
-				ntfs_error(sb, "First extent of $DATA "
-						"attribute has non zero "
-						"lowest_vcn. $MFT is corrupt. "
-						"You should run chkdsk.");
-				goto put_err_out;
-			}
-			/* Get the last vcn in the $DATA attribute. */
-			last_vcn = sle64_to_cpu(
-					a->data.non_resident.allocated_size)
-					>> vol->cluster_size_bits;
-			/* Fill in the inode size. */
-			vi->i_size = sle64_to_cpu(
-					a->data.non_resident.data_size);
-			ni->initialized_size = sle64_to_cpu(
-					a->data.non_resident.initialized_size);
-			ni->allocated_size = sle64_to_cpu(
-					a->data.non_resident.allocated_size);
-			/*
-			 * Verify the number of mft records does not exceed
-			 * 2^32 - 1.
-			 */
-			if ((vi->i_size >> vol->mft_record_size_bits) >=
-					(1ULL << 32)) {
-				ntfs_error(sb, "$MFT is too big! Aborting.");
-				goto put_err_out;
-			}
-			/*
-			 * We have got the first extent of the runlist for
-			 * $MFT which means it is now relatively safe to call
-			 * the normal ntfs_read_inode() function.
-			 * Complete reading the inode, this will actually
-			 * re-read the mft record for $MFT, this time entering
-			 * it into the page cache with which we complete the
-			 * kick start of the volume. It should be safe to do
-			 * this now as the first extent of $MFT/$DATA is
-			 * already known and we would hope that we don't need
-			 * further extents in order to find the other
-			 * attributes belonging to $MFT. Only time will tell if
-			 * this is really the case. If not we will have to play
-			 * magic at this point, possibly duplicating a lot of
-			 * ntfs_read_inode() at this point. We will need to
-			 * ensure we do enough of its work to be able to call
-			 * ntfs_read_inode() on extents of $MFT/$DATA. But lets
-			 * hope this never happens...
-			 */
-			ntfs_read_locked_inode(vi);
-			if (is_bad_inode(vi)) {
-				ntfs_error(sb, "ntfs_read_inode() of $MFT "
-						"failed. BUG or corrupt $MFT. "
-						"Run chkdsk and if no errors "
-						"are found, please report you "
-						"saw this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net");
-				ntfs_attr_put_search_ctx(ctx);
-				/* Revert to the safe super operations. */
-				ntfs_free(m);
-				return -1;
-			}
-			/*
-			 * Re-initialize some specifics about $MFT's inode as
-			 * ntfs_read_inode() will have set up the default ones.
-			 */
-			/* Set uid and gid to root. */
-			vi->i_uid = GLOBAL_ROOT_UID;
-			vi->i_gid = GLOBAL_ROOT_GID;
-			/* Regular file. No access for anyone. */
-			vi->i_mode = S_IFREG;
-			/* No VFS initiated operations allowed for $MFT. */
-			vi->i_op = &ntfs_empty_inode_ops;
-			vi->i_fop = &ntfs_empty_file_ops;
-		}
-
-		/* Get the lowest vcn for the next extent. */
-		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
-		next_vcn = highest_vcn + 1;
-
-		/* Only one extent or error, which we catch below. */
-		if (next_vcn <= 0)
-			break;
-
-		/* Avoid endless loops due to corruption. */
-		if (next_vcn < sle64_to_cpu(
-				a->data.non_resident.lowest_vcn)) {
-			ntfs_error(sb, "$MFT has corrupt attribute list "
-					"attribute. Run chkdsk.");
-			goto put_err_out;
-		}
-	}
-	if (err != -ENOENT) {
-		ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. "
-				"$MFT is corrupt. Run chkdsk.");
-		goto put_err_out;
-	}
-	if (!a) {
-		ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is "
-				"corrupt. Run chkdsk.");
-		goto put_err_out;
-	}
-	if (highest_vcn && highest_vcn != last_vcn - 1) {
-		ntfs_error(sb, "Failed to load the complete runlist for "
-				"$MFT/$DATA. Driver bug or corrupt $MFT. "
-				"Run chkdsk.");
-		ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx",
-				(unsigned long long)highest_vcn,
-				(unsigned long long)last_vcn - 1);
-		goto put_err_out;
-	}
-	ntfs_attr_put_search_ctx(ctx);
-	ntfs_debug("Done.");
-	ntfs_free(m);
-
-	/*
-	 * Split the locking rules of the MFT inode from the
-	 * locking rules of other inodes:
-	 */
-	lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key);
-	lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key);
-
-	return 0;
-
-em_put_err_out:
-	ntfs_error(sb, "Couldn't find first extent of $DATA attribute in "
-			"attribute list. $MFT is corrupt. Run chkdsk.");
-put_err_out:
-	ntfs_attr_put_search_ctx(ctx);
-err_out:
-	ntfs_error(sb, "Failed. Marking inode as bad.");
-	make_bad_inode(vi);
-	ntfs_free(m);
-	return -1;
-}
-
-static void __ntfs_clear_inode(ntfs_inode *ni)
-{
-	/* Free all alocated memory. */
-	down_write(&ni->runlist.lock);
-	if (ni->runlist.rl) {
-		ntfs_free(ni->runlist.rl);
-		ni->runlist.rl = NULL;
-	}
-	up_write(&ni->runlist.lock);
-
-	if (ni->attr_list) {
-		ntfs_free(ni->attr_list);
-		ni->attr_list = NULL;
-	}
-
-	down_write(&ni->attr_list_rl.lock);
-	if (ni->attr_list_rl.rl) {
-		ntfs_free(ni->attr_list_rl.rl);
-		ni->attr_list_rl.rl = NULL;
-	}
-	up_write(&ni->attr_list_rl.lock);
-
-	if (ni->name_len && ni->name != I30) {
-		/* Catch bugs... */
-		BUG_ON(!ni->name);
-		kfree(ni->name);
-	}
-}
-
-void ntfs_clear_extent_inode(ntfs_inode *ni)
-{
-	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
-
-	BUG_ON(NInoAttr(ni));
-	BUG_ON(ni->nr_extents != -1);
-
-#ifdef NTFS_RW
-	if (NInoDirty(ni)) {
-		if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino)))
-			ntfs_error(ni->vol->sb, "Clearing dirty extent inode!  "
-					"Losing data!  This is a BUG!!!");
-		// FIXME:  Do something!!!
-	}
-#endif /* NTFS_RW */
-
-	__ntfs_clear_inode(ni);
-
-	/* Bye, bye... */
-	ntfs_destroy_extent_inode(ni);
-}
-
-/**
- * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
- * @vi:		vfs inode pending annihilation
- *
- * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode()
- * is called, which deallocates all memory belonging to the NTFS specific part
- * of the inode and returns.
- *
- * If the MFT record is dirty, we commit it before doing anything else.
- */
-void ntfs_evict_big_inode(struct inode *vi)
-{
-	ntfs_inode *ni = NTFS_I(vi);
-
-	truncate_inode_pages_final(&vi->i_data);
-	clear_inode(vi);
-
-#ifdef NTFS_RW
-	if (NInoDirty(ni)) {
-		bool was_bad = (is_bad_inode(vi));
-
-		/* Committing the inode also commits all extent inodes. */
-		ntfs_commit_inode(vi);
-
-		if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) {
-			ntfs_error(vi->i_sb, "Failed to commit dirty inode "
-					"0x%lx.  Losing data!", vi->i_ino);
-			// FIXME:  Do something!!!
-		}
-	}
-#endif /* NTFS_RW */
-
-	/* No need to lock at this stage as no one else has a reference. */
-	if (ni->nr_extents > 0) {
-		int i;
-
-		for (i = 0; i < ni->nr_extents; i++)
-			ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]);
-		kfree(ni->ext.extent_ntfs_inos);
-	}
-
-	__ntfs_clear_inode(ni);
-
-	if (NInoAttr(ni)) {
-		/* Release the base inode if we are holding it. */
-		if (ni->nr_extents == -1) {
-			iput(VFS_I(ni->ext.base_ntfs_ino));
-			ni->nr_extents = 0;
-			ni->ext.base_ntfs_ino = NULL;
-		}
-	}
-	BUG_ON(ni->page);
-	if (!atomic_dec_and_test(&ni->count))
-		BUG();
-	return;
-}
-
-/**
- * ntfs_show_options - show mount options in /proc/mounts
- * @sf:		seq_file in which to write our mount options
- * @root:	root of the mounted tree whose mount options to display
- *
- * Called by the VFS once for each mounted ntfs volume when someone reads
- * /proc/mounts in order to display the NTFS specific mount options of each
- * mount. The mount options of fs specified by @root are written to the seq file
- * @sf and success is returned.
- */
-int ntfs_show_options(struct seq_file *sf, struct dentry *root)
-{
-	ntfs_volume *vol = NTFS_SB(root->d_sb);
-	int i;
-
-	seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid));
-	seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid));
-	if (vol->fmask == vol->dmask)
-		seq_printf(sf, ",umask=0%o", vol->fmask);
-	else {
-		seq_printf(sf, ",fmask=0%o", vol->fmask);
-		seq_printf(sf, ",dmask=0%o", vol->dmask);
-	}
-	seq_printf(sf, ",nls=%s", vol->nls_map->charset);
-	if (NVolCaseSensitive(vol))
-		seq_printf(sf, ",case_sensitive");
-	if (NVolShowSystemFiles(vol))
-		seq_printf(sf, ",show_sys_files");
-	if (!NVolSparseEnabled(vol))
-		seq_printf(sf, ",disable_sparse");
-	for (i = 0; on_errors_arr[i].val; i++) {
-		if (on_errors_arr[i].val & vol->on_errors)
-			seq_printf(sf, ",errors=%s", on_errors_arr[i].str);
-	}
-	seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier);
-	return 0;
-}
-
-#ifdef NTFS_RW
-
-static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
-		"chkdsk.";
-
-/**
- * ntfs_truncate - called when the i_size of an ntfs inode is changed
- * @vi:		inode for which the i_size was changed
- *
- * We only support i_size changes for normal files at present, i.e. not
- * compressed and not encrypted.  This is enforced in ntfs_setattr(), see
- * below.
- *
- * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
- * that the change is allowed.
- *
- * This implies for us that @vi is a file inode rather than a directory, index,
- * or attribute inode as well as that @vi is a base inode.
- *
- * Returns 0 on success or -errno on error.
- *
- * Called with ->i_mutex held.
- */
-int ntfs_truncate(struct inode *vi)
-{
-	s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size;
-	VCN highest_vcn;
-	unsigned long flags;
-	ntfs_inode *base_ni, *ni = NTFS_I(vi);
-	ntfs_volume *vol = ni->vol;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	const char *te = "  Leaving file length out of sync with i_size.";
-	int err, mp_size, size_change, alloc_change;
-
-	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
-	BUG_ON(NInoAttr(ni));
-	BUG_ON(S_ISDIR(vi->i_mode));
-	BUG_ON(NInoMstProtected(ni));
-	BUG_ON(ni->nr_extents < 0);
-retry_truncate:
-	/*
-	 * Lock the runlist for writing and map the mft record to ensure it is
-	 * safe to mess with the attribute runlist and sizes.
-	 */
-	down_write(&ni->runlist.lock);
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
-				"(error code %d).%s", vi->i_ino, err, te);
-		ctx = NULL;
-		m = NULL;
-		goto old_bad_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		ntfs_error(vi->i_sb, "Failed to allocate a search context for "
-				"inode 0x%lx (not enough memory).%s",
-				vi->i_ino, te);
-		err = -ENOMEM;
-		goto old_bad_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT) {
-			ntfs_error(vi->i_sb, "Open attribute is missing from "
-					"mft record.  Inode 0x%lx is corrupt.  "
-					"Run chkdsk.%s", vi->i_ino, te);
-			err = -EIO;
-		} else
-			ntfs_error(vi->i_sb, "Failed to lookup attribute in "
-					"inode 0x%lx (error code %d).%s",
-					vi->i_ino, err, te);
-		goto old_bad_out;
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	/*
-	 * The i_size of the vfs inode is the new size for the attribute value.
-	 */
-	new_size = i_size_read(vi);
-	/* The current size of the attribute value is the old size. */
-	old_size = ntfs_attr_size(a);
-	/* Calculate the new allocated size. */
-	if (NInoNonResident(ni))
-		new_alloc_size = (new_size + vol->cluster_size - 1) &
-				~(s64)vol->cluster_size_mask;
-	else
-		new_alloc_size = (new_size + 7) & ~7;
-	/* The current allocated size is the old allocated size. */
-	read_lock_irqsave(&ni->size_lock, flags);
-	old_alloc_size = ni->allocated_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	/*
-	 * The change in the file size.  This will be 0 if no change, >0 if the
-	 * size is growing, and <0 if the size is shrinking.
-	 */
-	size_change = -1;
-	if (new_size - old_size >= 0) {
-		size_change = 1;
-		if (new_size == old_size)
-			size_change = 0;
-	}
-	/* As above for the allocated size. */
-	alloc_change = -1;
-	if (new_alloc_size - old_alloc_size >= 0) {
-		alloc_change = 1;
-		if (new_alloc_size == old_alloc_size)
-			alloc_change = 0;
-	}
-	/*
-	 * If neither the size nor the allocation are being changed there is
-	 * nothing to do.
-	 */
-	if (!size_change && !alloc_change)
-		goto unm_done;
-	/* If the size is changing, check if new size is allowed in $AttrDef. */
-	if (size_change) {
-		err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
-		if (unlikely(err)) {
-			if (err == -ERANGE) {
-				ntfs_error(vol->sb, "Truncate would cause the "
-						"inode 0x%lx to %simum size "
-						"for its attribute type "
-						"(0x%x).  Aborting truncate.",
-						vi->i_ino,
-						new_size > old_size ? "exceed "
-						"the max" : "go under the min",
-						le32_to_cpu(ni->type));
-				err = -EFBIG;
-			} else {
-				ntfs_error(vol->sb, "Inode 0x%lx has unknown "
-						"attribute type 0x%x.  "
-						"Aborting truncate.",
-						vi->i_ino,
-						le32_to_cpu(ni->type));
-				err = -EIO;
-			}
-			/* Reset the vfs inode size to the old size. */
-			i_size_write(vi, old_size);
-			goto err_out;
-		}
-	}
-	if (NInoCompressed(ni) || NInoEncrypted(ni)) {
-		ntfs_warning(vi->i_sb, "Changes in inode size are not "
-				"supported yet for %s files, ignoring.",
-				NInoCompressed(ni) ? "compressed" :
-				"encrypted");
-		err = -EOPNOTSUPP;
-		goto bad_out;
-	}
-	if (a->non_resident)
-		goto do_non_resident_truncate;
-	BUG_ON(NInoNonResident(ni));
-	/* Resize the attribute record to best fit the new attribute size. */
-	if (new_size < vol->mft_record_size &&
-			!ntfs_resident_attr_value_resize(m, a, new_size)) {
-		/* The resize succeeded! */
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		write_lock_irqsave(&ni->size_lock, flags);
-		/* Update the sizes in the ntfs inode and all is done. */
-		ni->allocated_size = le32_to_cpu(a->length) -
-				le16_to_cpu(a->data.resident.value_offset);
-		/*
-		 * Note ntfs_resident_attr_value_resize() has already done any
-		 * necessary data clearing in the attribute record.  When the
-		 * file is being shrunk vmtruncate() will already have cleared
-		 * the top part of the last partial page, i.e. since this is
-		 * the resident case this is the page with index 0.  However,
-		 * when the file is being expanded, the page cache page data
-		 * between the old data_size, i.e. old_size, and the new_size
-		 * has not been zeroed.  Fortunately, we do not need to zero it
-		 * either since on one hand it will either already be zero due
-		 * to both read_folio and writepage clearing partial page data
-		 * beyond i_size in which case there is nothing to do or in the
-		 * case of the file being mmap()ped at the same time, POSIX
-		 * specifies that the behaviour is unspecified thus we do not
-		 * have to do anything.  This means that in our implementation
-		 * in the rare case that the file is mmap()ped and a write
-		 * occurred into the mmap()ped region just beyond the file size
-		 * and writepage has not yet been called to write out the page
-		 * (which would clear the area beyond the file size) and we now
-		 * extend the file size to incorporate this dirty region
-		 * outside the file size, a write of the page would result in
-		 * this data being written to disk instead of being cleared.
-		 * Given both POSIX and the Linux mmap(2) man page specify that
-		 * this corner case is undefined, we choose to leave it like
-		 * that as this is much simpler for us as we cannot lock the
-		 * relevant page now since we are holding too many ntfs locks
-		 * which would result in a lock reversal deadlock.
-		 */
-		ni->initialized_size = new_size;
-		write_unlock_irqrestore(&ni->size_lock, flags);
-		goto unm_done;
-	}
-	/* If the above resize failed, this must be an attribute extension. */
-	BUG_ON(size_change < 0);
-	/*
-	 * We have to drop all the locks so we can call
-	 * ntfs_attr_make_non_resident().  This could be optimised by try-
-	 * locking the first page cache page and only if that fails dropping
-	 * the locks, locking the page, and redoing all the locking and
-	 * lookups.  While this would be a huge optimisation, it is not worth
-	 * it as this is definitely a slow code path as it only ever can happen
-	 * once for any given file.
-	 */
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-	/*
-	 * Not enough space in the mft record, try to make the attribute
-	 * non-resident and if successful restart the truncation process.
-	 */
-	err = ntfs_attr_make_non_resident(ni, old_size);
-	if (likely(!err))
-		goto retry_truncate;
-	/*
-	 * Could not make non-resident.  If this is due to this not being
-	 * permitted for this attribute type or there not being enough space,
-	 * try to make other attributes non-resident.  Otherwise fail.
-	 */
-	if (unlikely(err != -EPERM && err != -ENOSPC)) {
-		ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute "
-				"type 0x%x, because the conversion from "
-				"resident to non-resident attribute failed "
-				"with error code %i.", vi->i_ino,
-				(unsigned)le32_to_cpu(ni->type), err);
-		if (err != -ENOMEM)
-			err = -EIO;
-		goto conv_err_out;
-	}
-	/* TODO: Not implemented from here, abort. */
-	if (err == -ENOSPC)
-		ntfs_error(vol->sb, "Not enough space in the mft record/on "
-				"disk for the non-resident attribute value.  "
-				"This case is not implemented yet.");
-	else /* if (err == -EPERM) */
-		ntfs_error(vol->sb, "This attribute type may not be "
-				"non-resident.  This case is not implemented "
-				"yet.");
-	err = -EOPNOTSUPP;
-	goto conv_err_out;
-#if 0
-	// TODO: Attempt to make other attributes non-resident.
-	if (!err)
-		goto do_resident_extend;
-	/*
-	 * Both the attribute list attribute and the standard information
-	 * attribute must remain in the base inode.  Thus, if this is one of
-	 * these attributes, we have to try to move other attributes out into
-	 * extent mft records instead.
-	 */
-	if (ni->type == AT_ATTRIBUTE_LIST ||
-			ni->type == AT_STANDARD_INFORMATION) {
-		// TODO: Attempt to move other attributes into extent mft
-		// records.
-		err = -EOPNOTSUPP;
-		if (!err)
-			goto do_resident_extend;
-		goto err_out;
-	}
-	// TODO: Attempt to move this attribute to an extent mft record, but
-	// only if it is not already the only attribute in an mft record in
-	// which case there would be nothing to gain.
-	err = -EOPNOTSUPP;
-	if (!err)
-		goto do_resident_extend;
-	/* There is nothing we can do to make enough space. )-: */
-	goto err_out;
-#endif
-do_non_resident_truncate:
-	BUG_ON(!NInoNonResident(ni));
-	if (alloc_change < 0) {
-		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
-		if (highest_vcn > 0 &&
-				old_alloc_size >> vol->cluster_size_bits >
-				highest_vcn + 1) {
-			/*
-			 * This attribute has multiple extents.  Not yet
-			 * supported.
-			 */
-			ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, "
-					"attribute type 0x%x, because the "
-					"attribute is highly fragmented (it "
-					"consists of multiple extents) and "
-					"this case is not implemented yet.",
-					vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type));
-			err = -EOPNOTSUPP;
-			goto bad_out;
-		}
-	}
-	/*
-	 * If the size is shrinking, need to reduce the initialized_size and
-	 * the data_size before reducing the allocation.
-	 */
-	if (size_change < 0) {
-		/*
-		 * Make the valid size smaller (i_size is already up-to-date).
-		 */
-		write_lock_irqsave(&ni->size_lock, flags);
-		if (new_size < ni->initialized_size) {
-			ni->initialized_size = new_size;
-			a->data.non_resident.initialized_size =
-					cpu_to_sle64(new_size);
-		}
-		a->data.non_resident.data_size = cpu_to_sle64(new_size);
-		write_unlock_irqrestore(&ni->size_lock, flags);
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		/* If the allocated size is not changing, we are done. */
-		if (!alloc_change)
-			goto unm_done;
-		/*
-		 * If the size is shrinking it makes no sense for the
-		 * allocation to be growing.
-		 */
-		BUG_ON(alloc_change > 0);
-	} else /* if (size_change >= 0) */ {
-		/*
-		 * The file size is growing or staying the same but the
-		 * allocation can be shrinking, growing or staying the same.
-		 */
-		if (alloc_change > 0) {
-			/*
-			 * We need to extend the allocation and possibly update
-			 * the data size.  If we are updating the data size,
-			 * since we are not touching the initialized_size we do
-			 * not need to worry about the actual data on disk.
-			 * And as far as the page cache is concerned, there
-			 * will be no pages beyond the old data size and any
-			 * partial region in the last page between the old and
-			 * new data size (or the end of the page if the new
-			 * data size is outside the page) does not need to be
-			 * modified as explained above for the resident
-			 * attribute truncate case.  To do this, we simply drop
-			 * the locks we hold and leave all the work to our
-			 * friendly helper ntfs_attr_extend_allocation().
-			 */
-			ntfs_attr_put_search_ctx(ctx);
-			unmap_mft_record(base_ni);
-			up_write(&ni->runlist.lock);
-			err = ntfs_attr_extend_allocation(ni, new_size,
-					size_change > 0 ? new_size : -1, -1);
-			/*
-			 * ntfs_attr_extend_allocation() will have done error
-			 * output already.
-			 */
-			goto done;
-		}
-		if (!alloc_change)
-			goto alloc_done;
-	}
-	/* alloc_change < 0 */
-	/* Free the clusters. */
-	nr_freed = ntfs_cluster_free(ni, new_alloc_size >>
-			vol->cluster_size_bits, -1, ctx);
-	m = ctx->mrec;
-	a = ctx->attr;
-	if (unlikely(nr_freed < 0)) {
-		ntfs_error(vol->sb, "Failed to release cluster(s) (error code "
-				"%lli).  Unmount and run chkdsk to recover "
-				"the lost cluster(s).", (long long)nr_freed);
-		NVolSetErrors(vol);
-		nr_freed = 0;
-	}
-	/* Truncate the runlist. */
-	err = ntfs_rl_truncate_nolock(vol, &ni->runlist,
-			new_alloc_size >> vol->cluster_size_bits);
-	/*
-	 * If the runlist truncation failed and/or the search context is no
-	 * longer valid, we cannot resize the attribute record or build the
-	 * mapping pairs array thus we mark the inode bad so that no access to
-	 * the freed clusters can happen.
-	 */
-	if (unlikely(err || IS_ERR(m))) {
-		ntfs_error(vol->sb, "Failed to %s (error code %li).%s",
-				IS_ERR(m) ?
-				"restore attribute search context" :
-				"truncate attribute runlist",
-				IS_ERR(m) ? PTR_ERR(m) : err, es);
-		err = -EIO;
-		goto bad_out;
-	}
-	/* Get the size for the shrunk mapping pairs array for the runlist. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1);
-	if (unlikely(mp_size <= 0)) {
-		ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
-				"attribute type 0x%x, because determining the "
-				"size for the mapping pairs failed with error "
-				"code %i.%s", vi->i_ino,
-				(unsigned)le32_to_cpu(ni->type), mp_size, es);
-		err = -EIO;
-		goto bad_out;
-	}
-	/*
-	 * Shrink the attribute record for the new mapping pairs array.  Note,
-	 * this cannot fail since we are making the attribute smaller thus by
-	 * definition there is enough space to do so.
-	 */
-	err = ntfs_attr_record_resize(m, a, mp_size +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
-	BUG_ON(err);
-	/*
-	 * Generate the mapping pairs array directly into the attribute record.
-	 */
-	err = ntfs_mapping_pairs_build(vol, (u8*)a +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, ni->runlist.rl, 0, -1, NULL);
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
-				"attribute type 0x%x, because building the "
-				"mapping pairs failed with error code %i.%s",
-				vi->i_ino, (unsigned)le32_to_cpu(ni->type),
-				err, es);
-		err = -EIO;
-		goto bad_out;
-	}
-	/* Update the allocated/compressed size as well as the highest vcn. */
-	a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
-			vol->cluster_size_bits) - 1);
-	write_lock_irqsave(&ni->size_lock, flags);
-	ni->allocated_size = new_alloc_size;
-	a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
-	if (NInoSparse(ni) || NInoCompressed(ni)) {
-		if (nr_freed) {
-			ni->itype.compressed.size -= nr_freed <<
-					vol->cluster_size_bits;
-			BUG_ON(ni->itype.compressed.size < 0);
-			a->data.non_resident.compressed_size = cpu_to_sle64(
-					ni->itype.compressed.size);
-			vi->i_blocks = ni->itype.compressed.size >> 9;
-		}
-	} else
-		vi->i_blocks = new_alloc_size >> 9;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-	/*
-	 * We have shrunk the allocation.  If this is a shrinking truncate we
-	 * have already dealt with the initialized_size and the data_size above
-	 * and we are done.  If the truncate is only changing the allocation
-	 * and not the data_size, we are also done.  If this is an extending
-	 * truncate, need to extend the data_size now which is ensured by the
-	 * fact that @size_change is positive.
-	 */
-alloc_done:
-	/*
-	 * If the size is growing, need to update it now.  If it is shrinking,
-	 * we have already updated it above (before the allocation change).
-	 */
-	if (size_change > 0)
-		a->data.non_resident.data_size = cpu_to_sle64(new_size);
-	/* Ensure the modified mft record is written out. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-unm_done:
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-done:
-	/* Update the mtime and ctime on the base inode. */
-	/* normally ->truncate shouldn't update ctime or mtime,
-	 * but ntfs did before so it got a copy & paste version
-	 * of file_update_time.  one day someone should fix this
-	 * for real.
-	 */
-	if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
-		struct timespec64 now = current_time(VFS_I(base_ni));
-		struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni));
-		struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni));
-		int sync_it = 0;
-
-		if (!timespec64_equal(&mtime, &now) ||
-		    !timespec64_equal(&ctime, &now))
-			sync_it = 1;
-		inode_set_ctime_to_ts(VFS_I(base_ni), now);
-		inode_set_mtime_to_ts(VFS_I(base_ni), now);
-
-		if (sync_it)
-			mark_inode_dirty_sync(VFS_I(base_ni));
-	}
-
-	if (likely(!err)) {
-		NInoClearTruncateFailed(ni);
-		ntfs_debug("Done.");
-	}
-	return err;
-old_bad_out:
-	old_size = -1;
-bad_out:
-	if (err != -ENOMEM && err != -EOPNOTSUPP)
-		NVolSetErrors(vol);
-	if (err != -EOPNOTSUPP)
-		NInoSetTruncateFailed(ni);
-	else if (old_size >= 0)
-		i_size_write(vi, old_size);
-err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-out:
-	ntfs_debug("Failed.  Returning error code %i.", err);
-	return err;
-conv_err_out:
-	if (err != -ENOMEM && err != -EOPNOTSUPP)
-		NVolSetErrors(vol);
-	if (err != -EOPNOTSUPP)
-		NInoSetTruncateFailed(ni);
-	else
-		i_size_write(vi, old_size);
-	goto out;
-}
-
-/**
- * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value
- * @vi:		inode for which the i_size was changed
- *
- * Wrapper for ntfs_truncate() that has no return value.
- *
- * See ntfs_truncate() description above for details.
- */
-#ifdef NTFS_RW
-void ntfs_truncate_vfs(struct inode *vi) {
-	ntfs_truncate(vi);
-}
-#endif
-
-/**
- * ntfs_setattr - called from notify_change() when an attribute is being changed
- * @idmap:	idmap of the mount the inode was found from
- * @dentry:	dentry whose attributes to change
- * @attr:	structure describing the attributes and the changes
- *
- * We have to trap VFS attempts to truncate the file described by @dentry as
- * soon as possible, because we do not implement changes in i_size yet.  So we
- * abort all i_size changes here.
- *
- * We also abort all changes of user, group, and mode as we do not implement
- * the NTFS ACLs yet.
- *
- * Called with ->i_mutex held.
- */
-int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
-		 struct iattr *attr)
-{
-	struct inode *vi = d_inode(dentry);
-	int err;
-	unsigned int ia_valid = attr->ia_valid;
-
-	err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
-	if (err)
-		goto out;
-	/* We do not support NTFS ACLs yet. */
-	if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) {
-		ntfs_warning(vi->i_sb, "Changes in user/group/mode are not "
-				"supported yet, ignoring.");
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-	if (ia_valid & ATTR_SIZE) {
-		if (attr->ia_size != i_size_read(vi)) {
-			ntfs_inode *ni = NTFS_I(vi);
-			/*
-			 * FIXME: For now we do not support resizing of
-			 * compressed or encrypted files yet.
-			 */
-			if (NInoCompressed(ni) || NInoEncrypted(ni)) {
-				ntfs_warning(vi->i_sb, "Changes in inode size "
-						"are not supported yet for "
-						"%s files, ignoring.",
-						NInoCompressed(ni) ?
-						"compressed" : "encrypted");
-				err = -EOPNOTSUPP;
-			} else {
-				truncate_setsize(vi, attr->ia_size);
-				ntfs_truncate_vfs(vi);
-			}
-			if (err || ia_valid == ATTR_SIZE)
-				goto out;
-		} else {
-			/*
-			 * We skipped the truncate but must still update
-			 * timestamps.
-			 */
-			ia_valid |= ATTR_MTIME | ATTR_CTIME;
-		}
-	}
-	if (ia_valid & ATTR_ATIME)
-		inode_set_atime_to_ts(vi, attr->ia_atime);
-	if (ia_valid & ATTR_MTIME)
-		inode_set_mtime_to_ts(vi, attr->ia_mtime);
-	if (ia_valid & ATTR_CTIME)
-		inode_set_ctime_to_ts(vi, attr->ia_ctime);
-	mark_inode_dirty(vi);
-out:
-	return err;
-}
-
-/**
- * __ntfs_write_inode - write out a dirty inode
- * @vi:		inode to write out
- * @sync:	if true, write out synchronously
- *
- * Write out a dirty inode to disk including any extent inodes if present.
- *
- * If @sync is true, commit the inode to disk and wait for io completion.  This
- * is done using write_mft_record().
- *
- * If @sync is false, just schedule the write to happen but do not wait for i/o
- * completion.  In 2.6 kernels, scheduling usually happens just by virtue of
- * marking the page (and in this case mft record) dirty but we do not implement
- * this yet as write_mft_record() largely ignores the @sync parameter and
- * always performs synchronous writes.
- *
- * Return 0 on success and -errno on error.
- */
-int __ntfs_write_inode(struct inode *vi, int sync)
-{
-	sle64 nt;
-	ntfs_inode *ni = NTFS_I(vi);
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *m;
-	STANDARD_INFORMATION *si;
-	int err = 0;
-	bool modified = false;
-
-	ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "",
-			vi->i_ino);
-	/*
-	 * Dirty attribute inodes are written via their real inodes so just
-	 * clean them here.  Access time updates are taken care off when the
-	 * real inode is written.
-	 */
-	if (NInoAttr(ni)) {
-		NInoClearDirty(ni);
-		ntfs_debug("Done.");
-		return 0;
-	}
-	/* Map, pin, and lock the mft record belonging to the inode. */
-	m = map_mft_record(ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		goto err_out;
-	}
-	/* Update the access times in the standard information attribute. */
-	ctx = ntfs_attr_get_search_ctx(ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto unm_err_out;
-	}
-	err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		ntfs_attr_put_search_ctx(ctx);
-		goto unm_err_out;
-	}
-	si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset));
-	/* Update the access times if they have changed. */
-	nt = utc2ntfs(inode_get_mtime(vi));
-	if (si->last_data_change_time != nt) {
-		ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
-				"new = 0x%llx", vi->i_ino, (long long)
-				sle64_to_cpu(si->last_data_change_time),
-				(long long)sle64_to_cpu(nt));
-		si->last_data_change_time = nt;
-		modified = true;
-	}
-	nt = utc2ntfs(inode_get_ctime(vi));
-	if (si->last_mft_change_time != nt) {
-		ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
-				"new = 0x%llx", vi->i_ino, (long long)
-				sle64_to_cpu(si->last_mft_change_time),
-				(long long)sle64_to_cpu(nt));
-		si->last_mft_change_time = nt;
-		modified = true;
-	}
-	nt = utc2ntfs(inode_get_atime(vi));
-	if (si->last_access_time != nt) {
-		ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
-				"new = 0x%llx", vi->i_ino,
-				(long long)sle64_to_cpu(si->last_access_time),
-				(long long)sle64_to_cpu(nt));
-		si->last_access_time = nt;
-		modified = true;
-	}
-	/*
-	 * If we just modified the standard information attribute we need to
-	 * mark the mft record it is in dirty.  We do this manually so that
-	 * mark_inode_dirty() is not called which would redirty the inode and
-	 * hence result in an infinite loop of trying to write the inode.
-	 * There is no need to mark the base inode nor the base mft record
-	 * dirty, since we are going to write this mft record below in any case
-	 * and the base mft record may actually not have been modified so it
-	 * might not need to be written out.
-	 * NOTE: It is not a problem when the inode for $MFT itself is being
-	 * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES
-	 * on the $MFT inode and hence __ntfs_write_inode() will not be
-	 * re-invoked because of it which in turn is ok since the dirtied mft
-	 * record will be cleaned and written out to disk below, i.e. before
-	 * this function returns.
-	 */
-	if (modified) {
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		if (!NInoTestSetDirty(ctx->ntfs_ino))
-			mark_ntfs_record_dirty(ctx->ntfs_ino->page,
-					ctx->ntfs_ino->page_ofs);
-	}
-	ntfs_attr_put_search_ctx(ctx);
-	/* Now the access times are updated, write the base mft record. */
-	if (NInoDirty(ni))
-		err = write_mft_record(ni, m, sync);
-	/* Write all attached extent mft records. */
-	mutex_lock(&ni->extent_lock);
-	if (ni->nr_extents > 0) {
-		ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos;
-		int i;
-
-		ntfs_debug("Writing %i extent inodes.", ni->nr_extents);
-		for (i = 0; i < ni->nr_extents; i++) {
-			ntfs_inode *tni = extent_nis[i];
-
-			if (NInoDirty(tni)) {
-				MFT_RECORD *tm = map_mft_record(tni);
-				int ret;
-
-				if (IS_ERR(tm)) {
-					if (!err || err == -ENOMEM)
-						err = PTR_ERR(tm);
-					continue;
-				}
-				ret = write_mft_record(tni, tm, sync);
-				unmap_mft_record(tni);
-				if (unlikely(ret)) {
-					if (!err || err == -ENOMEM)
-						err = ret;
-				}
-			}
-		}
-	}
-	mutex_unlock(&ni->extent_lock);
-	unmap_mft_record(ni);
-	if (unlikely(err))
-		goto err_out;
-	ntfs_debug("Done.");
-	return 0;
-unm_err_out:
-	unmap_mft_record(ni);
-err_out:
-	if (err == -ENOMEM) {
-		ntfs_warning(vi->i_sb, "Not enough memory to write inode.  "
-				"Marking the inode dirty again, so the VFS "
-				"retries later.");
-		mark_inode_dirty(vi);
-	} else {
-		ntfs_error(vi->i_sb, "Failed (error %i):  Run chkdsk.", -err);
-		NVolSetErrors(ni->vol);
-	}
-	return err;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
deleted file mode 100644
index 147ef4ddb691..000000000000
--- a/fs/ntfs/inode.h
+++ /dev/null
@@ -1,310 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of
- *	     the Linux-NTFS project.
- *
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_INODE_H
-#define _LINUX_NTFS_INODE_H
-
-#include <linux/atomic.h>
-
-#include <linux/fs.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/mutex.h>
-#include <linux/seq_file.h>
-
-#include "layout.h"
-#include "volume.h"
-#include "types.h"
-#include "runlist.h"
-#include "debug.h"
-
-typedef struct _ntfs_inode ntfs_inode;
-
-/*
- * The NTFS in-memory inode structure. It is just used as an extension to the
- * fields already provided in the VFS inode.
- */
-struct _ntfs_inode {
-	rwlock_t size_lock;	/* Lock serializing access to inode sizes. */
-	s64 initialized_size;	/* Copy from the attribute record. */
-	s64 allocated_size;	/* Copy from the attribute record. */
-	unsigned long state;	/* NTFS specific flags describing this inode.
-				   See ntfs_inode_state_bits below. */
-	unsigned long mft_no;	/* Number of the mft record / inode. */
-	u16 seq_no;		/* Sequence number of the mft record. */
-	atomic_t count;		/* Inode reference count for book keeping. */
-	ntfs_volume *vol;	/* Pointer to the ntfs volume of this inode. */
-	/*
-	 * If NInoAttr() is true, the below fields describe the attribute which
-	 * this fake inode belongs to. The actual inode of this attribute is
-	 * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see
-	 * below). For real inodes, we also set the type (AT_DATA for files and
-	 * AT_INDEX_ALLOCATION for directories), with the name = NULL and
-	 * name_len = 0 for files and name = I30 (global constant) and
-	 * name_len = 4 for directories.
-	 */
-	ATTR_TYPE type;	/* Attribute type of this fake inode. */
-	ntfschar *name;		/* Attribute name of this fake inode. */
-	u32 name_len;		/* Attribute name length of this fake inode. */
-	runlist runlist;	/* If state has the NI_NonResident bit set,
-				   the runlist of the unnamed data attribute
-				   (if a file) or of the index allocation
-				   attribute (directory) or of the attribute
-				   described by the fake inode (if NInoAttr()).
-				   If runlist.rl is NULL, the runlist has not
-				   been read in yet or has been unmapped. If
-				   NI_NonResident is clear, the attribute is
-				   resident (file and fake inode) or there is
-				   no $I30 index allocation attribute
-				   (small directory). In the latter case
-				   runlist.rl is always NULL.*/
-	/*
-	 * The following fields are only valid for real inodes and extent
-	 * inodes.
-	 */
-	struct mutex mrec_lock;	/* Lock for serializing access to the
-				   mft record belonging to this inode. */
-	struct page *page;	/* The page containing the mft record of the
-				   inode. This should only be touched by the
-				   (un)map_mft_record*() functions. */
-	int page_ofs;		/* Offset into the page at which the mft record
-				   begins. This should only be touched by the
-				   (un)map_mft_record*() functions. */
-	/*
-	 * Attribute list support (only for use by the attribute lookup
-	 * functions). Setup during read_inode for all inodes with attribute
-	 * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is
-	 * further only valid if NI_AttrListNonResident is set.
-	 */
-	u32 attr_list_size;	/* Length of attribute list value in bytes. */
-	u8 *attr_list;		/* Attribute list value itself. */
-	runlist attr_list_rl;	/* Run list for the attribute list value. */
-	union {
-		struct { /* It is a directory, $MFT, or an index inode. */
-			u32 block_size;		/* Size of an index block. */
-			u32 vcn_size;		/* Size of a vcn in this
-						   index. */
-			COLLATION_RULE collation_rule; /* The collation rule
-						   for the index. */
-			u8 block_size_bits; 	/* Log2 of the above. */
-			u8 vcn_size_bits;	/* Log2 of the above. */
-		} index;
-		struct { /* It is a compressed/sparse file/attribute inode. */
-			s64 size;		/* Copy of compressed_size from
-						   $DATA. */
-			u32 block_size;		/* Size of a compression block
-						   (cb). */
-			u8 block_size_bits;	/* Log2 of the size of a cb. */
-			u8 block_clusters;	/* Number of clusters per cb. */
-		} compressed;
-	} itype;
-	struct mutex extent_lock;	/* Lock for accessing/modifying the
-					   below . */
-	s32 nr_extents;	/* For a base mft record, the number of attached extent
-			   inodes (0 if none), for extent records and for fake
-			   inodes describing an attribute this is -1. */
-	union {		/* This union is only used if nr_extents != 0. */
-		ntfs_inode **extent_ntfs_inos;	/* For nr_extents > 0, array of
-						   the ntfs inodes of the extent
-						   mft records belonging to
-						   this base inode which have
-						   been loaded. */
-		ntfs_inode *base_ntfs_ino;	/* For nr_extents == -1, the
-						   ntfs inode of the base mft
-						   record. For fake inodes, the
-						   real (base) inode to which
-						   the attribute belongs. */
-	} ext;
-};
-
-/*
- * Defined bits for the state field in the ntfs_inode structure.
- * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only
- */
-typedef enum {
-	NI_Dirty,		/* 1: Mft record needs to be written to disk. */
-	NI_AttrList,		/* 1: Mft record contains an attribute list. */
-	NI_AttrListNonResident,	/* 1: Attribute list is non-resident. Implies
-				      NI_AttrList is set. */
-
-	NI_Attr,		/* 1: Fake inode for attribute i/o.
-				   0: Real inode or extent inode. */
-
-	NI_MstProtected,	/* 1: Attribute is protected by MST fixups.
-				   0: Attribute is not protected by fixups. */
-	NI_NonResident,		/* 1: Unnamed data attr is non-resident (f).
-				   1: Attribute is non-resident (a). */
-	NI_IndexAllocPresent = NI_NonResident,	/* 1: $I30 index alloc attr is
-						   present (d). */
-	NI_Compressed,		/* 1: Unnamed data attr is compressed (f).
-				   1: Create compressed files by default (d).
-				   1: Attribute is compressed (a). */
-	NI_Encrypted,		/* 1: Unnamed data attr is encrypted (f).
-				   1: Create encrypted files by default (d).
-				   1: Attribute is encrypted (a). */
-	NI_Sparse,		/* 1: Unnamed data attr is sparse (f).
-				   1: Create sparse files by default (d).
-				   1: Attribute is sparse (a). */
-	NI_SparseDisabled,	/* 1: May not create sparse regions. */
-	NI_TruncateFailed,	/* 1: Last ntfs_truncate() call failed. */
-} ntfs_inode_state_bits;
-
-/*
- * NOTE: We should be adding dirty mft records to a list somewhere and they
- * should be independent of the (ntfs/vfs) inode structure so that an inode can
- * be removed but the record can be left dirty for syncing later.
- */
-
-/*
- * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo()
- * functions.
- */
-#define NINO_FNS(flag)					\
-static inline int NIno##flag(ntfs_inode *ni)		\
-{							\
-	return test_bit(NI_##flag, &(ni)->state);	\
-}							\
-static inline void NInoSet##flag(ntfs_inode *ni)	\
-{							\
-	set_bit(NI_##flag, &(ni)->state);		\
-}							\
-static inline void NInoClear##flag(ntfs_inode *ni)	\
-{							\
-	clear_bit(NI_##flag, &(ni)->state);		\
-}
-
-/*
- * As above for NInoTestSetFoo() and NInoTestClearFoo().
- */
-#define TAS_NINO_FNS(flag)					\
-static inline int NInoTestSet##flag(ntfs_inode *ni)		\
-{								\
-	return test_and_set_bit(NI_##flag, &(ni)->state);	\
-}								\
-static inline int NInoTestClear##flag(ntfs_inode *ni)		\
-{								\
-	return test_and_clear_bit(NI_##flag, &(ni)->state);	\
-}
-
-/* Emit the ntfs inode bitops functions. */
-NINO_FNS(Dirty)
-TAS_NINO_FNS(Dirty)
-NINO_FNS(AttrList)
-NINO_FNS(AttrListNonResident)
-NINO_FNS(Attr)
-NINO_FNS(MstProtected)
-NINO_FNS(NonResident)
-NINO_FNS(IndexAllocPresent)
-NINO_FNS(Compressed)
-NINO_FNS(Encrypted)
-NINO_FNS(Sparse)
-NINO_FNS(SparseDisabled)
-NINO_FNS(TruncateFailed)
-
-/*
- * The full structure containing a ntfs_inode and a vfs struct inode. Used for
- * all real and fake inodes but not for extent inodes which lack the vfs struct
- * inode.
- */
-typedef struct {
-	ntfs_inode ntfs_inode;
-	struct inode vfs_inode;		/* The vfs inode structure. */
-} big_ntfs_inode;
-
-/**
- * NTFS_I - return the ntfs inode given a vfs inode
- * @inode:	VFS inode
- *
- * NTFS_I() returns the ntfs inode associated with the VFS @inode.
- */
-static inline ntfs_inode *NTFS_I(struct inode *inode)
-{
-	return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode);
-}
-
-static inline struct inode *VFS_I(ntfs_inode *ni)
-{
-	return &((big_ntfs_inode *)ni)->vfs_inode;
-}
-
-/**
- * ntfs_attr - ntfs in memory attribute structure
- * @mft_no:	mft record number of the base mft record of this attribute
- * @name:	Unicode name of the attribute (NULL if unnamed)
- * @name_len:	length of @name in Unicode characters (0 if unnamed)
- * @type:	attribute type (see layout.h)
- *
- * This structure exists only to provide a small structure for the
- * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism.
- *
- * NOTE: Elements are ordered by size to make the structure as compact as
- * possible on all architectures.
- */
-typedef struct {
-	unsigned long mft_no;
-	ntfschar *name;
-	u32 name_len;
-	ATTR_TYPE type;
-} ntfs_attr;
-
-extern int ntfs_test_inode(struct inode *vi, void *data);
-
-extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no);
-extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
-		ntfschar *name, u32 name_len);
-extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
-		u32 name_len);
-
-extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
-extern void ntfs_free_big_inode(struct inode *inode);
-extern void ntfs_evict_big_inode(struct inode *vi);
-
-extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
-
-static inline void ntfs_init_big_inode(struct inode *vi)
-{
-	ntfs_inode *ni = NTFS_I(vi);
-
-	ntfs_debug("Entering.");
-	__ntfs_init_inode(vi->i_sb, ni);
-	ni->mft_no = vi->i_ino;
-}
-
-extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
-		unsigned long mft_no);
-extern void ntfs_clear_extent_inode(ntfs_inode *ni);
-
-extern int ntfs_read_inode_mount(struct inode *vi);
-
-extern int ntfs_show_options(struct seq_file *sf, struct dentry *root);
-
-#ifdef NTFS_RW
-
-extern int ntfs_truncate(struct inode *vi);
-extern void ntfs_truncate_vfs(struct inode *vi);
-
-extern int ntfs_setattr(struct mnt_idmap *idmap,
-			struct dentry *dentry, struct iattr *attr);
-
-extern int __ntfs_write_inode(struct inode *vi, int sync);
-
-static inline void ntfs_commit_inode(struct inode *vi)
-{
-	if (!is_bad_inode(vi))
-		__ntfs_write_inode(vi, 1);
-	return;
-}
-
-#else
-
-static inline void ntfs_truncate_vfs(struct inode *vi) {}
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_INODE_H */
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
deleted file mode 100644
index 5d4bf7a3259f..000000000000
--- a/fs/ntfs/layout.h
+++ /dev/null
@@ -1,2421 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS
- *	      project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_LAYOUT_H
-#define _LINUX_NTFS_LAYOUT_H
-
-#include <linux/types.h>
-#include <linux/bitops.h>
-#include <linux/list.h>
-#include <asm/byteorder.h>
-
-#include "types.h"
-
-/* The NTFS oem_id "NTFS    " */
-#define magicNTFS	cpu_to_le64(0x202020205346544eULL)
-
-/*
- * Location of bootsector on partition:
- *	The standard NTFS_BOOT_SECTOR is on sector 0 of the partition.
- *	On NT4 and above there is one backup copy of the boot sector to
- *	be found on the last sector of the partition (not normally accessible
- *	from within Windows as the bootsector contained number of sectors
- *	value is one less than the actual value!).
- *	On versions of NT 3.51 and earlier, the backup copy was located at
- *	number of sectors/2 (integer divide), i.e. in the middle of the volume.
- */
-
-/*
- * BIOS parameter block (bpb) structure.
- */
-typedef struct {
-	le16 bytes_per_sector;		/* Size of a sector in bytes. */
-	u8  sectors_per_cluster;	/* Size of a cluster in sectors. */
-	le16 reserved_sectors;		/* zero */
-	u8  fats;			/* zero */
-	le16 root_entries;		/* zero */
-	le16 sectors;			/* zero */
-	u8  media_type;			/* 0xf8 = hard disk */
-	le16 sectors_per_fat;		/* zero */
-	le16 sectors_per_track;		/* irrelevant */
-	le16 heads;			/* irrelevant */
-	le32 hidden_sectors;		/* zero */
-	le32 large_sectors;		/* zero */
-} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK;
-
-/*
- * NTFS boot sector structure.
- */
-typedef struct {
-	u8  jump[3];			/* Irrelevant (jump to boot up code).*/
-	le64 oem_id;			/* Magic "NTFS    ". */
-	BIOS_PARAMETER_BLOCK bpb;	/* See BIOS_PARAMETER_BLOCK. */
-	u8  unused[4];			/* zero, NTFS diskedit.exe states that
-					   this is actually:
-						__u8 physical_drive;	// 0x80
-						__u8 current_head;	// zero
-						__u8 extended_boot_signature;
-									// 0x80
-						__u8 unused;		// zero
-					 */
-/*0x28*/sle64 number_of_sectors;	/* Number of sectors in volume. Gives
-					   maximum volume size of 2^63 sectors.
-					   Assuming standard sector size of 512
-					   bytes, the maximum byte size is
-					   approx. 4.7x10^21 bytes. (-; */
-	sle64 mft_lcn;			/* Cluster location of mft data. */
-	sle64 mftmirr_lcn;		/* Cluster location of copy of mft. */
-	s8  clusters_per_mft_record;	/* Mft record size in clusters. */
-	u8  reserved0[3];		/* zero */
-	s8  clusters_per_index_record;	/* Index block size in clusters. */
-	u8  reserved1[3];		/* zero */
-	le64 volume_serial_number;	/* Irrelevant (serial number). */
-	le32 checksum;			/* Boot sector checksum. */
-/*0x54*/u8  bootstrap[426];		/* Irrelevant (boot up code). */
-	le16 end_of_sector_marker;	/* End of bootsector magic. Always is
-					   0xaa55 in little endian. */
-/* sizeof() = 512 (0x200) bytes */
-} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR;
-
-/*
- * Magic identifiers present at the beginning of all ntfs record containing
- * records (like mft records for example).
- */
-enum {
-	/* Found in $MFT/$DATA. */
-	magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
-	magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
-	magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
-
-	/* Found in $LogFile/$DATA. */
-	magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
-	magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
-
-	/* Found in $LogFile/$DATA.  (May be found in $MFT/$DATA, also?) */
-	magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
-
-	/* Found in all ntfs record containing records. */
-	magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
-						       transfer was detected. */
-	/*
-	 * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
-	 * thus not initialized.  Page must be initialized before using it.
-	 */
-	magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
-};
-
-typedef le32 NTFS_RECORD_TYPE;
-
-/*
- * Generic magic comparison macros. Finally found a use for the ## preprocessor
- * operator! (-8
- */
-
-static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r)
-{
-	return (x == r);
-}
-#define ntfs_is_magic(x, m)	__ntfs_is_magic(x, magic_##m)
-
-static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r)
-{
-	return (*p == r);
-}
-#define ntfs_is_magicp(p, m)	__ntfs_is_magicp(p, magic_##m)
-
-/*
- * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above.
- */
-#define ntfs_is_file_record(x)		( ntfs_is_magic (x, FILE) )
-#define ntfs_is_file_recordp(p)		( ntfs_is_magicp(p, FILE) )
-#define ntfs_is_mft_record(x)		( ntfs_is_file_record (x) )
-#define ntfs_is_mft_recordp(p)		( ntfs_is_file_recordp(p) )
-#define ntfs_is_indx_record(x)		( ntfs_is_magic (x, INDX) )
-#define ntfs_is_indx_recordp(p)		( ntfs_is_magicp(p, INDX) )
-#define ntfs_is_hole_record(x)		( ntfs_is_magic (x, HOLE) )
-#define ntfs_is_hole_recordp(p)		( ntfs_is_magicp(p, HOLE) )
-
-#define ntfs_is_rstr_record(x)		( ntfs_is_magic (x, RSTR) )
-#define ntfs_is_rstr_recordp(p)		( ntfs_is_magicp(p, RSTR) )
-#define ntfs_is_rcrd_record(x)		( ntfs_is_magic (x, RCRD) )
-#define ntfs_is_rcrd_recordp(p)		( ntfs_is_magicp(p, RCRD) )
-
-#define ntfs_is_chkd_record(x)		( ntfs_is_magic (x, CHKD) )
-#define ntfs_is_chkd_recordp(p)		( ntfs_is_magicp(p, CHKD) )
-
-#define ntfs_is_baad_record(x)		( ntfs_is_magic (x, BAAD) )
-#define ntfs_is_baad_recordp(p)		( ntfs_is_magicp(p, BAAD) )
-
-#define ntfs_is_empty_record(x)		( ntfs_is_magic (x, empty) )
-#define ntfs_is_empty_recordp(p)	( ntfs_is_magicp(p, empty) )
-
-/*
- * The Update Sequence Array (usa) is an array of the le16 values which belong
- * to the end of each sector protected by the update sequence record in which
- * this array is contained. Note that the first entry is the Update Sequence
- * Number (usn), a cyclic counter of how many times the protected record has
- * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All
- * last le16's of each sector have to be equal to the usn (during reading) or
- * are set to it (during writing). If they are not, an incomplete multi sector
- * transfer has occurred when the data was written.
- * The maximum size for the update sequence array is fixed to:
- *	maximum size = usa_ofs + (usa_count * 2) = 510 bytes
- * The 510 bytes comes from the fact that the last le16 in the array has to
- * (obviously) finish before the last le16 of the first 512-byte sector.
- * This formula can be used as a consistency check in that usa_ofs +
- * (usa_count * 2) has to be less than or equal to 510.
- */
-typedef struct {
-	NTFS_RECORD_TYPE magic;	/* A four-byte magic identifying the record
-				   type and/or status. */
-	le16 usa_ofs;		/* Offset to the Update Sequence Array (usa)
-				   from the start of the ntfs record. */
-	le16 usa_count;		/* Number of le16 sized entries in the usa
-				   including the Update Sequence Number (usn),
-				   thus the number of fixups is the usa_count
-				   minus 1. */
-} __attribute__ ((__packed__)) NTFS_RECORD;
-
-/*
- * System files mft record numbers. All these files are always marked as used
- * in the bitmap attribute of the mft; presumably in order to avoid accidental
- * allocation for random other mft records. Also, the sequence number for each
- * of the system files is always equal to their mft record number and it is
- * never modified.
- */
-typedef enum {
-	FILE_MFT       = 0,	/* Master file table (mft). Data attribute
-				   contains the entries and bitmap attribute
-				   records which ones are in use (bit==1). */
-	FILE_MFTMirr   = 1,	/* Mft mirror: copy of first four mft records
-				   in data attribute. If cluster size > 4kiB,
-				   copy of first N mft records, with
-					N = cluster_size / mft_record_size. */
-	FILE_LogFile   = 2,	/* Journalling log in data attribute. */
-	FILE_Volume    = 3,	/* Volume name attribute and volume information
-				   attribute (flags and ntfs version). Windows
-				   refers to this file as volume DASD (Direct
-				   Access Storage Device). */
-	FILE_AttrDef   = 4,	/* Array of attribute definitions in data
-				   attribute. */
-	FILE_root      = 5,	/* Root directory. */
-	FILE_Bitmap    = 6,	/* Allocation bitmap of all clusters (lcns) in
-				   data attribute. */
-	FILE_Boot      = 7,	/* Boot sector (always at cluster 0) in data
-				   attribute. */
-	FILE_BadClus   = 8,	/* Contains all bad clusters in the non-resident
-				   data attribute. */
-	FILE_Secure    = 9,	/* Shared security descriptors in data attribute
-				   and two indexes into the descriptors.
-				   Appeared in Windows 2000. Before that, this
-				   file was named $Quota but was unused. */
-	FILE_UpCase    = 10,	/* Uppercase equivalents of all 65536 Unicode
-				   characters in data attribute. */
-	FILE_Extend    = 11,	/* Directory containing other system files (eg.
-				   $ObjId, $Quota, $Reparse and $UsnJrnl). This
-				   is new to NTFS3.0. */
-	FILE_reserved12 = 12,	/* Reserved for future use (records 12-15). */
-	FILE_reserved13 = 13,
-	FILE_reserved14 = 14,
-	FILE_reserved15 = 15,
-	FILE_first_user = 16,	/* First user file, used as test limit for
-				   whether to allow opening a file or not. */
-} NTFS_SYSTEM_FILES;
-
-/*
- * These are the so far known MFT_RECORD_* flags (16-bit) which contain
- * information about the mft record in which they are present.
- */
-enum {
-	MFT_RECORD_IN_USE	= cpu_to_le16(0x0001),
-	MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
-} __attribute__ ((__packed__));
-
-typedef le16 MFT_RECORD_FLAGS;
-
-/*
- * mft references (aka file references or file record segment references) are
- * used whenever a structure needs to refer to a record in the mft.
- *
- * A reference consists of a 48-bit index into the mft and a 16-bit sequence
- * number used to detect stale references.
- *
- * For error reporting purposes we treat the 48-bit index as a signed quantity.
- *
- * The sequence number is a circular counter (skipping 0) describing how many
- * times the referenced mft record has been (re)used. This has to match the
- * sequence number of the mft record being referenced, otherwise the reference
- * is considered stale and removed (FIXME: only ntfsck or the driver itself?).
- *
- * If the sequence number is zero it is assumed that no sequence number
- * consistency checking should be performed.
- *
- * FIXME: Since inodes are 32-bit as of now, the driver needs to always check
- * for high_part being 0 and if not either BUG(), cause a panic() or handle
- * the situation in some other way. This shouldn't be a problem as a volume has
- * to become HUGE in order to need more than 32-bits worth of mft records.
- * Assuming the standard mft record size of 1kb only the records (never mind
- * the non-resident attributes, etc.) would require 4Tb of space on their own
- * for the first 32 bits worth of records. This is only if some strange person
- * doesn't decide to foul play and make the mft sparse which would be a really
- * horrible thing to do as it would trash our current driver implementation. )-:
- * Do I hear screams "we want 64-bit inodes!" ?!? (-;
- *
- * FIXME: The mft zone is defined as the first 12% of the volume. This space is
- * reserved so that the mft can grow contiguously and hence doesn't become
- * fragmented. Volume free space includes the empty part of the mft zone and
- * when the volume's free 88% are used up, the mft zone is shrunk by a factor
- * of 2, thus making more space available for more files/data. This process is
- * repeated every time there is no more free space except for the mft zone until
- * there really is no more free space.
- */
-
-/*
- * Typedef the MFT_REF as a 64-bit value for easier handling.
- * Also define two unpacking macros to get to the reference (MREF) and
- * sequence number (MSEQNO) respectively.
- * The _LE versions are to be applied on little endian MFT_REFs.
- * Note: The _LE versions will return a CPU endian formatted value!
- */
-#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
-#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
-
-typedef u64 MFT_REF;
-typedef le64 leMFT_REF;
-
-#define MK_MREF(m, s)	((MFT_REF)(((MFT_REF)(s) << 48) |		\
-					((MFT_REF)(m) & MFT_REF_MASK_CPU)))
-#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s))
-
-#define MREF(x)		((unsigned long)((x) & MFT_REF_MASK_CPU))
-#define MSEQNO(x)	((u16)(((x) >> 48) & 0xffff))
-#define MREF_LE(x)	((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU))
-#define MSEQNO_LE(x)	((u16)((le64_to_cpu(x) >> 48) & 0xffff))
-
-#define IS_ERR_MREF(x)	(((x) & 0x0000800000000000ULL) ? true : false)
-#define ERR_MREF(x)	((u64)((s64)(x)))
-#define MREF_ERR(x)	((int)((s64)(x)))
-
-/*
- * The mft record header present at the beginning of every record in the mft.
- * This is followed by a sequence of variable length attribute records which
- * is terminated by an attribute of type AT_END which is a truncated attribute
- * in that it only consists of the attribute type code AT_END and none of the
- * other members of the attribute structure are present.
- */
-typedef struct {
-/*Ofs*/
-/*  0	NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
-	NTFS_RECORD_TYPE magic;	/* Usually the magic is "FILE". */
-	le16 usa_ofs;		/* See NTFS_RECORD definition above. */
-	le16 usa_count;		/* See NTFS_RECORD definition above. */
-
-/*  8*/	le64 lsn;		/* $LogFile sequence number for this record.
-				   Changed every time the record is modified. */
-/* 16*/	le16 sequence_number;	/* Number of times this mft record has been
-				   reused. (See description for MFT_REF
-				   above.) NOTE: The increment (skipping zero)
-				   is done when the file is deleted. NOTE: If
-				   this is zero it is left zero. */
-/* 18*/	le16 link_count;	/* Number of hard links, i.e. the number of
-				   directory entries referencing this record.
-				   NOTE: Only used in mft base records.
-				   NOTE: When deleting a directory entry we
-				   check the link_count and if it is 1 we
-				   delete the file. Otherwise we delete the
-				   FILE_NAME_ATTR being referenced by the
-				   directory entry from the mft record and
-				   decrement the link_count.
-				   FIXME: Careful with Win32 + DOS names! */
-/* 20*/	le16 attrs_offset;	/* Byte offset to the first attribute in this
-				   mft record from the start of the mft record.
-				   NOTE: Must be aligned to 8-byte boundary. */
-/* 22*/	MFT_RECORD_FLAGS flags;	/* Bit array of MFT_RECORD_FLAGS. When a file
-				   is deleted, the MFT_RECORD_IN_USE flag is
-				   set to zero. */
-/* 24*/	le32 bytes_in_use;	/* Number of bytes used in this mft record.
-				   NOTE: Must be aligned to 8-byte boundary. */
-/* 28*/	le32 bytes_allocated;	/* Number of bytes allocated for this mft
-				   record. This should be equal to the mft
-				   record size. */
-/* 32*/	leMFT_REF base_mft_record;/* This is zero for base mft records.
-				   When it is not zero it is a mft reference
-				   pointing to the base mft record to which
-				   this record belongs (this is then used to
-				   locate the attribute list attribute present
-				   in the base record which describes this
-				   extension record and hence might need
-				   modification when the extension record
-				   itself is modified, also locating the
-				   attribute list also means finding the other
-				   potential extents, belonging to the non-base
-				   mft record). */
-/* 40*/	le16 next_attr_instance;/* The instance number that will be assigned to
-				   the next attribute added to this mft record.
-				   NOTE: Incremented each time after it is used.
-				   NOTE: Every time the mft record is reused
-				   this number is set to zero.  NOTE: The first
-				   instance number is always 0. */
-/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */
-/* 42*/ le16 reserved;		/* Reserved/alignment. */
-/* 44*/ le32 mft_record_number;	/* Number of this mft record. */
-/* sizeof() = 48 bytes */
-/*
- * When (re)using the mft record, we place the update sequence array at this
- * offset, i.e. before we start with the attributes.  This also makes sense,
- * otherwise we could run into problems with the update sequence array
- * containing in itself the last two bytes of a sector which would mean that
- * multi sector transfer protection wouldn't work.  As you can't protect data
- * by overwriting it since you then can't get it back...
- * When reading we obviously use the data from the ntfs record header.
- */
-} __attribute__ ((__packed__)) MFT_RECORD;
-
-/* This is the version without the NTFS 3.1+ specific fields. */
-typedef struct {
-/*Ofs*/
-/*  0	NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
-	NTFS_RECORD_TYPE magic;	/* Usually the magic is "FILE". */
-	le16 usa_ofs;		/* See NTFS_RECORD definition above. */
-	le16 usa_count;		/* See NTFS_RECORD definition above. */
-
-/*  8*/	le64 lsn;		/* $LogFile sequence number for this record.
-				   Changed every time the record is modified. */
-/* 16*/	le16 sequence_number;	/* Number of times this mft record has been
-				   reused. (See description for MFT_REF
-				   above.) NOTE: The increment (skipping zero)
-				   is done when the file is deleted. NOTE: If
-				   this is zero it is left zero. */
-/* 18*/	le16 link_count;	/* Number of hard links, i.e. the number of
-				   directory entries referencing this record.
-				   NOTE: Only used in mft base records.
-				   NOTE: When deleting a directory entry we
-				   check the link_count and if it is 1 we
-				   delete the file. Otherwise we delete the
-				   FILE_NAME_ATTR being referenced by the
-				   directory entry from the mft record and
-				   decrement the link_count.
-				   FIXME: Careful with Win32 + DOS names! */
-/* 20*/	le16 attrs_offset;	/* Byte offset to the first attribute in this
-				   mft record from the start of the mft record.
-				   NOTE: Must be aligned to 8-byte boundary. */
-/* 22*/	MFT_RECORD_FLAGS flags;	/* Bit array of MFT_RECORD_FLAGS. When a file
-				   is deleted, the MFT_RECORD_IN_USE flag is
-				   set to zero. */
-/* 24*/	le32 bytes_in_use;	/* Number of bytes used in this mft record.
-				   NOTE: Must be aligned to 8-byte boundary. */
-/* 28*/	le32 bytes_allocated;	/* Number of bytes allocated for this mft
-				   record. This should be equal to the mft
-				   record size. */
-/* 32*/	leMFT_REF base_mft_record;/* This is zero for base mft records.
-				   When it is not zero it is a mft reference
-				   pointing to the base mft record to which
-				   this record belongs (this is then used to
-				   locate the attribute list attribute present
-				   in the base record which describes this
-				   extension record and hence might need
-				   modification when the extension record
-				   itself is modified, also locating the
-				   attribute list also means finding the other
-				   potential extents, belonging to the non-base
-				   mft record). */
-/* 40*/	le16 next_attr_instance;/* The instance number that will be assigned to
-				   the next attribute added to this mft record.
-				   NOTE: Incremented each time after it is used.
-				   NOTE: Every time the mft record is reused
-				   this number is set to zero.  NOTE: The first
-				   instance number is always 0. */
-/* sizeof() = 42 bytes */
-/*
- * When (re)using the mft record, we place the update sequence array at this
- * offset, i.e. before we start with the attributes.  This also makes sense,
- * otherwise we could run into problems with the update sequence array
- * containing in itself the last two bytes of a sector which would mean that
- * multi sector transfer protection wouldn't work.  As you can't protect data
- * by overwriting it since you then can't get it back...
- * When reading we obviously use the data from the ntfs record header.
- */
-} __attribute__ ((__packed__)) MFT_RECORD_OLD;
-
-/*
- * System defined attributes (32-bit).  Each attribute type has a corresponding
- * attribute name (Unicode string of maximum 64 character length) as described
- * by the attribute definitions present in the data attribute of the $AttrDef
- * system file.  On NTFS 3.0 volumes the names are just as the types are named
- * in the below defines exchanging AT_ for the dollar sign ($).  If that is not
- * a revealing choice of symbol I do not know what is... (-;
- */
-enum {
-	AT_UNUSED			= cpu_to_le32(         0),
-	AT_STANDARD_INFORMATION		= cpu_to_le32(      0x10),
-	AT_ATTRIBUTE_LIST		= cpu_to_le32(      0x20),
-	AT_FILE_NAME			= cpu_to_le32(      0x30),
-	AT_OBJECT_ID			= cpu_to_le32(      0x40),
-	AT_SECURITY_DESCRIPTOR		= cpu_to_le32(      0x50),
-	AT_VOLUME_NAME			= cpu_to_le32(      0x60),
-	AT_VOLUME_INFORMATION		= cpu_to_le32(      0x70),
-	AT_DATA				= cpu_to_le32(      0x80),
-	AT_INDEX_ROOT			= cpu_to_le32(      0x90),
-	AT_INDEX_ALLOCATION		= cpu_to_le32(      0xa0),
-	AT_BITMAP			= cpu_to_le32(      0xb0),
-	AT_REPARSE_POINT		= cpu_to_le32(      0xc0),
-	AT_EA_INFORMATION		= cpu_to_le32(      0xd0),
-	AT_EA				= cpu_to_le32(      0xe0),
-	AT_PROPERTY_SET			= cpu_to_le32(      0xf0),
-	AT_LOGGED_UTILITY_STREAM	= cpu_to_le32(     0x100),
-	AT_FIRST_USER_DEFINED_ATTRIBUTE	= cpu_to_le32(    0x1000),
-	AT_END				= cpu_to_le32(0xffffffff)
-};
-
-typedef le32 ATTR_TYPE;
-
-/*
- * The collation rules for sorting views/indexes/etc (32-bit).
- *
- * COLLATION_BINARY - Collate by binary compare where the first byte is most
- *	significant.
- * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary
- *	Unicode values, except that when a character can be uppercased, the
- *	upper case value collates before the lower case one.
- * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation
- *	is done very much like COLLATION_UNICODE_STRING. In fact I have no idea
- *	what the difference is. Perhaps the difference is that file names
- *	would treat some special characters in an odd way (see
- *	unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[]
- *	for what I mean but COLLATION_UNICODE_STRING would not give any special
- *	treatment to any characters at all, but this is speculation.
- * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key
- *	values. E.g. used for $SII index in FILE_Secure, which sorts by
- *	security_id (le32).
- * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values.
- *	E.g. used for $O index in FILE_Extend/$Quota.
- * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash
- *	values and second by ascending security_id values. E.g. used for $SDH
- *	index in FILE_Secure.
- * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending
- *	le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which
- *	sorts by object_id (16-byte), by splitting up the object_id in four
- *	le32 values and using them as individual keys. E.g. take the following
- *	two security_ids, stored as follows on disk:
- *		1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59
- *		2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45
- *	To compare them, they are split into four le32 values each, like so:
- *		1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081
- *		2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179
- *	Now, it is apparent why the 2nd object_id collates after the 1st: the
- *	first le32 value of the 1st object_id is less than the first le32 of
- *	the 2nd object_id. If the first le32 values of both object_ids were
- *	equal then the second le32 values would be compared, etc.
- */
-enum {
-	COLLATION_BINARY		= cpu_to_le32(0x00),
-	COLLATION_FILE_NAME		= cpu_to_le32(0x01),
-	COLLATION_UNICODE_STRING	= cpu_to_le32(0x02),
-	COLLATION_NTOFS_ULONG		= cpu_to_le32(0x10),
-	COLLATION_NTOFS_SID		= cpu_to_le32(0x11),
-	COLLATION_NTOFS_SECURITY_HASH	= cpu_to_le32(0x12),
-	COLLATION_NTOFS_ULONGS		= cpu_to_le32(0x13),
-};
-
-typedef le32 COLLATION_RULE;
-
-/*
- * The flags (32-bit) describing attribute properties in the attribute
- * definition structure.  FIXME: This information is based on Regis's
- * information and, according to him, it is not certain and probably
- * incomplete.  The INDEXABLE flag is fairly certainly correct as only the file
- * name attribute has this flag set and this is the only attribute indexed in
- * NT4.
- */
-enum {
-	ATTR_DEF_INDEXABLE	= cpu_to_le32(0x02), /* Attribute can be
-					indexed. */
-	ATTR_DEF_MULTIPLE	= cpu_to_le32(0x04), /* Attribute type
-					can be present multiple times in the
-					mft records of an inode. */
-	ATTR_DEF_NOT_ZERO	= cpu_to_le32(0x08), /* Attribute value
-					must contain at least one non-zero
-					byte. */
-	ATTR_DEF_INDEXED_UNIQUE	= cpu_to_le32(0x10), /* Attribute must be
-					indexed and the attribute value must be
-					unique for the attribute type in all of
-					the mft records of an inode. */
-	ATTR_DEF_NAMED_UNIQUE	= cpu_to_le32(0x20), /* Attribute must be
-					named and the name must be unique for
-					the attribute type in all of the mft
-					records of an inode. */
-	ATTR_DEF_RESIDENT	= cpu_to_le32(0x40), /* Attribute must be
-					resident. */
-	ATTR_DEF_ALWAYS_LOG	= cpu_to_le32(0x80), /* Always log
-					modifications to this attribute,
-					regardless of whether it is resident or
-					non-resident.  Without this, only log
-					modifications if the attribute is
-					resident. */
-};
-
-typedef le32 ATTR_DEF_FLAGS;
-
-/*
- * The data attribute of FILE_AttrDef contains a sequence of attribute
- * definitions for the NTFS volume. With this, it is supposed to be safe for an
- * older NTFS driver to mount a volume containing a newer NTFS version without
- * damaging it (that's the theory. In practice it's: not damaging it too much).
- * Entries are sorted by attribute type. The flags describe whether the
- * attribute can be resident/non-resident and possibly other things, but the
- * actual bits are unknown.
- */
-typedef struct {
-/*hex ofs*/
-/*  0*/	ntfschar name[0x40];		/* Unicode name of the attribute. Zero
-					   terminated. */
-/* 80*/	ATTR_TYPE type;			/* Type of the attribute. */
-/* 84*/	le32 display_rule;		/* Default display rule.
-					   FIXME: What does it mean? (AIA) */
-/* 88*/ COLLATION_RULE collation_rule;	/* Default collation rule. */
-/* 8c*/	ATTR_DEF_FLAGS flags;		/* Flags describing the attribute. */
-/* 90*/	sle64 min_size;			/* Optional minimum attribute size. */
-/* 98*/	sle64 max_size;			/* Maximum size of attribute. */
-/* sizeof() = 0xa0 or 160 bytes */
-} __attribute__ ((__packed__)) ATTR_DEF;
-
-/*
- * Attribute flags (16-bit).
- */
-enum {
-	ATTR_IS_COMPRESSED    = cpu_to_le16(0x0001),
-	ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
-							      mask.  Also, first
-							      illegal value. */
-	ATTR_IS_ENCRYPTED     = cpu_to_le16(0x4000),
-	ATTR_IS_SPARSE	      = cpu_to_le16(0x8000),
-} __attribute__ ((__packed__));
-
-typedef le16 ATTR_FLAGS;
-
-/*
- * Attribute compression.
- *
- * Only the data attribute is ever compressed in the current ntfs driver in
- * Windows. Further, compression is only applied when the data attribute is
- * non-resident. Finally, to use compression, the maximum allowed cluster size
- * on a volume is 4kib.
- *
- * The compression method is based on independently compressing blocks of X
- * clusters, where X is determined from the compression_unit value found in the
- * non-resident attribute record header (more precisely: X = 2^compression_unit
- * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4).
- *
- * There are three different cases of how a compression block of X clusters
- * can be stored:
- *
- *   1) The data in the block is all zero (a sparse block):
- *	  This is stored as a sparse block in the runlist, i.e. the runlist
- *	  entry has length = X and lcn = -1. The mapping pairs array actually
- *	  uses a delta_lcn value length of 0, i.e. delta_lcn is not present at
- *	  all, which is then interpreted by the driver as lcn = -1.
- *	  NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then
- *	  the same principles apply as above, except that the length is not
- *	  restricted to being any particular value.
- *
- *   2) The data in the block is not compressed:
- *	  This happens when compression doesn't reduce the size of the block
- *	  in clusters. I.e. if compression has a small effect so that the
- *	  compressed data still occupies X clusters, then the uncompressed data
- *	  is stored in the block.
- *	  This case is recognised by the fact that the runlist entry has
- *	  length = X and lcn >= 0. The mapping pairs array stores this as
- *	  normal with a run length of X and some specific delta_lcn, i.e.
- *	  delta_lcn has to be present.
- *
- *   3) The data in the block is compressed:
- *	  The common case. This case is recognised by the fact that the run
- *	  list entry has length L < X and lcn >= 0. The mapping pairs array
- *	  stores this as normal with a run length of X and some specific
- *	  delta_lcn, i.e. delta_lcn has to be present. This runlist entry is
- *	  immediately followed by a sparse entry with length = X - L and
- *	  lcn = -1. The latter entry is to make up the vcn counting to the
- *	  full compression block size X.
- *
- * In fact, life is more complicated because adjacent entries of the same type
- * can be coalesced. This means that one has to keep track of the number of
- * clusters handled and work on a basis of X clusters at a time being one
- * block. An example: if length L > X this means that this particular runlist
- * entry contains a block of length X and part of one or more blocks of length
- * L - X. Another example: if length L < X, this does not necessarily mean that
- * the block is compressed as it might be that the lcn changes inside the block
- * and hence the following runlist entry describes the continuation of the
- * potentially compressed block. The block would be compressed if the
- * following runlist entry describes at least X - L sparse clusters, thus
- * making up the compression block length as described in point 3 above. (Of
- * course, there can be several runlist entries with small lengths so that the
- * sparse entry does not follow the first data containing entry with
- * length < X.)
- *
- * NOTE: At the end of the compressed attribute value, there most likely is not
- * just the right amount of data to make up a compression block, thus this data
- * is not even attempted to be compressed. It is just stored as is, unless
- * the number of clusters it occupies is reduced when compressed in which case
- * it is stored as a compressed compression block, complete with sparse
- * clusters at the end.
- */
-
-/*
- * Flags of resident attributes (8-bit).
- */
-enum {
-	RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index
-					    (has implications for deleting and
-					    modifying the attribute). */
-} __attribute__ ((__packed__));
-
-typedef u8 RESIDENT_ATTR_FLAGS;
-
-/*
- * Attribute record header. Always aligned to 8-byte boundary.
- */
-typedef struct {
-/*Ofs*/
-/*  0*/	ATTR_TYPE type;		/* The (32-bit) type of the attribute. */
-/*  4*/	le32 length;		/* Byte size of the resident part of the
-				   attribute (aligned to 8-byte boundary).
-				   Used to get to the next attribute. */
-/*  8*/	u8 non_resident;	/* If 0, attribute is resident.
-				   If 1, attribute is non-resident. */
-/*  9*/	u8 name_length;		/* Unicode character size of name of attribute.
-				   0 if unnamed. */
-/* 10*/	le16 name_offset;	/* If name_length != 0, the byte offset to the
-				   beginning of the name from the attribute
-				   record. Note that the name is stored as a
-				   Unicode string. When creating, place offset
-				   just at the end of the record header. Then,
-				   follow with attribute value or mapping pairs
-				   array, resident and non-resident attributes
-				   respectively, aligning to an 8-byte
-				   boundary. */
-/* 12*/	ATTR_FLAGS flags;	/* Flags describing the attribute. */
-/* 14*/	le16 instance;		/* The instance of this attribute record. This
-				   number is unique within this mft record (see
-				   MFT_RECORD/next_attribute_instance notes in
-				   mft.h for more details). */
-/* 16*/	union {
-		/* Resident attributes. */
-		struct {
-/* 16 */		le32 value_length;/* Byte size of attribute value. */
-/* 20 */		le16 value_offset;/* Byte offset of the attribute
-					     value from the start of the
-					     attribute record. When creating,
-					     align to 8-byte boundary if we
-					     have a name present as this might
-					     not have a length of a multiple
-					     of 8-bytes. */
-/* 22 */		RESIDENT_ATTR_FLAGS flags; /* See above. */
-/* 23 */		s8 reserved;	  /* Reserved/alignment to 8-byte
-					     boundary. */
-		} __attribute__ ((__packed__)) resident;
-		/* Non-resident attributes. */
-		struct {
-/* 16*/			leVCN lowest_vcn;/* Lowest valid virtual cluster number
-				for this portion of the attribute value or
-				0 if this is the only extent (usually the
-				case). - Only when an attribute list is used
-				does lowest_vcn != 0 ever occur. */
-/* 24*/			leVCN highest_vcn;/* Highest valid vcn of this extent of
-				the attribute value. - Usually there is only one
-				portion, so this usually equals the attribute
-				value size in clusters minus 1. Can be -1 for
-				zero length files. Can be 0 for "single extent"
-				attributes. */
-/* 32*/			le16 mapping_pairs_offset; /* Byte offset from the
-				beginning of the structure to the mapping pairs
-				array which contains the mappings between the
-				vcns and the logical cluster numbers (lcns).
-				When creating, place this at the end of this
-				record header aligned to 8-byte boundary. */
-/* 34*/			u8 compression_unit; /* The compression unit expressed
-				as the log to the base 2 of the number of
-				clusters in a compression unit.  0 means not
-				compressed.  (This effectively limits the
-				compression unit size to be a power of two
-				clusters.)  WinNT4 only uses a value of 4.
-				Sparse files have this set to 0 on XPSP2. */
-/* 35*/			u8 reserved[5];		/* Align to 8-byte boundary. */
-/* The sizes below are only used when lowest_vcn is zero, as otherwise it would
-   be difficult to keep them up-to-date.*/
-/* 40*/			sle64 allocated_size;	/* Byte size of disk space
-				allocated to hold the attribute value. Always
-				is a multiple of the cluster size. When a file
-				is compressed, this field is a multiple of the
-				compression block size (2^compression_unit) and
-				it represents the logically allocated space
-				rather than the actual on disk usage. For this
-				use the compressed_size (see below). */
-/* 48*/			sle64 data_size;	/* Byte size of the attribute
-				value. Can be larger than allocated_size if
-				attribute value is compressed or sparse. */
-/* 56*/			sle64 initialized_size;	/* Byte size of initialized
-				portion of the attribute value. Usually equals
-				data_size. */
-/* sizeof(uncompressed attr) = 64*/
-/* 64*/			sle64 compressed_size;	/* Byte size of the attribute
-				value after compression.  Only present when
-				compressed or sparse.  Always is a multiple of
-				the cluster size.  Represents the actual amount
-				of disk space being used on the disk. */
-/* sizeof(compressed attr) = 72*/
-		} __attribute__ ((__packed__)) non_resident;
-	} __attribute__ ((__packed__)) data;
-} __attribute__ ((__packed__)) ATTR_RECORD;
-
-typedef ATTR_RECORD ATTR_REC;
-
-/*
- * File attribute flags (32-bit) appearing in the file_attributes fields of the
- * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR
- * attributes of MFT_RECORDs and directory index entries.
- *
- * All of the below flags appear in the directory index entries but only some
- * appear in the STANDARD_INFORMATION attribute whilst only some others appear
- * in the FILENAME_ATTR attribute of MFT_RECORDs.  Unless otherwise stated the
- * flags appear in all of the above.
- */
-enum {
-	FILE_ATTR_READONLY		= cpu_to_le32(0x00000001),
-	FILE_ATTR_HIDDEN		= cpu_to_le32(0x00000002),
-	FILE_ATTR_SYSTEM		= cpu_to_le32(0x00000004),
-	/* Old DOS volid. Unused in NT.	= cpu_to_le32(0x00000008), */
-
-	FILE_ATTR_DIRECTORY		= cpu_to_le32(0x00000010),
-	/* Note, FILE_ATTR_DIRECTORY is not considered valid in NT.  It is
-	   reserved for the DOS SUBDIRECTORY flag. */
-	FILE_ATTR_ARCHIVE		= cpu_to_le32(0x00000020),
-	FILE_ATTR_DEVICE		= cpu_to_le32(0x00000040),
-	FILE_ATTR_NORMAL		= cpu_to_le32(0x00000080),
-
-	FILE_ATTR_TEMPORARY		= cpu_to_le32(0x00000100),
-	FILE_ATTR_SPARSE_FILE		= cpu_to_le32(0x00000200),
-	FILE_ATTR_REPARSE_POINT		= cpu_to_le32(0x00000400),
-	FILE_ATTR_COMPRESSED		= cpu_to_le32(0x00000800),
-
-	FILE_ATTR_OFFLINE		= cpu_to_le32(0x00001000),
-	FILE_ATTR_NOT_CONTENT_INDEXED	= cpu_to_le32(0x00002000),
-	FILE_ATTR_ENCRYPTED		= cpu_to_le32(0x00004000),
-
-	FILE_ATTR_VALID_FLAGS		= cpu_to_le32(0x00007fb7),
-	/* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
-	   FILE_ATTR_DEVICE and preserves everything else.  This mask is used
-	   to obtain all flags that are valid for reading. */
-	FILE_ATTR_VALID_SET_FLAGS	= cpu_to_le32(0x000031a7),
-	/* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
-	   F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
-	   F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest.  This mask
-	   is used to obtain all flags that are valid for setting. */
-	/*
-	 * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all
-	 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
-	 * attribute of an mft record.
-	 */
-	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT	= cpu_to_le32(0x10000000),
-	/* Note, this is a copy of the corresponding bit from the mft record,
-	   telling us whether this is a directory or not, i.e. whether it has
-	   an index root attribute or not. */
-	FILE_ATTR_DUP_VIEW_INDEX_PRESENT	= cpu_to_le32(0x20000000),
-	/* Note, this is a copy of the corresponding bit from the mft record,
-	   telling us whether this file has a view index present (eg. object id
-	   index, quota index, one of the security indexes or the encrypting
-	   filesystem related indexes). */
-};
-
-typedef le32 FILE_ATTR_FLAGS;
-
-/*
- * NOTE on times in NTFS: All times are in MS standard time format, i.e. they
- * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00
- * universal coordinated time (UTC). (In Linux time starts 1st January 1970,
- * 00:00:00 UTC and is stored as the number of 1-second intervals since then.)
- */
-
-/*
- * Attribute: Standard information (0x10).
- *
- * NOTE: Always resident.
- * NOTE: Present in all base file records on a volume.
- * NOTE: There is conflicting information about the meaning of each of the time
- *	 fields but the meaning as defined below has been verified to be
- *	 correct by practical experimentation on Windows NT4 SP6a and is hence
- *	 assumed to be the one and only correct interpretation.
- */
-typedef struct {
-/*Ofs*/
-/*  0*/	sle64 creation_time;		/* Time file was created. Updated when
-					   a filename is changed(?). */
-/*  8*/	sle64 last_data_change_time;	/* Time the data attribute was last
-					   modified. */
-/* 16*/	sle64 last_mft_change_time;	/* Time this mft record was last
-					   modified. */
-/* 24*/	sle64 last_access_time;		/* Approximate time when the file was
-					   last accessed (obviously this is not
-					   updated on read-only volumes). In
-					   Windows this is only updated when
-					   accessed if some time delta has
-					   passed since the last update. Also,
-					   last access time updates can be
-					   disabled altogether for speed. */
-/* 32*/	FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
-/* 36*/	union {
-	/* NTFS 1.2 */
-		struct {
-		/* 36*/	u8 reserved12[12];	/* Reserved/alignment to 8-byte
-						   boundary. */
-		} __attribute__ ((__packed__)) v1;
-	/* sizeof() = 48 bytes */
-	/* NTFS 3.x */
-		struct {
-/*
- * If a volume has been upgraded from a previous NTFS version, then these
- * fields are present only if the file has been accessed since the upgrade.
- * Recognize the difference by comparing the length of the resident attribute
- * value. If it is 48, then the following fields are missing. If it is 72 then
- * the fields are present. Maybe just check like this:
- *	if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) {
- *		Assume NTFS 1.2- format.
- *		If (volume version is 3.x)
- *			Upgrade attribute to NTFS 3.x format.
- *		else
- *			Use NTFS 1.2- format for access.
- *	} else
- *		Use NTFS 3.x format for access.
- * Only problem is that it might be legal to set the length of the value to
- * arbitrarily large values thus spoiling this check. - But chkdsk probably
- * views that as a corruption, assuming that it behaves like this for all
- * attributes.
- */
-		/* 36*/	le32 maximum_versions;	/* Maximum allowed versions for
-				file. Zero if version numbering is disabled. */
-		/* 40*/	le32 version_number;	/* This file's version (if any).
-				Set to zero if maximum_versions is zero. */
-		/* 44*/	le32 class_id;		/* Class id from bidirectional
-				class id index (?). */
-		/* 48*/	le32 owner_id;		/* Owner_id of the user owning
-				the file. Translate via $Q index in FILE_Extend
-				/$Quota to the quota control entry for the user
-				owning the file. Zero if quotas are disabled. */
-		/* 52*/	le32 security_id;	/* Security_id for the file.
-				Translate via $SII index and $SDS data stream
-				in FILE_Secure to the security descriptor. */
-		/* 56*/	le64 quota_charged;	/* Byte size of the charge to
-				the quota for all streams of the file. Note: Is
-				zero if quotas are disabled. */
-		/* 64*/	leUSN usn;		/* Last update sequence number
-				of the file.  This is a direct index into the
-				transaction log file ($UsnJrnl).  It is zero if
-				the usn journal is disabled or this file has
-				not been subject to logging yet.  See usnjrnl.h
-				for details. */
-		} __attribute__ ((__packed__)) v3;
-	/* sizeof() = 72 bytes (NTFS 3.x) */
-	} __attribute__ ((__packed__)) ver;
-} __attribute__ ((__packed__)) STANDARD_INFORMATION;
-
-/*
- * Attribute: Attribute list (0x20).
- *
- * - Can be either resident or non-resident.
- * - Value consists of a sequence of variable length, 8-byte aligned,
- * ATTR_LIST_ENTRY records.
- * - The list is not terminated by anything at all! The only way to know when
- * the end is reached is to keep track of the current offset and compare it to
- * the attribute value size.
- * - The attribute list attribute contains one entry for each attribute of
- * the file in which the list is located, except for the list attribute
- * itself. The list is sorted: first by attribute type, second by attribute
- * name (if present), third by instance number. The extents of one
- * non-resident attribute (if present) immediately follow after the initial
- * extent. They are ordered by lowest_vcn and have their instace set to zero.
- * It is not allowed to have two attributes with all sorting keys equal.
- * - Further restrictions:
- *	- If not resident, the vcn to lcn mapping array has to fit inside the
- *	  base mft record.
- *	- The attribute list attribute value has a maximum size of 256kb. This
- *	  is imposed by the Windows cache manager.
- * - Attribute lists are only used when the attributes of mft record do not
- * fit inside the mft record despite all attributes (that can be made
- * non-resident) having been made non-resident. This can happen e.g. when:
- *	- File has a large number of hard links (lots of file name
- *	  attributes present).
- *	- The mapping pairs array of some non-resident attribute becomes so
- *	  large due to fragmentation that it overflows the mft record.
- *	- The security descriptor is very complex (not applicable to
- *	  NTFS 3.0 volumes).
- *	- There are many named streams.
- */
-typedef struct {
-/*Ofs*/
-/*  0*/	ATTR_TYPE type;		/* Type of referenced attribute. */
-/*  4*/	le16 length;		/* Byte size of this entry (8-byte aligned). */
-/*  6*/	u8 name_length;		/* Size in Unicode chars of the name of the
-				   attribute or 0 if unnamed. */
-/*  7*/	u8 name_offset;		/* Byte offset to beginning of attribute name
-				   (always set this to where the name would
-				   start even if unnamed). */
-/*  8*/	leVCN lowest_vcn;	/* Lowest virtual cluster number of this portion
-				   of the attribute value. This is usually 0. It
-				   is non-zero for the case where one attribute
-				   does not fit into one mft record and thus
-				   several mft records are allocated to hold
-				   this attribute. In the latter case, each mft
-				   record holds one extent of the attribute and
-				   there is one attribute list entry for each
-				   extent. NOTE: This is DEFINITELY a signed
-				   value! The windows driver uses cmp, followed
-				   by jg when comparing this, thus it treats it
-				   as signed. */
-/* 16*/	leMFT_REF mft_reference;/* The reference of the mft record holding
-				   the ATTR_RECORD for this portion of the
-				   attribute value. */
-/* 24*/	le16 instance;		/* If lowest_vcn = 0, the instance of the
-				   attribute being referenced; otherwise 0. */
-/* 26*/	ntfschar name[0];	/* Use when creating only. When reading use
-				   name_offset to determine the location of the
-				   name. */
-/* sizeof() = 26 + (attribute_name_length * 2) bytes */
-} __attribute__ ((__packed__)) ATTR_LIST_ENTRY;
-
-/*
- * The maximum allowed length for a file name.
- */
-#define MAXIMUM_FILE_NAME_LENGTH	255
-
-/*
- * Possible namespaces for filenames in ntfs (8-bit).
- */
-enum {
-	FILE_NAME_POSIX		= 0x00,
-	/* This is the largest namespace. It is case sensitive and allows all
-	   Unicode characters except for: '\0' and '/'.  Beware that in
-	   WinNT/2k/2003 by default files which eg have the same name except
-	   for their case will not be distinguished by the standard utilities
-	   and thus a "del filename" will delete both "filename" and "fileName"
-	   without warning.  However if for example Services For Unix (SFU) are
-	   installed and the case sensitive option was enabled at installation
-	   time, then you can create/access/delete such files.
-	   Note that even SFU places restrictions on the filenames beyond the
-	   '\0' and '/' and in particular the following set of characters is
-	   not allowed: '"', '/', '<', '>', '\'.  All other characters,
-	   including the ones no allowed in WIN32 namespace are allowed.
-	   Tested with SFU 3.5 (this is now free) running on Windows XP. */
-	FILE_NAME_WIN32		= 0x01,
-	/* The standard WinNT/2k NTFS long filenames. Case insensitive.  All
-	   Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\',
-	   and '|'.  Further, names cannot end with a '.' or a space. */
-	FILE_NAME_DOS		= 0x02,
-	/* The standard DOS filenames (8.3 format). Uppercase only.  All 8-bit
-	   characters greater space, except: '"', '*', '+', ',', '/', ':', ';',
-	   '<', '=', '>', '?', and '\'. */
-	FILE_NAME_WIN32_AND_DOS	= 0x03,
-	/* 3 means that both the Win32 and the DOS filenames are identical and
-	   hence have been saved in this single filename record. */
-} __attribute__ ((__packed__));
-
-typedef u8 FILE_NAME_TYPE_FLAGS;
-
-/*
- * Attribute: Filename (0x30).
- *
- * NOTE: Always resident.
- * NOTE: All fields, except the parent_directory, are only updated when the
- *	 filename is changed. Until then, they just become out of sync with
- *	 reality and the more up to date values are present in the standard
- *	 information attribute.
- * NOTE: There is conflicting information about the meaning of each of the time
- *	 fields but the meaning as defined below has been verified to be
- *	 correct by practical experimentation on Windows NT4 SP6a and is hence
- *	 assumed to be the one and only correct interpretation.
- */
-typedef struct {
-/*hex ofs*/
-/*  0*/	leMFT_REF parent_directory;	/* Directory this filename is
-					   referenced from. */
-/*  8*/	sle64 creation_time;		/* Time file was created. */
-/* 10*/	sle64 last_data_change_time;	/* Time the data attribute was last
-					   modified. */
-/* 18*/	sle64 last_mft_change_time;	/* Time this mft record was last
-					   modified. */
-/* 20*/	sle64 last_access_time;		/* Time this mft record was last
-					   accessed. */
-/* 28*/	sle64 allocated_size;		/* Byte size of on-disk allocated space
-					   for the unnamed data attribute.  So
-					   for normal $DATA, this is the
-					   allocated_size from the unnamed
-					   $DATA attribute and for compressed
-					   and/or sparse $DATA, this is the
-					   compressed_size from the unnamed
-					   $DATA attribute.  For a directory or
-					   other inode without an unnamed $DATA
-					   attribute, this is always 0.  NOTE:
-					   This is a multiple of the cluster
-					   size. */
-/* 30*/	sle64 data_size;		/* Byte size of actual data in unnamed
-					   data attribute.  For a directory or
-					   other inode without an unnamed $DATA
-					   attribute, this is always 0. */
-/* 38*/	FILE_ATTR_FLAGS file_attributes;	/* Flags describing the file. */
-/* 3c*/	union {
-	/* 3c*/	struct {
-		/* 3c*/	le16 packed_ea_size;	/* Size of the buffer needed to
-						   pack the extended attributes
-						   (EAs), if such are present.*/
-		/* 3e*/	le16 reserved;		/* Reserved for alignment. */
-		} __attribute__ ((__packed__)) ea;
-	/* 3c*/	struct {
-		/* 3c*/	le32 reparse_point_tag;	/* Type of reparse point,
-						   present only in reparse
-						   points and only if there are
-						   no EAs. */
-		} __attribute__ ((__packed__)) rp;
-	} __attribute__ ((__packed__)) type;
-/* 40*/	u8 file_name_length;			/* Length of file name in
-						   (Unicode) characters. */
-/* 41*/	FILE_NAME_TYPE_FLAGS file_name_type;	/* Namespace of the file name.*/
-/* 42*/	ntfschar file_name[0];			/* File name in Unicode. */
-} __attribute__ ((__packed__)) FILE_NAME_ATTR;
-
-/*
- * GUID structures store globally unique identifiers (GUID). A GUID is a
- * 128-bit value consisting of one group of eight hexadecimal digits, followed
- * by three groups of four hexadecimal digits each, followed by one group of
- * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the
- * distributed computing environment (DCE) universally unique identifier (UUID).
- * Example of a GUID:
- *	1F010768-5A73-BC91-0010A52216A7
- */
-typedef struct {
-	le32 data1;	/* The first eight hexadecimal digits of the GUID. */
-	le16 data2;	/* The first group of four hexadecimal digits. */
-	le16 data3;	/* The second group of four hexadecimal digits. */
-	u8 data4[8];	/* The first two bytes are the third group of four
-			   hexadecimal digits. The remaining six bytes are the
-			   final 12 hexadecimal digits. */
-} __attribute__ ((__packed__)) GUID;
-
-/*
- * FILE_Extend/$ObjId contains an index named $O. This index contains all
- * object_ids present on the volume as the index keys and the corresponding
- * mft_record numbers as the index entry data parts. The data part (defined
- * below) also contains three other object_ids:
- *	birth_volume_id - object_id of FILE_Volume on which the file was first
- *			  created. Optional (i.e. can be zero).
- *	birth_object_id - object_id of file when it was first created. Usually
- *			  equals the object_id. Optional (i.e. can be zero).
- *	domain_id	- Reserved (always zero).
- */
-typedef struct {
-	leMFT_REF mft_reference;/* Mft record containing the object_id in
-				   the index entry key. */
-	union {
-		struct {
-			GUID birth_volume_id;
-			GUID birth_object_id;
-			GUID domain_id;
-		} __attribute__ ((__packed__)) origin;
-		u8 extended_info[48];
-	} __attribute__ ((__packed__)) opt;
-} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA;
-
-/*
- * Attribute: Object id (NTFS 3.0+) (0x40).
- *
- * NOTE: Always resident.
- */
-typedef struct {
-	GUID object_id;				/* Unique id assigned to the
-						   file.*/
-	/* The following fields are optional. The attribute value size is 16
-	   bytes, i.e. sizeof(GUID), if these are not present at all. Note,
-	   the entries can be present but one or more (or all) can be zero
-	   meaning that that particular value(s) is(are) not defined. */
-	union {
-		struct {
-			GUID birth_volume_id;	/* Unique id of volume on which
-						   the file was first created.*/
-			GUID birth_object_id;	/* Unique id of file when it was
-						   first created. */
-			GUID domain_id;		/* Reserved, zero. */
-		} __attribute__ ((__packed__)) origin;
-		u8 extended_info[48];
-	} __attribute__ ((__packed__)) opt;
-} __attribute__ ((__packed__)) OBJECT_ID_ATTR;
-
-/*
- * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in
- * the SID structure (see below).
- */
-//typedef enum {					/* SID string prefix. */
-//	SECURITY_NULL_SID_AUTHORITY	= {0, 0, 0, 0, 0, 0},	/* S-1-0 */
-//	SECURITY_WORLD_SID_AUTHORITY	= {0, 0, 0, 0, 0, 1},	/* S-1-1 */
-//	SECURITY_LOCAL_SID_AUTHORITY	= {0, 0, 0, 0, 0, 2},	/* S-1-2 */
-//	SECURITY_CREATOR_SID_AUTHORITY	= {0, 0, 0, 0, 0, 3},	/* S-1-3 */
-//	SECURITY_NON_UNIQUE_AUTHORITY	= {0, 0, 0, 0, 0, 4},	/* S-1-4 */
-//	SECURITY_NT_SID_AUTHORITY	= {0, 0, 0, 0, 0, 5},	/* S-1-5 */
-//} IDENTIFIER_AUTHORITIES;
-
-/*
- * These relative identifiers (RIDs) are used with the above identifier
- * authorities to make up universal well-known SIDs.
- *
- * Note: The relative identifier (RID) refers to the portion of a SID, which
- * identifies a user or group in relation to the authority that issued the SID.
- * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is
- * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and
- * the relative identifier SECURITY_CREATOR_OWNER_RID (0).
- */
-typedef enum {					/* Identifier authority. */
-	SECURITY_NULL_RID		  = 0,	/* S-1-0 */
-	SECURITY_WORLD_RID		  = 0,	/* S-1-1 */
-	SECURITY_LOCAL_RID		  = 0,	/* S-1-2 */
-
-	SECURITY_CREATOR_OWNER_RID	  = 0,	/* S-1-3 */
-	SECURITY_CREATOR_GROUP_RID	  = 1,	/* S-1-3 */
-
-	SECURITY_CREATOR_OWNER_SERVER_RID = 2,	/* S-1-3 */
-	SECURITY_CREATOR_GROUP_SERVER_RID = 3,	/* S-1-3 */
-
-	SECURITY_DIALUP_RID		  = 1,
-	SECURITY_NETWORK_RID		  = 2,
-	SECURITY_BATCH_RID		  = 3,
-	SECURITY_INTERACTIVE_RID	  = 4,
-	SECURITY_SERVICE_RID		  = 6,
-	SECURITY_ANONYMOUS_LOGON_RID	  = 7,
-	SECURITY_PROXY_RID		  = 8,
-	SECURITY_ENTERPRISE_CONTROLLERS_RID=9,
-	SECURITY_SERVER_LOGON_RID	  = 9,
-	SECURITY_PRINCIPAL_SELF_RID	  = 0xa,
-	SECURITY_AUTHENTICATED_USER_RID	  = 0xb,
-	SECURITY_RESTRICTED_CODE_RID	  = 0xc,
-	SECURITY_TERMINAL_SERVER_RID	  = 0xd,
-
-	SECURITY_LOGON_IDS_RID		  = 5,
-	SECURITY_LOGON_IDS_RID_COUNT	  = 3,
-
-	SECURITY_LOCAL_SYSTEM_RID	  = 0x12,
-
-	SECURITY_NT_NON_UNIQUE		  = 0x15,
-
-	SECURITY_BUILTIN_DOMAIN_RID	  = 0x20,
-
-	/*
-	 * Well-known domain relative sub-authority values (RIDs).
-	 */
-
-	/* Users. */
-	DOMAIN_USER_RID_ADMIN		  = 0x1f4,
-	DOMAIN_USER_RID_GUEST		  = 0x1f5,
-	DOMAIN_USER_RID_KRBTGT		  = 0x1f6,
-
-	/* Groups. */
-	DOMAIN_GROUP_RID_ADMINS		  = 0x200,
-	DOMAIN_GROUP_RID_USERS		  = 0x201,
-	DOMAIN_GROUP_RID_GUESTS		  = 0x202,
-	DOMAIN_GROUP_RID_COMPUTERS	  = 0x203,
-	DOMAIN_GROUP_RID_CONTROLLERS	  = 0x204,
-	DOMAIN_GROUP_RID_CERT_ADMINS	  = 0x205,
-	DOMAIN_GROUP_RID_SCHEMA_ADMINS	  = 0x206,
-	DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207,
-	DOMAIN_GROUP_RID_POLICY_ADMINS	  = 0x208,
-
-	/* Aliases. */
-	DOMAIN_ALIAS_RID_ADMINS		  = 0x220,
-	DOMAIN_ALIAS_RID_USERS		  = 0x221,
-	DOMAIN_ALIAS_RID_GUESTS		  = 0x222,
-	DOMAIN_ALIAS_RID_POWER_USERS	  = 0x223,
-
-	DOMAIN_ALIAS_RID_ACCOUNT_OPS	  = 0x224,
-	DOMAIN_ALIAS_RID_SYSTEM_OPS	  = 0x225,
-	DOMAIN_ALIAS_RID_PRINT_OPS	  = 0x226,
-	DOMAIN_ALIAS_RID_BACKUP_OPS	  = 0x227,
-
-	DOMAIN_ALIAS_RID_REPLICATOR	  = 0x228,
-	DOMAIN_ALIAS_RID_RAS_SERVERS	  = 0x229,
-	DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a,
-} RELATIVE_IDENTIFIERS;
-
-/*
- * The universal well-known SIDs:
- *
- *	NULL_SID			S-1-0-0
- *	WORLD_SID			S-1-1-0
- *	LOCAL_SID			S-1-2-0
- *	CREATOR_OWNER_SID		S-1-3-0
- *	CREATOR_GROUP_SID		S-1-3-1
- *	CREATOR_OWNER_SERVER_SID	S-1-3-2
- *	CREATOR_GROUP_SERVER_SID	S-1-3-3
- *
- *	(Non-unique IDs)		S-1-4
- *
- * NT well-known SIDs:
- *
- *	NT_AUTHORITY_SID	S-1-5
- *	DIALUP_SID		S-1-5-1
- *
- *	NETWORD_SID		S-1-5-2
- *	BATCH_SID		S-1-5-3
- *	INTERACTIVE_SID		S-1-5-4
- *	SERVICE_SID		S-1-5-6
- *	ANONYMOUS_LOGON_SID	S-1-5-7		(aka null logon session)
- *	PROXY_SID		S-1-5-8
- *	SERVER_LOGON_SID	S-1-5-9		(aka domain controller account)
- *	SELF_SID		S-1-5-10	(self RID)
- *	AUTHENTICATED_USER_SID	S-1-5-11
- *	RESTRICTED_CODE_SID	S-1-5-12	(running restricted code)
- *	TERMINAL_SERVER_SID	S-1-5-13	(running on terminal server)
- *
- *	(Logon IDs)		S-1-5-5-X-Y
- *
- *	(NT non-unique IDs)	S-1-5-0x15-...
- *
- *	(Built-in domain)	S-1-5-0x20
- */
-
-/*
- * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure.
- *
- * NOTE: This is stored as a big endian number, hence the high_part comes
- * before the low_part.
- */
-typedef union {
-	struct {
-		u16 high_part;	/* High 16-bits. */
-		u32 low_part;	/* Low 32-bits. */
-	} __attribute__ ((__packed__)) parts;
-	u8 value[6];		/* Value as individual bytes. */
-} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY;
-
-/*
- * The SID structure is a variable-length structure used to uniquely identify
- * users or groups. SID stands for security identifier.
- *
- * The standard textual representation of the SID is of the form:
- *	S-R-I-S-S...
- * Where:
- *    - The first "S" is the literal character 'S' identifying the following
- *	digits as a SID.
- *    - R is the revision level of the SID expressed as a sequence of digits
- *	either in decimal or hexadecimal (if the later, prefixed by "0x").
- *    - I is the 48-bit identifier_authority, expressed as digits as R above.
- *    - S... is one or more sub_authority values, expressed as digits as above.
- *
- * Example SID; the domain-relative SID of the local Administrators group on
- * Windows NT/2k:
- *	S-1-5-32-544
- * This translates to a SID with:
- *	revision = 1,
- *	sub_authority_count = 2,
- *	identifier_authority = {0,0,0,0,0,5},	// SECURITY_NT_AUTHORITY
- *	sub_authority[0] = 32,			// SECURITY_BUILTIN_DOMAIN_RID
- *	sub_authority[1] = 544			// DOMAIN_ALIAS_RID_ADMINS
- */
-typedef struct {
-	u8 revision;
-	u8 sub_authority_count;
-	SID_IDENTIFIER_AUTHORITY identifier_authority;
-	le32 sub_authority[1];		/* At least one sub_authority. */
-} __attribute__ ((__packed__)) SID;
-
-/*
- * Current constants for SIDs.
- */
-typedef enum {
-	SID_REVISION			=  1,	/* Current revision level. */
-	SID_MAX_SUB_AUTHORITIES		= 15,	/* Maximum number of those. */
-	SID_RECOMMENDED_SUB_AUTHORITIES	=  1,	/* Will change to around 6 in
-						   a future revision. */
-} SID_CONSTANTS;
-
-/*
- * The predefined ACE types (8-bit, see below).
- */
-enum {
-	ACCESS_MIN_MS_ACE_TYPE		= 0,
-	ACCESS_ALLOWED_ACE_TYPE		= 0,
-	ACCESS_DENIED_ACE_TYPE		= 1,
-	SYSTEM_AUDIT_ACE_TYPE		= 2,
-	SYSTEM_ALARM_ACE_TYPE		= 3, /* Not implemented as of Win2k. */
-	ACCESS_MAX_MS_V2_ACE_TYPE	= 3,
-
-	ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4,
-	ACCESS_MAX_MS_V3_ACE_TYPE	= 4,
-
-	/* The following are Win2k only. */
-	ACCESS_MIN_MS_OBJECT_ACE_TYPE	= 5,
-	ACCESS_ALLOWED_OBJECT_ACE_TYPE	= 5,
-	ACCESS_DENIED_OBJECT_ACE_TYPE	= 6,
-	SYSTEM_AUDIT_OBJECT_ACE_TYPE	= 7,
-	SYSTEM_ALARM_OBJECT_ACE_TYPE	= 8,
-	ACCESS_MAX_MS_OBJECT_ACE_TYPE	= 8,
-
-	ACCESS_MAX_MS_V4_ACE_TYPE	= 8,
-
-	/* This one is for WinNT/2k. */
-	ACCESS_MAX_MS_ACE_TYPE		= 8,
-} __attribute__ ((__packed__));
-
-typedef u8 ACE_TYPES;
-
-/*
- * The ACE flags (8-bit) for audit and inheritance (see below).
- *
- * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE
- * types to indicate that a message is generated (in Windows!) for successful
- * accesses.
- *
- * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types
- * to indicate that a message is generated (in Windows!) for failed accesses.
- */
-enum {
-	/* The inheritance flags. */
-	OBJECT_INHERIT_ACE		= 0x01,
-	CONTAINER_INHERIT_ACE		= 0x02,
-	NO_PROPAGATE_INHERIT_ACE	= 0x04,
-	INHERIT_ONLY_ACE		= 0x08,
-	INHERITED_ACE			= 0x10,	/* Win2k only. */
-	VALID_INHERIT_FLAGS		= 0x1f,
-
-	/* The audit flags. */
-	SUCCESSFUL_ACCESS_ACE_FLAG	= 0x40,
-	FAILED_ACCESS_ACE_FLAG		= 0x80,
-} __attribute__ ((__packed__));
-
-typedef u8 ACE_FLAGS;
-
-/*
- * An ACE is an access-control entry in an access-control list (ACL).
- * An ACE defines access to an object for a specific user or group or defines
- * the types of access that generate system-administration messages or alarms
- * for a specific user or group. The user or group is identified by a security
- * identifier (SID).
- *
- * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary),
- * which specifies the type and size of the ACE. The format of the subsequent
- * data depends on the ACE type.
- */
-typedef struct {
-/*Ofs*/
-/*  0*/	ACE_TYPES type;		/* Type of the ACE. */
-/*  1*/	ACE_FLAGS flags;	/* Flags describing the ACE. */
-/*  2*/	le16 size;		/* Size in bytes of the ACE. */
-} __attribute__ ((__packed__)) ACE_HEADER;
-
-/*
- * The access mask (32-bit). Defines the access rights.
- *
- * The specific rights (bits 0 to 15).  These depend on the type of the object
- * being secured by the ACE.
- */
-enum {
-	/* Specific rights for files and directories are as follows: */
-
-	/* Right to read data from the file. (FILE) */
-	FILE_READ_DATA			= cpu_to_le32(0x00000001),
-	/* Right to list contents of a directory. (DIRECTORY) */
-	FILE_LIST_DIRECTORY		= cpu_to_le32(0x00000001),
-
-	/* Right to write data to the file. (FILE) */
-	FILE_WRITE_DATA			= cpu_to_le32(0x00000002),
-	/* Right to create a file in the directory. (DIRECTORY) */
-	FILE_ADD_FILE			= cpu_to_le32(0x00000002),
-
-	/* Right to append data to the file. (FILE) */
-	FILE_APPEND_DATA		= cpu_to_le32(0x00000004),
-	/* Right to create a subdirectory. (DIRECTORY) */
-	FILE_ADD_SUBDIRECTORY		= cpu_to_le32(0x00000004),
-
-	/* Right to read extended attributes. (FILE/DIRECTORY) */
-	FILE_READ_EA			= cpu_to_le32(0x00000008),
-
-	/* Right to write extended attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_EA			= cpu_to_le32(0x00000010),
-
-	/* Right to execute a file. (FILE) */
-	FILE_EXECUTE			= cpu_to_le32(0x00000020),
-	/* Right to traverse the directory. (DIRECTORY) */
-	FILE_TRAVERSE			= cpu_to_le32(0x00000020),
-
-	/*
-	 * Right to delete a directory and all the files it contains (its
-	 * children), even if the files are read-only. (DIRECTORY)
-	 */
-	FILE_DELETE_CHILD		= cpu_to_le32(0x00000040),
-
-	/* Right to read file attributes. (FILE/DIRECTORY) */
-	FILE_READ_ATTRIBUTES		= cpu_to_le32(0x00000080),
-
-	/* Right to change file attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_ATTRIBUTES		= cpu_to_le32(0x00000100),
-
-	/*
-	 * The standard rights (bits 16 to 23).  These are independent of the
-	 * type of object being secured.
-	 */
-
-	/* Right to delete the object. */
-	DELETE				= cpu_to_le32(0x00010000),
-
-	/*
-	 * Right to read the information in the object's security descriptor,
-	 * not including the information in the SACL, i.e. right to read the
-	 * security descriptor and owner.
-	 */
-	READ_CONTROL			= cpu_to_le32(0x00020000),
-
-	/* Right to modify the DACL in the object's security descriptor. */
-	WRITE_DAC			= cpu_to_le32(0x00040000),
-
-	/* Right to change the owner in the object's security descriptor. */
-	WRITE_OWNER			= cpu_to_le32(0x00080000),
-
-	/*
-	 * Right to use the object for synchronization.  Enables a process to
-	 * wait until the object is in the signalled state.  Some object types
-	 * do not support this access right.
-	 */
-	SYNCHRONIZE			= cpu_to_le32(0x00100000),
-
-	/*
-	 * The following STANDARD_RIGHTS_* are combinations of the above for
-	 * convenience and are defined by the Win32 API.
-	 */
-
-	/* These are currently defined to READ_CONTROL. */
-	STANDARD_RIGHTS_READ		= cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_WRITE		= cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_EXECUTE		= cpu_to_le32(0x00020000),
-
-	/* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
-	STANDARD_RIGHTS_REQUIRED	= cpu_to_le32(0x000f0000),
-
-	/*
-	 * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
-	 * SYNCHRONIZE access.
-	 */
-	STANDARD_RIGHTS_ALL		= cpu_to_le32(0x001f0000),
-
-	/*
-	 * The access system ACL and maximum allowed access types (bits 24 to
-	 * 25, bits 26 to 27 are reserved).
-	 */
-	ACCESS_SYSTEM_SECURITY		= cpu_to_le32(0x01000000),
-	MAXIMUM_ALLOWED			= cpu_to_le32(0x02000000),
-
-	/*
-	 * The generic rights (bits 28 to 31).  These map onto the standard and
-	 * specific rights.
-	 */
-
-	/* Read, write, and execute access. */
-	GENERIC_ALL			= cpu_to_le32(0x10000000),
-
-	/* Execute access. */
-	GENERIC_EXECUTE			= cpu_to_le32(0x20000000),
-
-	/*
-	 * Write access.  For files, this maps onto:
-	 *	FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA |
-	 *	FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE
-	 * For directories, the mapping has the same numerical value.  See
-	 * above for the descriptions of the rights granted.
-	 */
-	GENERIC_WRITE			= cpu_to_le32(0x40000000),
-
-	/*
-	 * Read access.  For files, this maps onto:
-	 *	FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA |
-	 *	STANDARD_RIGHTS_READ | SYNCHRONIZE
-	 * For directories, the mapping has the same numberical value.  See
-	 * above for the descriptions of the rights granted.
-	 */
-	GENERIC_READ			= cpu_to_le32(0x80000000),
-};
-
-typedef le32 ACCESS_MASK;
-
-/*
- * The generic mapping array. Used to denote the mapping of each generic
- * access right to a specific access mask.
- *
- * FIXME: What exactly is this and what is it for? (AIA)
- */
-typedef struct {
-	ACCESS_MASK generic_read;
-	ACCESS_MASK generic_write;
-	ACCESS_MASK generic_execute;
-	ACCESS_MASK generic_all;
-} __attribute__ ((__packed__)) GENERIC_MAPPING;
-
-/*
- * The predefined ACE type structures are as defined below.
- */
-
-/*
- * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE
- */
-typedef struct {
-/*  0	ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
-	ACE_TYPES type;		/* Type of the ACE. */
-	ACE_FLAGS flags;	/* Flags describing the ACE. */
-	le16 size;		/* Size in bytes of the ACE. */
-/*  4*/	ACCESS_MASK mask;	/* Access mask associated with the ACE. */
-
-/*  8*/	SID sid;		/* The SID associated with the ACE. */
-} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE,
-			       SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE;
-
-/*
- * The object ACE flags (32-bit).
- */
-enum {
-	ACE_OBJECT_TYPE_PRESENT			= cpu_to_le32(1),
-	ACE_INHERITED_OBJECT_TYPE_PRESENT	= cpu_to_le32(2),
-};
-
-typedef le32 OBJECT_ACE_FLAGS;
-
-typedef struct {
-/*  0	ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
-	ACE_TYPES type;		/* Type of the ACE. */
-	ACE_FLAGS flags;	/* Flags describing the ACE. */
-	le16 size;		/* Size in bytes of the ACE. */
-/*  4*/	ACCESS_MASK mask;	/* Access mask associated with the ACE. */
-
-/*  8*/	OBJECT_ACE_FLAGS object_flags;	/* Flags describing the object ACE. */
-/* 12*/	GUID object_type;
-/* 28*/	GUID inherited_object_type;
-
-/* 44*/	SID sid;		/* The SID associated with the ACE. */
-} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE,
-			       ACCESS_DENIED_OBJECT_ACE,
-			       SYSTEM_AUDIT_OBJECT_ACE,
-			       SYSTEM_ALARM_OBJECT_ACE;
-
-/*
- * An ACL is an access-control list (ACL).
- * An ACL starts with an ACL header structure, which specifies the size of
- * the ACL and the number of ACEs it contains. The ACL header is followed by
- * zero or more access control entries (ACEs). The ACL as well as each ACE
- * are aligned on 4-byte boundaries.
- */
-typedef struct {
-	u8 revision;	/* Revision of this ACL. */
-	u8 alignment1;
-	le16 size;	/* Allocated space in bytes for ACL. Includes this
-			   header, the ACEs and the remaining free space. */
-	le16 ace_count;	/* Number of ACEs in the ACL. */
-	le16 alignment2;
-/* sizeof() = 8 bytes */
-} __attribute__ ((__packed__)) ACL;
-
-/*
- * Current constants for ACLs.
- */
-typedef enum {
-	/* Current revision. */
-	ACL_REVISION		= 2,
-	ACL_REVISION_DS		= 4,
-
-	/* History of revisions. */
-	ACL_REVISION1		= 1,
-	MIN_ACL_REVISION	= 2,
-	ACL_REVISION2		= 2,
-	ACL_REVISION3		= 3,
-	ACL_REVISION4		= 4,
-	MAX_ACL_REVISION	= 4,
-} ACL_CONSTANTS;
-
-/*
- * The security descriptor control flags (16-bit).
- *
- * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID
- *	pointed to by the Owner field was provided by a defaulting mechanism
- *	rather than explicitly provided by the original provider of the
- *	security descriptor.  This may affect the treatment of the SID with
- *	respect to inheritance of an owner.
- *
- * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in
- *	the Group field was provided by a defaulting mechanism rather than
- *	explicitly provided by the original provider of the security
- *	descriptor.  This may affect the treatment of the SID with respect to
- *	inheritance of a primary group.
- *
- * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security
- *	descriptor contains a discretionary ACL.  If this flag is set and the
- *	Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is
- *	explicitly being specified.
- *
- * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
- *	pointed to by the Dacl field was provided by a defaulting mechanism
- *	rather than explicitly provided by the original provider of the
- *	security descriptor.  This may affect the treatment of the ACL with
- *	respect to inheritance of an ACL.  This flag is ignored if the
- *	DaclPresent flag is not set.
- *
- * SE_SACL_PRESENT - This boolean flag, when set,  indicates that the security
- *	descriptor contains a system ACL pointed to by the Sacl field.  If this
- *	flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then
- *	an empty (but present) ACL is being specified.
- *
- * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
- *	pointed to by the Sacl field was provided by a defaulting mechanism
- *	rather than explicitly provided by the original provider of the
- *	security descriptor.  This may affect the treatment of the ACL with
- *	respect to inheritance of an ACL.  This flag is ignored if the
- *	SaclPresent flag is not set.
- *
- * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security
- *	descriptor is in self-relative form.  In this form, all fields of the
- *	security descriptor are contiguous in memory and all pointer fields are
- *	expressed as offsets from the beginning of the security descriptor.
- */
-enum {
-	SE_OWNER_DEFAULTED		= cpu_to_le16(0x0001),
-	SE_GROUP_DEFAULTED		= cpu_to_le16(0x0002),
-	SE_DACL_PRESENT			= cpu_to_le16(0x0004),
-	SE_DACL_DEFAULTED		= cpu_to_le16(0x0008),
-
-	SE_SACL_PRESENT			= cpu_to_le16(0x0010),
-	SE_SACL_DEFAULTED		= cpu_to_le16(0x0020),
-
-	SE_DACL_AUTO_INHERIT_REQ	= cpu_to_le16(0x0100),
-	SE_SACL_AUTO_INHERIT_REQ	= cpu_to_le16(0x0200),
-	SE_DACL_AUTO_INHERITED		= cpu_to_le16(0x0400),
-	SE_SACL_AUTO_INHERITED		= cpu_to_le16(0x0800),
-
-	SE_DACL_PROTECTED		= cpu_to_le16(0x1000),
-	SE_SACL_PROTECTED		= cpu_to_le16(0x2000),
-	SE_RM_CONTROL_VALID		= cpu_to_le16(0x4000),
-	SE_SELF_RELATIVE		= cpu_to_le16(0x8000)
-} __attribute__ ((__packed__));
-
-typedef le16 SECURITY_DESCRIPTOR_CONTROL;
-
-/*
- * Self-relative security descriptor. Contains the owner and group SIDs as well
- * as the sacl and dacl ACLs inside the security descriptor itself.
- */
-typedef struct {
-	u8 revision;	/* Revision level of the security descriptor. */
-	u8 alignment;
-	SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
-			   the descriptor as well as the following fields. */
-	le32 owner;	/* Byte offset to a SID representing an object's
-			   owner. If this is NULL, no owner SID is present in
-			   the descriptor. */
-	le32 group;	/* Byte offset to a SID representing an object's
-			   primary group. If this is NULL, no primary group
-			   SID is present in the descriptor. */
-	le32 sacl;	/* Byte offset to a system ACL. Only valid, if
-			   SE_SACL_PRESENT is set in the control field. If
-			   SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
-			   is specified. */
-	le32 dacl;	/* Byte offset to a discretionary ACL. Only valid, if
-			   SE_DACL_PRESENT is set in the control field. If
-			   SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
-			   (unconditionally granting access) is specified. */
-/* sizeof() = 0x14 bytes */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE;
-
-/*
- * Absolute security descriptor. Does not contain the owner and group SIDs, nor
- * the sacl and dacl ACLs inside the security descriptor. Instead, it contains
- * pointers to these structures in memory. Obviously, absolute security
- * descriptors are only useful for in memory representations of security
- * descriptors. On disk, a self-relative security descriptor is used.
- */
-typedef struct {
-	u8 revision;	/* Revision level of the security descriptor. */
-	u8 alignment;
-	SECURITY_DESCRIPTOR_CONTROL control;	/* Flags qualifying the type of
-			   the descriptor as well as the following fields. */
-	SID *owner;	/* Points to a SID representing an object's owner. If
-			   this is NULL, no owner SID is present in the
-			   descriptor. */
-	SID *group;	/* Points to a SID representing an object's primary
-			   group. If this is NULL, no primary group SID is
-			   present in the descriptor. */
-	ACL *sacl;	/* Points to a system ACL. Only valid, if
-			   SE_SACL_PRESENT is set in the control field. If
-			   SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
-			   is specified. */
-	ACL *dacl;	/* Points to a discretionary ACL. Only valid, if
-			   SE_DACL_PRESENT is set in the control field. If
-			   SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
-			   (unconditionally granting access) is specified. */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR;
-
-/*
- * Current constants for security descriptors.
- */
-typedef enum {
-	/* Current revision. */
-	SECURITY_DESCRIPTOR_REVISION	= 1,
-	SECURITY_DESCRIPTOR_REVISION1	= 1,
-
-	/* The sizes of both the absolute and relative security descriptors is
-	   the same as pointers, at least on ia32 architecture are 32-bit. */
-	SECURITY_DESCRIPTOR_MIN_LENGTH	= sizeof(SECURITY_DESCRIPTOR),
-} SECURITY_DESCRIPTOR_CONSTANTS;
-
-/*
- * Attribute: Security descriptor (0x50). A standard self-relative security
- * descriptor.
- *
- * NOTE: Can be resident or non-resident.
- * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally
- * in FILE_Secure and the correct descriptor is found using the security_id
- * from the standard information attribute.
- */
-typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR;
-
-/*
- * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one
- * referenced instance of each unique security descriptor is stored.
- *
- * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It
- * does, however, contain two indexes ($SDH and $SII) as well as a named data
- * stream ($SDS).
- *
- * Every unique security descriptor is assigned a unique security identifier
- * (security_id, not to be confused with a SID). The security_id is unique for
- * the NTFS volume and is used as an index into the $SII index, which maps
- * security_ids to the security descriptor's storage location within the $SDS
- * data attribute. The $SII index is sorted by ascending security_id.
- *
- * A simple hash is computed from each security descriptor. This hash is used
- * as an index into the $SDH index, which maps security descriptor hashes to
- * the security descriptor's storage location within the $SDS data attribute.
- * The $SDH index is sorted by security descriptor hash and is stored in a B+
- * tree. When searching $SDH (with the intent of determining whether or not a
- * new security descriptor is already present in the $SDS data stream), if a
- * matching hash is found, but the security descriptors do not match, the
- * search in the $SDH index is continued, searching for a next matching hash.
- *
- * When a precise match is found, the security_id coresponding to the security
- * descriptor in the $SDS attribute is read from the found $SDH index entry and
- * is stored in the $STANDARD_INFORMATION attribute of the file/directory to
- * which the security descriptor is being applied. The $STANDARD_INFORMATION
- * attribute is present in all base mft records (i.e. in all files and
- * directories).
- *
- * If a match is not found, the security descriptor is assigned a new unique
- * security_id and is added to the $SDS data attribute. Then, entries
- * referencing the this security descriptor in the $SDS data attribute are
- * added to the $SDH and $SII indexes.
- *
- * Note: Entries are never deleted from FILE_Secure, even if nothing
- * references an entry any more.
- */
-
-/*
- * This header precedes each security descriptor in the $SDS data stream.
- * This is also the index entry data part of both the $SII and $SDH indexes.
- */
-typedef struct {
-	le32 hash;	  /* Hash of the security descriptor. */
-	le32 security_id; /* The security_id assigned to the descriptor. */
-	le64 offset;	  /* Byte offset of this entry in the $SDS stream. */
-	le32 length;	  /* Size in bytes of this entry in $SDS stream. */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER;
-
-/*
- * The $SDS data stream contains the security descriptors, aligned on 16-byte
- * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot
- * cross 256kib boundaries (this restriction is imposed by the Windows cache
- * manager). Each security descriptor is contained in a SDS_ENTRY structure.
- * Also, each security descriptor is stored twice in the $SDS stream with a
- * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size)
- * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the
- * first copy of the security descriptor will be at offset 0x51d0 in the
- * $SDS data stream and the second copy will be at offset 0x451d0.
- */
-typedef struct {
-/*Ofs*/
-/*  0	SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like
-				       unnamed structs. */
-	le32 hash;	  /* Hash of the security descriptor. */
-	le32 security_id; /* The security_id assigned to the descriptor. */
-	le64 offset;	  /* Byte offset of this entry in the $SDS stream. */
-	le32 length;	  /* Size in bytes of this entry in $SDS stream. */
-/* 20*/	SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security
-					     descriptor. */
-} __attribute__ ((__packed__)) SDS_ENTRY;
-
-/*
- * The index entry key used in the $SII index. The collation type is
- * COLLATION_NTOFS_ULONG.
- */
-typedef struct {
-	le32 security_id; /* The security_id assigned to the descriptor. */
-} __attribute__ ((__packed__)) SII_INDEX_KEY;
-
-/*
- * The index entry key used in the $SDH index. The keys are sorted first by
- * hash and then by security_id. The collation rule is
- * COLLATION_NTOFS_SECURITY_HASH.
- */
-typedef struct {
-	le32 hash;	  /* Hash of the security descriptor. */
-	le32 security_id; /* The security_id assigned to the descriptor. */
-} __attribute__ ((__packed__)) SDH_INDEX_KEY;
-
-/*
- * Attribute: Volume name (0x60).
- *
- * NOTE: Always resident.
- * NOTE: Present only in FILE_Volume.
- */
-typedef struct {
-	ntfschar name[0];	/* The name of the volume in Unicode. */
-} __attribute__ ((__packed__)) VOLUME_NAME;
-
-/*
- * Possible flags for the volume (16-bit).
- */
-enum {
-	VOLUME_IS_DIRTY			= cpu_to_le16(0x0001),
-	VOLUME_RESIZE_LOG_FILE		= cpu_to_le16(0x0002),
-	VOLUME_UPGRADE_ON_MOUNT		= cpu_to_le16(0x0004),
-	VOLUME_MOUNTED_ON_NT4		= cpu_to_le16(0x0008),
-
-	VOLUME_DELETE_USN_UNDERWAY	= cpu_to_le16(0x0010),
-	VOLUME_REPAIR_OBJECT_ID		= cpu_to_le16(0x0020),
-
-	VOLUME_CHKDSK_UNDERWAY		= cpu_to_le16(0x4000),
-	VOLUME_MODIFIED_BY_CHKDSK	= cpu_to_le16(0x8000),
-
-	VOLUME_FLAGS_MASK		= cpu_to_le16(0xc03f),
-
-	/* To make our life easier when checking if we must mount read-only. */
-	VOLUME_MUST_MOUNT_RO_MASK	= cpu_to_le16(0xc027),
-} __attribute__ ((__packed__));
-
-typedef le16 VOLUME_FLAGS;
-
-/*
- * Attribute: Volume information (0x70).
- *
- * NOTE: Always resident.
- * NOTE: Present only in FILE_Volume.
- * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses
- *	 NTFS 1.2. I haven't personally seen other values yet.
- */
-typedef struct {
-	le64 reserved;		/* Not used (yet?). */
-	u8 major_ver;		/* Major version of the ntfs format. */
-	u8 minor_ver;		/* Minor version of the ntfs format. */
-	VOLUME_FLAGS flags;	/* Bit array of VOLUME_* flags. */
-} __attribute__ ((__packed__)) VOLUME_INFORMATION;
-
-/*
- * Attribute: Data attribute (0x80).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Data contents of a file (i.e. the unnamed stream) or of a named stream.
- */
-typedef struct {
-	u8 data[0];		/* The file's data contents. */
-} __attribute__ ((__packed__)) DATA_ATTR;
-
-/*
- * Index header flags (8-bit).
- */
-enum {
-	/*
-	 * When index header is in an index root attribute:
-	 */
-	SMALL_INDEX = 0, /* The index is small enough to fit inside the index
-			    root attribute and there is no index allocation
-			    attribute present. */
-	LARGE_INDEX = 1, /* The index is too large to fit in the index root
-			    attribute and/or an index allocation attribute is
-			    present. */
-	/*
-	 * When index header is in an index block, i.e. is part of index
-	 * allocation attribute:
-	 */
-	LEAF_NODE  = 0, /* This is a leaf node, i.e. there are no more nodes
-			   branching off it. */
-	INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf
-			   node. */
-	NODE_MASK  = 1, /* Mask for accessing the *_NODE bits. */
-} __attribute__ ((__packed__));
-
-typedef u8 INDEX_HEADER_FLAGS;
-
-/*
- * This is the header for indexes, describing the INDEX_ENTRY records, which
- * follow the INDEX_HEADER. Together the index header and the index entries
- * make up a complete index.
- *
- * IMPORTANT NOTE: The offset, length and size structure members are counted
- * relative to the start of the index header structure and not relative to the
- * start of the index root or index allocation structures themselves.
- */
-typedef struct {
-	le32 entries_offset;		/* Byte offset to first INDEX_ENTRY
-					   aligned to 8-byte boundary. */
-	le32 index_length;		/* Data size of the index in bytes,
-					   i.e. bytes used from allocated
-					   size, aligned to 8-byte boundary. */
-	le32 allocated_size;		/* Byte size of this index (block),
-					   multiple of 8 bytes. */
-	/* NOTE: For the index root attribute, the above two numbers are always
-	   equal, as the attribute is resident and it is resized as needed. In
-	   the case of the index allocation attribute the attribute is not
-	   resident and hence the allocated_size is a fixed value and must
-	   equal the index_block_size specified by the INDEX_ROOT attribute
-	   corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK
-	   belongs to. */
-	INDEX_HEADER_FLAGS flags;	/* Bit field of INDEX_HEADER_FLAGS. */
-	u8 reserved[3];			/* Reserved/align to 8-byte boundary. */
-} __attribute__ ((__packed__)) INDEX_HEADER;
-
-/*
- * Attribute: Index root (0x90).
- *
- * NOTE: Always resident.
- *
- * This is followed by a sequence of index entries (INDEX_ENTRY structures)
- * as described by the index header.
- *
- * When a directory is small enough to fit inside the index root then this
- * is the only attribute describing the directory. When the directory is too
- * large to fit in the index root, on the other hand, two additional attributes
- * are present: an index allocation attribute, containing sub-nodes of the B+
- * directory tree (see below), and a bitmap attribute, describing which virtual
- * cluster numbers (vcns) in the index allocation attribute are in use by an
- * index block.
- *
- * NOTE: The root directory (FILE_root) contains an entry for itself. Other
- * directories do not contain entries for themselves, though.
- */
-typedef struct {
-	ATTR_TYPE type;			/* Type of the indexed attribute. Is
-					   $FILE_NAME for directories, zero
-					   for view indexes. No other values
-					   allowed. */
-	COLLATION_RULE collation_rule;	/* Collation rule used to sort the
-					   index entries. If type is $FILE_NAME,
-					   this must be COLLATION_FILE_NAME. */
-	le32 index_block_size;		/* Size of each index block in bytes (in
-					   the index allocation attribute). */
-	u8 clusters_per_index_block;	/* Cluster size of each index block (in
-					   the index allocation attribute), when
-					   an index block is >= than a cluster,
-					   otherwise this will be the log of
-					   the size (like how the encoding of
-					   the mft record size and the index
-					   record size found in the boot sector
-					   work). Has to be a power of 2. */
-	u8 reserved[3];			/* Reserved/align to 8-byte boundary. */
-	INDEX_HEADER index;		/* Index header describing the
-					   following index entries. */
-} __attribute__ ((__packed__)) INDEX_ROOT;
-
-/*
- * Attribute: Index allocation (0xa0).
- *
- * NOTE: Always non-resident (doesn't make sense to be resident anyway!).
- *
- * This is an array of index blocks. Each index block starts with an
- * INDEX_BLOCK structure containing an index header, followed by a sequence of
- * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER.
- */
-typedef struct {
-/*  0	NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
-	NTFS_RECORD_TYPE magic;	/* Magic is "INDX". */
-	le16 usa_ofs;		/* See NTFS_RECORD definition. */
-	le16 usa_count;		/* See NTFS_RECORD definition. */
-
-/*  8*/	sle64 lsn;		/* $LogFile sequence number of the last
-				   modification of this index block. */
-/* 16*/	leVCN index_block_vcn;	/* Virtual cluster number of the index block.
-				   If the cluster_size on the volume is <= the
-				   index_block_size of the directory,
-				   index_block_vcn counts in units of clusters,
-				   and in units of sectors otherwise. */
-/* 24*/	INDEX_HEADER index;	/* Describes the following index entries. */
-/* sizeof()= 40 (0x28) bytes */
-/*
- * When creating the index block, we place the update sequence array at this
- * offset, i.e. before we start with the index entries. This also makes sense,
- * otherwise we could run into problems with the update sequence array
- * containing in itself the last two bytes of a sector which would mean that
- * multi sector transfer protection wouldn't work. As you can't protect data
- * by overwriting it since you then can't get it back...
- * When reading use the data from the ntfs record header.
- */
-} __attribute__ ((__packed__)) INDEX_BLOCK;
-
-typedef INDEX_BLOCK INDEX_ALLOCATION;
-
-/*
- * The system file FILE_Extend/$Reparse contains an index named $R listing
- * all reparse points on the volume. The index entry keys are as defined
- * below. Note, that there is no index data associated with the index entries.
- *
- * The index entries are sorted by the index key file_id. The collation rule is
- * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the
- * primary key / is not a key at all. (AIA)
- */
-typedef struct {
-	le32 reparse_tag;	/* Reparse point type (inc. flags). */
-	leMFT_REF file_id;	/* Mft record of the file containing the
-				   reparse point attribute. */
-} __attribute__ ((__packed__)) REPARSE_INDEX_KEY;
-
-/*
- * Quota flags (32-bit).
- *
- * The user quota flags.  Names explain meaning.
- */
-enum {
-	QUOTA_FLAG_DEFAULT_LIMITS	= cpu_to_le32(0x00000001),
-	QUOTA_FLAG_LIMIT_REACHED	= cpu_to_le32(0x00000002),
-	QUOTA_FLAG_ID_DELETED		= cpu_to_le32(0x00000004),
-
-	QUOTA_FLAG_USER_MASK		= cpu_to_le32(0x00000007),
-	/* This is a bit mask for the user quota flags. */
-
-	/*
-	 * These flags are only present in the quota defaults index entry, i.e.
-	 * in the entry where owner_id = QUOTA_DEFAULTS_ID.
-	 */
-	QUOTA_FLAG_TRACKING_ENABLED	= cpu_to_le32(0x00000010),
-	QUOTA_FLAG_ENFORCEMENT_ENABLED	= cpu_to_le32(0x00000020),
-	QUOTA_FLAG_TRACKING_REQUESTED	= cpu_to_le32(0x00000040),
-	QUOTA_FLAG_LOG_THRESHOLD	= cpu_to_le32(0x00000080),
-
-	QUOTA_FLAG_LOG_LIMIT		= cpu_to_le32(0x00000100),
-	QUOTA_FLAG_OUT_OF_DATE		= cpu_to_le32(0x00000200),
-	QUOTA_FLAG_CORRUPT		= cpu_to_le32(0x00000400),
-	QUOTA_FLAG_PENDING_DELETES	= cpu_to_le32(0x00000800),
-};
-
-typedef le32 QUOTA_FLAGS;
-
-/*
- * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas
- * are on a per volume and per user basis.
- *
- * The $Q index contains one entry for each existing user_id on the volume. The
- * index key is the user_id of the user/group owning this quota control entry,
- * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the
- * owner_id, is found in the standard information attribute. The collation rule
- * for $Q is COLLATION_NTOFS_ULONG.
- *
- * The $O index contains one entry for each user/group who has been assigned
- * a quota on that volume. The index key holds the SID of the user_id the
- * entry belongs to, i.e. the owner_id. The collation rule for $O is
- * COLLATION_NTOFS_SID.
- *
- * The $O index entry data is the user_id of the user corresponding to the SID.
- * This user_id is used as an index into $Q to find the quota control entry
- * associated with the SID.
- *
- * The $Q index entry data is the quota control entry and is defined below.
- */
-typedef struct {
-	le32 version;		/* Currently equals 2. */
-	QUOTA_FLAGS flags;	/* Flags describing this quota entry. */
-	le64 bytes_used;	/* How many bytes of the quota are in use. */
-	sle64 change_time;	/* Last time this quota entry was changed. */
-	sle64 threshold;	/* Soft quota (-1 if not limited). */
-	sle64 limit;		/* Hard quota (-1 if not limited). */
-	sle64 exceeded_time;	/* How long the soft quota has been exceeded. */
-	SID sid;		/* The SID of the user/object associated with
-				   this quota entry.  Equals zero for the quota
-				   defaults entry (and in fact on a WinXP
-				   volume, it is not present at all). */
-} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY;
-
-/*
- * Predefined owner_id values (32-bit).
- */
-enum {
-	QUOTA_INVALID_ID	= cpu_to_le32(0x00000000),
-	QUOTA_DEFAULTS_ID	= cpu_to_le32(0x00000001),
-	QUOTA_FIRST_USER_ID	= cpu_to_le32(0x00000100),
-};
-
-/*
- * Current constants for quota control entries.
- */
-typedef enum {
-	/* Current version. */
-	QUOTA_VERSION	= 2,
-} QUOTA_CONTROL_ENTRY_CONSTANTS;
-
-/*
- * Index entry flags (16-bit).
- */
-enum {
-	INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
-			sub-node, i.e. a reference to an index block in form of
-			a virtual cluster number (see below). */
-	INDEX_ENTRY_END  = cpu_to_le16(2), /* This signifies the last
-			entry in an index block.  The index entry does not
-			represent a file but it can point to a sub-node. */
-
-	INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
-			enum bit width to 16-bit. */
-} __attribute__ ((__packed__));
-
-typedef le16 INDEX_ENTRY_FLAGS;
-
-/*
- * This the index entry header (see below).
- */
-typedef struct {
-/*  0*/	union {
-		struct { /* Only valid when INDEX_ENTRY_END is not set. */
-			leMFT_REF indexed_file;	/* The mft reference of the file
-						   described by this index
-						   entry. Used for directory
-						   indexes. */
-		} __attribute__ ((__packed__)) dir;
-		struct { /* Used for views/indexes to find the entry's data. */
-			le16 data_offset;	/* Data byte offset from this
-						   INDEX_ENTRY. Follows the
-						   index key. */
-			le16 data_length;	/* Data length in bytes. */
-			le32 reservedV;		/* Reserved (zero). */
-		} __attribute__ ((__packed__)) vi;
-	} __attribute__ ((__packed__)) data;
-/*  8*/	le16 length;		 /* Byte size of this index entry, multiple of
-				    8-bytes. */
-/* 10*/	le16 key_length;	 /* Byte size of the key value, which is in the
-				    index entry. It follows field reserved. Not
-				    multiple of 8-bytes. */
-/* 12*/	INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
-/* 14*/	le16 reserved;		 /* Reserved/align to 8-byte boundary. */
-/* sizeof() = 16 bytes */
-} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER;
-
-/*
- * This is an index entry. A sequence of such entries follows each INDEX_HEADER
- * structure. Together they make up a complete index. The index follows either
- * an index root attribute or an index allocation attribute.
- *
- * NOTE: Before NTFS 3.0 only filename attributes were indexed.
- */
-typedef struct {
-/*Ofs*/
-/*  0	INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */
-	union {
-		struct { /* Only valid when INDEX_ENTRY_END is not set. */
-			leMFT_REF indexed_file;	/* The mft reference of the file
-						   described by this index
-						   entry. Used for directory
-						   indexes. */
-		} __attribute__ ((__packed__)) dir;
-		struct { /* Used for views/indexes to find the entry's data. */
-			le16 data_offset;	/* Data byte offset from this
-						   INDEX_ENTRY. Follows the
-						   index key. */
-			le16 data_length;	/* Data length in bytes. */
-			le32 reservedV;		/* Reserved (zero). */
-		} __attribute__ ((__packed__)) vi;
-	} __attribute__ ((__packed__)) data;
-	le16 length;		 /* Byte size of this index entry, multiple of
-				    8-bytes. */
-	le16 key_length;	 /* Byte size of the key value, which is in the
-				    index entry. It follows field reserved. Not
-				    multiple of 8-bytes. */
-	INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
-	le16 reserved;		 /* Reserved/align to 8-byte boundary. */
-
-/* 16*/	union {		/* The key of the indexed attribute. NOTE: Only present
-			   if INDEX_ENTRY_END bit in flags is not set. NOTE: On
-			   NTFS versions before 3.0 the only valid key is the
-			   FILE_NAME_ATTR. On NTFS 3.0+ the following
-			   additional index keys are defined: */
-		FILE_NAME_ATTR file_name;/* $I30 index in directories. */
-		SII_INDEX_KEY sii;	/* $SII index in $Secure. */
-		SDH_INDEX_KEY sdh;	/* $SDH index in $Secure. */
-		GUID object_id;		/* $O index in FILE_Extend/$ObjId: The
-					   object_id of the mft record found in
-					   the data part of the index. */
-		REPARSE_INDEX_KEY reparse;	/* $R index in
-						   FILE_Extend/$Reparse. */
-		SID sid;		/* $O index in FILE_Extend/$Quota:
-					   SID of the owner of the user_id. */
-		le32 owner_id;		/* $Q index in FILE_Extend/$Quota:
-					   user_id of the owner of the quota
-					   control entry in the data part of
-					   the index. */
-	} __attribute__ ((__packed__)) key;
-	/* The (optional) index data is inserted here when creating. */
-	// leVCN vcn;	/* If INDEX_ENTRY_NODE bit in flags is set, the last
-	//		   eight bytes of this index entry contain the virtual
-	//		   cluster number of the index block that holds the
-	//		   entries immediately preceding the current entry (the
-	//		   vcn references the corresponding cluster in the data
-	//		   of the non-resident index allocation attribute). If
-	//		   the key_length is zero, then the vcn immediately
-	//		   follows the INDEX_ENTRY_HEADER. Regardless of
-	//		   key_length, the address of the 8-byte boundary
-	//		   aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by
-	//		   (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN),
-	//		   where sizeof(VCN) can be hardcoded as 8 if wanted. */
-} __attribute__ ((__packed__)) INDEX_ENTRY;
-
-/*
- * Attribute: Bitmap (0xb0).
- *
- * Contains an array of bits (aka a bitfield).
- *
- * When used in conjunction with the index allocation attribute, each bit
- * corresponds to one index block within the index allocation attribute. Thus
- * the number of bits in the bitmap * index block size / cluster size is the
- * number of clusters in the index allocation attribute.
- */
-typedef struct {
-	u8 bitmap[0];			/* Array of bits. */
-} __attribute__ ((__packed__)) BITMAP_ATTR;
-
-/*
- * The reparse point tag defines the type of the reparse point. It also
- * includes several flags, which further describe the reparse point.
- *
- * The reparse point tag is an unsigned 32-bit value divided in three parts:
- *
- * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of
- *    the reparse point.
- * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use.
- * 3. The most significant three bits are flags describing the reparse point.
- *    They are defined as follows:
- *	bit 29: Name surrogate bit. If set, the filename is an alias for
- *		another object in the system.
- *	bit 30: High-latency bit. If set, accessing the first byte of data will
- *		be slow. (E.g. the data is stored on a tape drive.)
- *	bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User
- *		defined tags have to use zero here.
- *
- * These are the predefined reparse point tags:
- */
-enum {
-	IO_REPARSE_TAG_IS_ALIAS		= cpu_to_le32(0x20000000),
-	IO_REPARSE_TAG_IS_HIGH_LATENCY	= cpu_to_le32(0x40000000),
-	IO_REPARSE_TAG_IS_MICROSOFT	= cpu_to_le32(0x80000000),
-
-	IO_REPARSE_TAG_RESERVED_ZERO	= cpu_to_le32(0x00000000),
-	IO_REPARSE_TAG_RESERVED_ONE	= cpu_to_le32(0x00000001),
-	IO_REPARSE_TAG_RESERVED_RANGE	= cpu_to_le32(0x00000001),
-
-	IO_REPARSE_TAG_NSS		= cpu_to_le32(0x68000005),
-	IO_REPARSE_TAG_NSS_RECOVER	= cpu_to_le32(0x68000006),
-	IO_REPARSE_TAG_SIS		= cpu_to_le32(0x68000007),
-	IO_REPARSE_TAG_DFS		= cpu_to_le32(0x68000008),
-
-	IO_REPARSE_TAG_MOUNT_POINT	= cpu_to_le32(0x88000003),
-
-	IO_REPARSE_TAG_HSM		= cpu_to_le32(0xa8000004),
-
-	IO_REPARSE_TAG_SYMBOLIC_LINK	= cpu_to_le32(0xe8000000),
-
-	IO_REPARSE_TAG_VALID_VALUES	= cpu_to_le32(0xe000ffff),
-};
-
-/*
- * Attribute: Reparse point (0xc0).
- *
- * NOTE: Can be resident or non-resident.
- */
-typedef struct {
-	le32 reparse_tag;		/* Reparse point type (inc. flags). */
-	le16 reparse_data_length;	/* Byte size of reparse data. */
-	le16 reserved;			/* Align to 8-byte boundary. */
-	u8 reparse_data[0];		/* Meaning depends on reparse_tag. */
-} __attribute__ ((__packed__)) REPARSE_POINT;
-
-/*
- * Attribute: Extended attribute (EA) information (0xd0).
- *
- * NOTE: Always resident. (Is this true???)
- */
-typedef struct {
-	le16 ea_length;		/* Byte size of the packed extended
-				   attributes. */
-	le16 need_ea_count;	/* The number of extended attributes which have
-				   the NEED_EA bit set. */
-	le32 ea_query_length;	/* Byte size of the buffer required to query
-				   the extended attributes when calling
-				   ZwQueryEaFile() in Windows NT/2k. I.e. the
-				   byte size of the unpacked extended
-				   attributes. */
-} __attribute__ ((__packed__)) EA_INFORMATION;
-
-/*
- * Extended attribute flags (8-bit).
- */
-enum {
-	NEED_EA	= 0x80		/* If set the file to which the EA belongs
-				   cannot be interpreted without understanding
-				   the associates extended attributes. */
-} __attribute__ ((__packed__));
-
-typedef u8 EA_FLAGS;
-
-/*
- * Attribute: Extended attribute (EA) (0xe0).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Like the attribute list and the index buffer list, the EA attribute value is
- * a sequence of EA_ATTR variable length records.
- */
-typedef struct {
-	le32 next_entry_offset;	/* Offset to the next EA_ATTR. */
-	EA_FLAGS flags;		/* Flags describing the EA. */
-	u8 ea_name_length;	/* Length of the name of the EA in bytes
-				   excluding the '\0' byte terminator. */
-	le16 ea_value_length;	/* Byte size of the EA's value. */
-	u8 ea_name[0];		/* Name of the EA.  Note this is ASCII, not
-				   Unicode and it is zero terminated. */
-	u8 ea_value[0];		/* The value of the EA.  Immediately follows
-				   the name. */
-} __attribute__ ((__packed__)) EA_ATTR;
-
-/*
- * Attribute: Property set (0xf0).
- *
- * Intended to support Native Structure Storage (NSS) - a feature removed from
- * NTFS 3.0 during beta testing.
- */
-typedef struct {
-	/* Irrelevant as feature unused. */
-} __attribute__ ((__packed__)) PROPERTY_SET;
-
-/*
- * Attribute: Logged utility stream (0x100).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Operations on this attribute are logged to the journal ($LogFile) like
- * normal metadata changes.
- *
- * Used by the Encrypting File System (EFS). All encrypted files have this
- * attribute with the name $EFS.
- */
-typedef struct {
-	/* Can be anything the creator chooses. */
-	/* EFS uses it as follows: */
-	// FIXME: Type this info, verifying it along the way. (AIA)
-} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR;
-
-#endif /* _LINUX_NTFS_LAYOUT_H */
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
deleted file mode 100644
index eda9972e6159..000000000000
--- a/fs/ntfs/lcnalloc.c
+++ /dev/null
@@ -1,1000 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * lcnalloc.c - Cluster (de)allocation code.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/pagemap.h>
-
-#include "lcnalloc.h"
-#include "debug.h"
-#include "bitmap.h"
-#include "inode.h"
-#include "volume.h"
-#include "attrib.h"
-#include "malloc.h"
-#include "aops.h"
-#include "ntfs.h"
-
-/**
- * ntfs_cluster_free_from_rl_nolock - free clusters from runlist
- * @vol:	mounted ntfs volume on which to free the clusters
- * @rl:		runlist describing the clusters to free
- *
- * Free all the clusters described by the runlist @rl on the volume @vol.  In
- * the case of an error being returned, at least some of the clusters were not
- * freed.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - The volume lcn bitmap must be locked for writing on entry and is
- *	      left locked on return.
- */
-int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
-		const runlist_element *rl)
-{
-	struct inode *lcnbmp_vi = vol->lcnbmp_ino;
-	int ret = 0;
-
-	ntfs_debug("Entering.");
-	if (!rl)
-		return 0;
-	for (; rl->length; rl++) {
-		int err;
-
-		if (rl->lcn < 0)
-			continue;
-		err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length);
-		if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err))
-			ret = err;
-	}
-	ntfs_debug("Done.");
-	return ret;
-}
-
-/**
- * ntfs_cluster_alloc - allocate clusters on an ntfs volume
- * @vol:	mounted ntfs volume on which to allocate the clusters
- * @start_vcn:	vcn to use for the first allocated cluster
- * @count:	number of clusters to allocate
- * @start_lcn:	starting lcn at which to allocate the clusters (or -1 if none)
- * @zone:	zone from which to allocate the clusters
- * @is_extension:	if 'true', this is an attribute extension
- *
- * Allocate @count clusters preferably starting at cluster @start_lcn or at the
- * current allocator position if @start_lcn is -1, on the mounted ntfs volume
- * @vol. @zone is either DATA_ZONE for allocation of normal clusters or
- * MFT_ZONE for allocation of clusters for the master file table, i.e. the
- * $MFT/$DATA attribute.
- *
- * @start_vcn specifies the vcn of the first allocated cluster.  This makes
- * merging the resulting runlist with the old runlist easier.
- *
- * If @is_extension is 'true', the caller is allocating clusters to extend an
- * attribute and if it is 'false', the caller is allocating clusters to fill a
- * hole in an attribute.  Practically the difference is that if @is_extension
- * is 'true' the returned runlist will be terminated with LCN_ENOENT and if
- * @is_extension is 'false' the runlist will be terminated with
- * LCN_RL_NOT_MAPPED.
- *
- * You need to check the return value with IS_ERR().  If this is false, the
- * function was successful and the return value is a runlist describing the
- * allocated cluster(s).  If IS_ERR() is true, the function failed and
- * PTR_ERR() gives you the error code.
- *
- * Notes on the allocation algorithm
- * =================================
- *
- * There are two data zones.  First is the area between the end of the mft zone
- * and the end of the volume, and second is the area between the start of the
- * volume and the start of the mft zone.  On unmodified/standard NTFS 1.x
- * volumes, the second data zone does not exist due to the mft zone being
- * expanded to cover the start of the volume in order to reserve space for the
- * mft bitmap attribute.
- *
- * This is not the prettiest function but the complexity stems from the need of
- * implementing the mft vs data zoned approach and from the fact that we have
- * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we
- * need to cope with crossing over boundaries of two buffers.  Further, the
- * fact that the allocator allows for caller supplied hints as to the location
- * of where allocation should begin and the fact that the allocator keeps track
- * of where in the data zones the next natural allocation should occur,
- * contribute to the complexity of the function.  But it should all be
- * worthwhile, because this allocator should: 1) be a full implementation of
- * the MFT zone approach used by Windows NT, 2) cause reduction in
- * fragmentation, and 3) be speedy in allocations (the code is not optimized
- * for speed, but the algorithm is, so further speed improvements are probably
- * possible).
- *
- * FIXME: We should be monitoring cluster allocation and increment the MFT zone
- * size dynamically but this is something for the future.  We will just cause
- * heavier fragmentation by not doing it and I am not even sure Windows would
- * grow the MFT zone dynamically, so it might even be correct not to do this.
- * The overhead in doing dynamic MFT zone expansion would be very large and
- * unlikely worth the effort. (AIA)
- *
- * TODO: I have added in double the required zone position pointer wrap around
- * logic which can be optimized to having only one of the two logic sets.
- * However, having the double logic will work fine, but if we have only one of
- * the sets and we get it wrong somewhere, then we get into trouble, so
- * removing the duplicate logic requires _very_ careful consideration of _all_
- * possible code paths.  So at least for now, I am leaving the double logic -
- * better safe than sorry... (AIA)
- *
- * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked
- *	      on return.
- *	    - This function takes the volume lcn bitmap lock for writing and
- *	      modifies the bitmap contents.
- */
-runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
-		const s64 count, const LCN start_lcn,
-		const NTFS_CLUSTER_ALLOCATION_ZONES zone,
-		const bool is_extension)
-{
-	LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
-	LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
-	s64 clusters;
-	loff_t i_size;
-	struct inode *lcnbmp_vi;
-	runlist_element *rl = NULL;
-	struct address_space *mapping;
-	struct page *page = NULL;
-	u8 *buf, *byte;
-	int err = 0, rlpos, rlsize, buf_size;
-	u8 pass, done_zones, search_zone, need_writeback = 0, bit;
-
-	ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn "
-			"0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn,
-			(unsigned long long)count,
-			(unsigned long long)start_lcn,
-			zone == MFT_ZONE ? "MFT" : "DATA");
-	BUG_ON(!vol);
-	lcnbmp_vi = vol->lcnbmp_ino;
-	BUG_ON(!lcnbmp_vi);
-	BUG_ON(start_vcn < 0);
-	BUG_ON(count < 0);
-	BUG_ON(start_lcn < -1);
-	BUG_ON(zone < FIRST_ZONE);
-	BUG_ON(zone > LAST_ZONE);
-
-	/* Return NULL if @count is zero. */
-	if (!count)
-		return NULL;
-	/* Take the lcnbmp lock for writing. */
-	down_write(&vol->lcnbmp_lock);
-	/*
-	 * If no specific @start_lcn was requested, use the current data zone
-	 * position, otherwise use the requested @start_lcn but make sure it
-	 * lies outside the mft zone.  Also set done_zones to 0 (no zones done)
-	 * and pass depending on whether we are starting inside a zone (1) or
-	 * at the beginning of a zone (2).  If requesting from the MFT_ZONE,
-	 * we either start at the current position within the mft zone or at
-	 * the specified position.  If the latter is out of bounds then we start
-	 * at the beginning of the MFT_ZONE.
-	 */
-	done_zones = 0;
-	pass = 1;
-	/*
-	 * zone_start and zone_end are the current search range.  search_zone
-	 * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of
-	 * volume) and 4 for data zone 2 (start of volume till start of mft
-	 * zone).
-	 */
-	zone_start = start_lcn;
-	if (zone_start < 0) {
-		if (zone == DATA_ZONE)
-			zone_start = vol->data1_zone_pos;
-		else
-			zone_start = vol->mft_zone_pos;
-		if (!zone_start) {
-			/*
-			 * Zone starts at beginning of volume which means a
-			 * single pass is sufficient.
-			 */
-			pass = 2;
-		}
-	} else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start &&
-			zone_start < vol->mft_zone_end) {
-		zone_start = vol->mft_zone_end;
-		/*
-		 * Starting at beginning of data1_zone which means a single
-		 * pass in this zone is sufficient.
-		 */
-		pass = 2;
-	} else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start ||
-			zone_start >= vol->mft_zone_end)) {
-		zone_start = vol->mft_lcn;
-		if (!vol->mft_zone_end)
-			zone_start = 0;
-		/*
-		 * Starting at beginning of volume which means a single pass
-		 * is sufficient.
-		 */
-		pass = 2;
-	}
-	if (zone == MFT_ZONE) {
-		zone_end = vol->mft_zone_end;
-		search_zone = 1;
-	} else /* if (zone == DATA_ZONE) */ {
-		/* Skip searching the mft zone. */
-		done_zones |= 1;
-		if (zone_start >= vol->mft_zone_end) {
-			zone_end = vol->nr_clusters;
-			search_zone = 2;
-		} else {
-			zone_end = vol->mft_zone_start;
-			search_zone = 4;
-		}
-	}
-	/*
-	 * bmp_pos is the current bit position inside the bitmap.  We use
-	 * bmp_initial_pos to determine whether or not to do a zone switch.
-	 */
-	bmp_pos = bmp_initial_pos = zone_start;
-
-	/* Loop until all clusters are allocated, i.e. clusters == 0. */
-	clusters = count;
-	rlpos = rlsize = 0;
-	mapping = lcnbmp_vi->i_mapping;
-	i_size = i_size_read(lcnbmp_vi);
-	while (1) {
-		ntfs_debug("Start of outer while loop: done_zones 0x%x, "
-				"search_zone %i, pass %i, zone_start 0x%llx, "
-				"zone_end 0x%llx, bmp_initial_pos 0x%llx, "
-				"bmp_pos 0x%llx, rlpos %i, rlsize %i.",
-				done_zones, search_zone, pass,
-				(unsigned long long)zone_start,
-				(unsigned long long)zone_end,
-				(unsigned long long)bmp_initial_pos,
-				(unsigned long long)bmp_pos, rlpos, rlsize);
-		/* Loop until we run out of free clusters. */
-		last_read_pos = bmp_pos >> 3;
-		ntfs_debug("last_read_pos 0x%llx.",
-				(unsigned long long)last_read_pos);
-		if (last_read_pos > i_size) {
-			ntfs_debug("End of attribute reached.  "
-					"Skipping to zone_pass_done.");
-			goto zone_pass_done;
-		}
-		if (likely(page)) {
-			if (need_writeback) {
-				ntfs_debug("Marking page dirty.");
-				flush_dcache_page(page);
-				set_page_dirty(page);
-				need_writeback = 0;
-			}
-			ntfs_unmap_page(page);
-		}
-		page = ntfs_map_page(mapping, last_read_pos >>
-				PAGE_SHIFT);
-		if (IS_ERR(page)) {
-			err = PTR_ERR(page);
-			ntfs_error(vol->sb, "Failed to map page.");
-			goto out;
-		}
-		buf_size = last_read_pos & ~PAGE_MASK;
-		buf = page_address(page) + buf_size;
-		buf_size = PAGE_SIZE - buf_size;
-		if (unlikely(last_read_pos + buf_size > i_size))
-			buf_size = i_size - last_read_pos;
-		buf_size <<= 3;
-		lcn = bmp_pos & 7;
-		bmp_pos &= ~(LCN)7;
-		ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, "
-				"bmp_pos 0x%llx, need_writeback %i.", buf_size,
-				(unsigned long long)lcn,
-				(unsigned long long)bmp_pos, need_writeback);
-		while (lcn < buf_size && lcn + bmp_pos < zone_end) {
-			byte = buf + (lcn >> 3);
-			ntfs_debug("In inner while loop: buf_size %i, "
-					"lcn 0x%llx, bmp_pos 0x%llx, "
-					"need_writeback %i, byte ofs 0x%x, "
-					"*byte 0x%x.", buf_size,
-					(unsigned long long)lcn,
-					(unsigned long long)bmp_pos,
-					need_writeback,
-					(unsigned int)(lcn >> 3),
-					(unsigned int)*byte);
-			/* Skip full bytes. */
-			if (*byte == 0xff) {
-				lcn = (lcn + 8) & ~(LCN)7;
-				ntfs_debug("Continuing while loop 1.");
-				continue;
-			}
-			bit = 1 << (lcn & 7);
-			ntfs_debug("bit 0x%x.", bit);
-			/* If the bit is already set, go onto the next one. */
-			if (*byte & bit) {
-				lcn++;
-				ntfs_debug("Continuing while loop 2.");
-				continue;
-			}
-			/*
-			 * Allocate more memory if needed, including space for
-			 * the terminator element.
-			 * ntfs_malloc_nofs() operates on whole pages only.
-			 */
-			if ((rlpos + 2) * sizeof(*rl) > rlsize) {
-				runlist_element *rl2;
-
-				ntfs_debug("Reallocating memory.");
-				if (!rl)
-					ntfs_debug("First free bit is at LCN "
-							"0x%llx.",
-							(unsigned long long)
-							(lcn + bmp_pos));
-				rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
-				if (unlikely(!rl2)) {
-					err = -ENOMEM;
-					ntfs_error(vol->sb, "Failed to "
-							"allocate memory.");
-					goto out;
-				}
-				memcpy(rl2, rl, rlsize);
-				ntfs_free(rl);
-				rl = rl2;
-				rlsize += PAGE_SIZE;
-				ntfs_debug("Reallocated memory, rlsize 0x%x.",
-						rlsize);
-			}
-			/* Allocate the bitmap bit. */
-			*byte |= bit;
-			/* We need to write this bitmap page to disk. */
-			need_writeback = 1;
-			ntfs_debug("*byte 0x%x, need_writeback is set.",
-					(unsigned int)*byte);
-			/*
-			 * Coalesce with previous run if adjacent LCNs.
-			 * Otherwise, append a new run.
-			 */
-			ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), "
-					"prev_lcn 0x%llx, lcn 0x%llx, "
-					"bmp_pos 0x%llx, prev_run_len 0x%llx, "
-					"rlpos %i.",
-					(unsigned long long)(lcn + bmp_pos),
-					1ULL, (unsigned long long)prev_lcn,
-					(unsigned long long)lcn,
-					(unsigned long long)bmp_pos,
-					(unsigned long long)prev_run_len,
-					rlpos);
-			if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) {
-				ntfs_debug("Coalescing to run (lcn 0x%llx, "
-						"len 0x%llx).",
-						(unsigned long long)
-						rl[rlpos - 1].lcn,
-						(unsigned long long)
-						rl[rlpos - 1].length);
-				rl[rlpos - 1].length = ++prev_run_len;
-				ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), "
-						"prev_run_len 0x%llx.",
-						(unsigned long long)
-						rl[rlpos - 1].lcn,
-						(unsigned long long)
-						rl[rlpos - 1].length,
-						(unsigned long long)
-						prev_run_len);
-			} else {
-				if (likely(rlpos)) {
-					ntfs_debug("Adding new run, (previous "
-							"run lcn 0x%llx, "
-							"len 0x%llx).",
-							(unsigned long long)
-							rl[rlpos - 1].lcn,
-							(unsigned long long)
-							rl[rlpos - 1].length);
-					rl[rlpos].vcn = rl[rlpos - 1].vcn +
-							prev_run_len;
-				} else {
-					ntfs_debug("Adding new run, is first "
-							"run.");
-					rl[rlpos].vcn = start_vcn;
-				}
-				rl[rlpos].lcn = prev_lcn = lcn + bmp_pos;
-				rl[rlpos].length = prev_run_len = 1;
-				rlpos++;
-			}
-			/* Done? */
-			if (!--clusters) {
-				LCN tc;
-				/*
-				 * Update the current zone position.  Positions
-				 * of already scanned zones have been updated
-				 * during the respective zone switches.
-				 */
-				tc = lcn + bmp_pos + 1;
-				ntfs_debug("Done. Updating current zone "
-						"position, tc 0x%llx, "
-						"search_zone %i.",
-						(unsigned long long)tc,
-						search_zone);
-				switch (search_zone) {
-				case 1:
-					ntfs_debug("Before checks, "
-							"vol->mft_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->mft_zone_pos);
-					if (tc >= vol->mft_zone_end) {
-						vol->mft_zone_pos =
-								vol->mft_lcn;
-						if (!vol->mft_zone_end)
-							vol->mft_zone_pos = 0;
-					} else if ((bmp_initial_pos >=
-							vol->mft_zone_pos ||
-							tc > vol->mft_zone_pos)
-							&& tc >= vol->mft_lcn)
-						vol->mft_zone_pos = tc;
-					ntfs_debug("After checks, "
-							"vol->mft_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->mft_zone_pos);
-					break;
-				case 2:
-					ntfs_debug("Before checks, "
-							"vol->data1_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data1_zone_pos);
-					if (tc >= vol->nr_clusters)
-						vol->data1_zone_pos =
-							     vol->mft_zone_end;
-					else if ((bmp_initial_pos >=
-						    vol->data1_zone_pos ||
-						    tc > vol->data1_zone_pos)
-						    && tc >= vol->mft_zone_end)
-						vol->data1_zone_pos = tc;
-					ntfs_debug("After checks, "
-							"vol->data1_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data1_zone_pos);
-					break;
-				case 4:
-					ntfs_debug("Before checks, "
-							"vol->data2_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data2_zone_pos);
-					if (tc >= vol->mft_zone_start)
-						vol->data2_zone_pos = 0;
-					else if (bmp_initial_pos >=
-						      vol->data2_zone_pos ||
-						      tc > vol->data2_zone_pos)
-						vol->data2_zone_pos = tc;
-					ntfs_debug("After checks, "
-							"vol->data2_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data2_zone_pos);
-					break;
-				default:
-					BUG();
-				}
-				ntfs_debug("Finished.  Going to out.");
-				goto out;
-			}
-			lcn++;
-		}
-		bmp_pos += buf_size;
-		ntfs_debug("After inner while loop: buf_size 0x%x, lcn "
-				"0x%llx, bmp_pos 0x%llx, need_writeback %i.",
-				buf_size, (unsigned long long)lcn,
-				(unsigned long long)bmp_pos, need_writeback);
-		if (bmp_pos < zone_end) {
-			ntfs_debug("Continuing outer while loop, "
-					"bmp_pos 0x%llx, zone_end 0x%llx.",
-					(unsigned long long)bmp_pos,
-					(unsigned long long)zone_end);
-			continue;
-		}
-zone_pass_done:	/* Finished with the current zone pass. */
-		ntfs_debug("At zone_pass_done, pass %i.", pass);
-		if (pass == 1) {
-			/*
-			 * Now do pass 2, scanning the first part of the zone
-			 * we omitted in pass 1.
-			 */
-			pass = 2;
-			zone_end = zone_start;
-			switch (search_zone) {
-			case 1: /* mft_zone */
-				zone_start = vol->mft_zone_start;
-				break;
-			case 2: /* data1_zone */
-				zone_start = vol->mft_zone_end;
-				break;
-			case 4: /* data2_zone */
-				zone_start = 0;
-				break;
-			default:
-				BUG();
-			}
-			/* Sanity check. */
-			if (zone_end < zone_start)
-				zone_end = zone_start;
-			bmp_pos = zone_start;
-			ntfs_debug("Continuing outer while loop, pass 2, "
-					"zone_start 0x%llx, zone_end 0x%llx, "
-					"bmp_pos 0x%llx.",
-					(unsigned long long)zone_start,
-					(unsigned long long)zone_end,
-					(unsigned long long)bmp_pos);
-			continue;
-		} /* pass == 2 */
-done_zones_check:
-		ntfs_debug("At done_zones_check, search_zone %i, done_zones "
-				"before 0x%x, done_zones after 0x%x.",
-				search_zone, done_zones,
-				done_zones | search_zone);
-		done_zones |= search_zone;
-		if (done_zones < 7) {
-			ntfs_debug("Switching zone.");
-			/* Now switch to the next zone we haven't done yet. */
-			pass = 1;
-			switch (search_zone) {
-			case 1:
-				ntfs_debug("Switching from mft zone to data1 "
-						"zone.");
-				/* Update mft zone position. */
-				if (rlpos) {
-					LCN tc;
-
-					ntfs_debug("Before checks, "
-							"vol->mft_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->mft_zone_pos);
-					tc = rl[rlpos - 1].lcn +
-							rl[rlpos - 1].length;
-					if (tc >= vol->mft_zone_end) {
-						vol->mft_zone_pos =
-								vol->mft_lcn;
-						if (!vol->mft_zone_end)
-							vol->mft_zone_pos = 0;
-					} else if ((bmp_initial_pos >=
-							vol->mft_zone_pos ||
-							tc > vol->mft_zone_pos)
-							&& tc >= vol->mft_lcn)
-						vol->mft_zone_pos = tc;
-					ntfs_debug("After checks, "
-							"vol->mft_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->mft_zone_pos);
-				}
-				/* Switch from mft zone to data1 zone. */
-switch_to_data1_zone:		search_zone = 2;
-				zone_start = bmp_initial_pos =
-						vol->data1_zone_pos;
-				zone_end = vol->nr_clusters;
-				if (zone_start == vol->mft_zone_end)
-					pass = 2;
-				if (zone_start >= zone_end) {
-					vol->data1_zone_pos = zone_start =
-							vol->mft_zone_end;
-					pass = 2;
-				}
-				break;
-			case 2:
-				ntfs_debug("Switching from data1 zone to "
-						"data2 zone.");
-				/* Update data1 zone position. */
-				if (rlpos) {
-					LCN tc;
-
-					ntfs_debug("Before checks, "
-							"vol->data1_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data1_zone_pos);
-					tc = rl[rlpos - 1].lcn +
-							rl[rlpos - 1].length;
-					if (tc >= vol->nr_clusters)
-						vol->data1_zone_pos =
-							     vol->mft_zone_end;
-					else if ((bmp_initial_pos >=
-						    vol->data1_zone_pos ||
-						    tc > vol->data1_zone_pos)
-						    && tc >= vol->mft_zone_end)
-						vol->data1_zone_pos = tc;
-					ntfs_debug("After checks, "
-							"vol->data1_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data1_zone_pos);
-				}
-				/* Switch from data1 zone to data2 zone. */
-				search_zone = 4;
-				zone_start = bmp_initial_pos =
-						vol->data2_zone_pos;
-				zone_end = vol->mft_zone_start;
-				if (!zone_start)
-					pass = 2;
-				if (zone_start >= zone_end) {
-					vol->data2_zone_pos = zone_start =
-							bmp_initial_pos = 0;
-					pass = 2;
-				}
-				break;
-			case 4:
-				ntfs_debug("Switching from data2 zone to "
-						"data1 zone.");
-				/* Update data2 zone position. */
-				if (rlpos) {
-					LCN tc;
-
-					ntfs_debug("Before checks, "
-							"vol->data2_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data2_zone_pos);
-					tc = rl[rlpos - 1].lcn +
-							rl[rlpos - 1].length;
-					if (tc >= vol->mft_zone_start)
-						vol->data2_zone_pos = 0;
-					else if (bmp_initial_pos >=
-						      vol->data2_zone_pos ||
-						      tc > vol->data2_zone_pos)
-						vol->data2_zone_pos = tc;
-					ntfs_debug("After checks, "
-							"vol->data2_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data2_zone_pos);
-				}
-				/* Switch from data2 zone to data1 zone. */
-				goto switch_to_data1_zone;
-			default:
-				BUG();
-			}
-			ntfs_debug("After zone switch, search_zone %i, "
-					"pass %i, bmp_initial_pos 0x%llx, "
-					"zone_start 0x%llx, zone_end 0x%llx.",
-					search_zone, pass,
-					(unsigned long long)bmp_initial_pos,
-					(unsigned long long)zone_start,
-					(unsigned long long)zone_end);
-			bmp_pos = zone_start;
-			if (zone_start == zone_end) {
-				ntfs_debug("Empty zone, going to "
-						"done_zones_check.");
-				/* Empty zone. Don't bother searching it. */
-				goto done_zones_check;
-			}
-			ntfs_debug("Continuing outer while loop.");
-			continue;
-		} /* done_zones == 7 */
-		ntfs_debug("All zones are finished.");
-		/*
-		 * All zones are finished!  If DATA_ZONE, shrink mft zone.  If
-		 * MFT_ZONE, we have really run out of space.
-		 */
-		mft_zone_size = vol->mft_zone_end - vol->mft_zone_start;
-		ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end "
-				"0x%llx, mft_zone_size 0x%llx.",
-				(unsigned long long)vol->mft_zone_start,
-				(unsigned long long)vol->mft_zone_end,
-				(unsigned long long)mft_zone_size);
-		if (zone == MFT_ZONE || mft_zone_size <= 0) {
-			ntfs_debug("No free clusters left, going to out.");
-			/* Really no more space left on device. */
-			err = -ENOSPC;
-			goto out;
-		} /* zone == DATA_ZONE && mft_zone_size > 0 */
-		ntfs_debug("Shrinking mft zone.");
-		zone_end = vol->mft_zone_end;
-		mft_zone_size >>= 1;
-		if (mft_zone_size > 0)
-			vol->mft_zone_end = vol->mft_zone_start + mft_zone_size;
-		else /* mft zone and data2 zone no longer exist. */
-			vol->data2_zone_pos = vol->mft_zone_start =
-					vol->mft_zone_end = 0;
-		if (vol->mft_zone_pos >= vol->mft_zone_end) {
-			vol->mft_zone_pos = vol->mft_lcn;
-			if (!vol->mft_zone_end)
-				vol->mft_zone_pos = 0;
-		}
-		bmp_pos = zone_start = bmp_initial_pos =
-				vol->data1_zone_pos = vol->mft_zone_end;
-		search_zone = 2;
-		pass = 2;
-		done_zones &= ~2;
-		ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, "
-				"vol->mft_zone_start 0x%llx, "
-				"vol->mft_zone_end 0x%llx, "
-				"vol->mft_zone_pos 0x%llx, search_zone 2, "
-				"pass 2, dones_zones 0x%x, zone_start 0x%llx, "
-				"zone_end 0x%llx, vol->data1_zone_pos 0x%llx, "
-				"continuing outer while loop.",
-				(unsigned long long)mft_zone_size,
-				(unsigned long long)vol->mft_zone_start,
-				(unsigned long long)vol->mft_zone_end,
-				(unsigned long long)vol->mft_zone_pos,
-				done_zones, (unsigned long long)zone_start,
-				(unsigned long long)zone_end,
-				(unsigned long long)vol->data1_zone_pos);
-	}
-	ntfs_debug("After outer while loop.");
-out:
-	ntfs_debug("At out.");
-	/* Add runlist terminator element. */
-	if (likely(rl)) {
-		rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
-		rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED;
-		rl[rlpos].length = 0;
-	}
-	if (likely(page && !IS_ERR(page))) {
-		if (need_writeback) {
-			ntfs_debug("Marking page dirty.");
-			flush_dcache_page(page);
-			set_page_dirty(page);
-			need_writeback = 0;
-		}
-		ntfs_unmap_page(page);
-	}
-	if (likely(!err)) {
-		up_write(&vol->lcnbmp_lock);
-		ntfs_debug("Done.");
-		return rl;
-	}
-	ntfs_error(vol->sb, "Failed to allocate clusters, aborting "
-			"(error %i).", err);
-	if (rl) {
-		int err2;
-
-		if (err == -ENOSPC)
-			ntfs_debug("Not enough space to complete allocation, "
-					"err -ENOSPC, first free lcn 0x%llx, "
-					"could allocate up to 0x%llx "
-					"clusters.",
-					(unsigned long long)rl[0].lcn,
-					(unsigned long long)(count - clusters));
-		/* Deallocate all allocated clusters. */
-		ntfs_debug("Attempting rollback...");
-		err2 = ntfs_cluster_free_from_rl_nolock(vol, rl);
-		if (err2) {
-			ntfs_error(vol->sb, "Failed to rollback (error %i).  "
-					"Leaving inconsistent metadata!  "
-					"Unmount and run chkdsk.", err2);
-			NVolSetErrors(vol);
-		}
-		/* Free the runlist. */
-		ntfs_free(rl);
-	} else if (err == -ENOSPC)
-		ntfs_debug("No space left at all, err = -ENOSPC, first free "
-				"lcn = 0x%llx.",
-				(long long)vol->data1_zone_pos);
-	up_write(&vol->lcnbmp_lock);
-	return ERR_PTR(err);
-}
-
-/**
- * __ntfs_cluster_free - free clusters on an ntfs volume
- * @ni:		ntfs inode whose runlist describes the clusters to free
- * @start_vcn:	vcn in the runlist of @ni at which to start freeing clusters
- * @count:	number of clusters to free or -1 for all clusters
- * @ctx:	active attribute search context if present or NULL if not
- * @is_rollback:	true if this is a rollback operation
- *
- * Free @count clusters starting at the cluster @start_vcn in the runlist
- * described by the vfs inode @ni.
- *
- * If @count is -1, all clusters from @start_vcn to the end of the runlist are
- * deallocated.  Thus, to completely free all clusters in a runlist, use
- * @start_vcn = 0 and @count = -1.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record.  This is needed when __ntfs_cluster_free() encounters unmapped
- * runlist fragments and allows their mapping.  If you do not have the mft
- * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will
- * perform the necessary mapping and unmapping.
- *
- * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it
- * before returning.  Thus, @ctx will be left pointing to the same attribute on
- * return as on entry.  However, the actual pointers in @ctx may point to
- * different memory locations on return, so you must remember to reset any
- * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(),
- * you will probably want to do:
- *	m = ctx->mrec;
- *	a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- *
- * @is_rollback should always be 'false', it is for internal use to rollback
- * errors.  You probably want to use ntfs_cluster_free() instead.
- *
- * Note, __ntfs_cluster_free() does not modify the runlist, so you have to
- * remove from the runlist or mark sparse the freed runs later.
- *
- * Return the number of deallocated clusters (not counting sparse ones) on
- * success and -errno on error.
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- *	    is no longer valid, i.e. you need to either call
- *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
- *	    why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- *	      and is locked on return.  Note the runlist may be modified when
- *	      needed runlist fragments need to be mapped.
- *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
- *	      on return.
- *	    - This function takes the volume lcn bitmap lock for writing and
- *	      modifies the bitmap contents.
- *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
- *	      entry and it will be left unmapped on return.
- *	    - If @ctx is not NULL, the base mft record must be mapped on entry
- *	      and it will be left mapped on return.
- */
-s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
-		ntfs_attr_search_ctx *ctx, const bool is_rollback)
-{
-	s64 delta, to_free, total_freed, real_freed;
-	ntfs_volume *vol;
-	struct inode *lcnbmp_vi;
-	runlist_element *rl;
-	int err;
-
-	BUG_ON(!ni);
-	ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count "
-			"0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn,
-			(unsigned long long)count,
-			is_rollback ? " (rollback)" : "");
-	vol = ni->vol;
-	lcnbmp_vi = vol->lcnbmp_ino;
-	BUG_ON(!lcnbmp_vi);
-	BUG_ON(start_vcn < 0);
-	BUG_ON(count < -1);
-	/*
-	 * Lock the lcn bitmap for writing but only if not rolling back.  We
-	 * must hold the lock all the way including through rollback otherwise
-	 * rollback is not possible because once we have cleared a bit and
-	 * dropped the lock, anyone could have set the bit again, thus
-	 * allocating the cluster for another use.
-	 */
-	if (likely(!is_rollback))
-		down_write(&vol->lcnbmp_lock);
-
-	total_freed = real_freed = 0;
-
-	rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx);
-	if (IS_ERR(rl)) {
-		if (!is_rollback)
-			ntfs_error(vol->sb, "Failed to find first runlist "
-					"element (error %li), aborting.",
-					PTR_ERR(rl));
-		err = PTR_ERR(rl);
-		goto err_out;
-	}
-	if (unlikely(rl->lcn < LCN_HOLE)) {
-		if (!is_rollback)
-			ntfs_error(vol->sb, "First runlist element has "
-					"invalid lcn, aborting.");
-		err = -EIO;
-		goto err_out;
-	}
-	/* Find the starting cluster inside the run that needs freeing. */
-	delta = start_vcn - rl->vcn;
-
-	/* The number of clusters in this run that need freeing. */
-	to_free = rl->length - delta;
-	if (count >= 0 && to_free > count)
-		to_free = count;
-
-	if (likely(rl->lcn >= 0)) {
-		/* Do the actual freeing of the clusters in this run. */
-		err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta,
-				to_free, likely(!is_rollback) ? 0 : 1);
-		if (unlikely(err)) {
-			if (!is_rollback)
-				ntfs_error(vol->sb, "Failed to clear first run "
-						"(error %i), aborting.", err);
-			goto err_out;
-		}
-		/* We have freed @to_free real clusters. */
-		real_freed = to_free;
-	};
-	/* Go to the next run and adjust the number of clusters left to free. */
-	++rl;
-	if (count >= 0)
-		count -= to_free;
-
-	/* Keep track of the total "freed" clusters, including sparse ones. */
-	total_freed = to_free;
-	/*
-	 * Loop over the remaining runs, using @count as a capping value, and
-	 * free them.
-	 */
-	for (; rl->length && count != 0; ++rl) {
-		if (unlikely(rl->lcn < LCN_HOLE)) {
-			VCN vcn;
-
-			/* Attempt to map runlist. */
-			vcn = rl->vcn;
-			rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
-			if (IS_ERR(rl)) {
-				err = PTR_ERR(rl);
-				if (!is_rollback)
-					ntfs_error(vol->sb, "Failed to map "
-							"runlist fragment or "
-							"failed to find "
-							"subsequent runlist "
-							"element.");
-				goto err_out;
-			}
-			if (unlikely(rl->lcn < LCN_HOLE)) {
-				if (!is_rollback)
-					ntfs_error(vol->sb, "Runlist element "
-							"has invalid lcn "
-							"(0x%llx).",
-							(unsigned long long)
-							rl->lcn);
-				err = -EIO;
-				goto err_out;
-			}
-		}
-		/* The number of clusters in this run that need freeing. */
-		to_free = rl->length;
-		if (count >= 0 && to_free > count)
-			to_free = count;
-
-		if (likely(rl->lcn >= 0)) {
-			/* Do the actual freeing of the clusters in the run. */
-			err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn,
-					to_free, likely(!is_rollback) ? 0 : 1);
-			if (unlikely(err)) {
-				if (!is_rollback)
-					ntfs_error(vol->sb, "Failed to clear "
-							"subsequent run.");
-				goto err_out;
-			}
-			/* We have freed @to_free real clusters. */
-			real_freed += to_free;
-		}
-		/* Adjust the number of clusters left to free. */
-		if (count >= 0)
-			count -= to_free;
-	
-		/* Update the total done clusters. */
-		total_freed += to_free;
-	}
-	if (likely(!is_rollback))
-		up_write(&vol->lcnbmp_lock);
-
-	BUG_ON(count > 0);
-
-	/* We are done.  Return the number of actually freed clusters. */
-	ntfs_debug("Done.");
-	return real_freed;
-err_out:
-	if (is_rollback)
-		return err;
-	/* If no real clusters were freed, no need to rollback. */
-	if (!real_freed) {
-		up_write(&vol->lcnbmp_lock);
-		return err;
-	}
-	/*
-	 * Attempt to rollback and if that succeeds just return the error code.
-	 * If rollback fails, set the volume errors flag, emit an error
-	 * message, and return the error code.
-	 */
-	delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true);
-	if (delta < 0) {
-		ntfs_error(vol->sb, "Failed to rollback (error %i).  Leaving "
-				"inconsistent metadata!  Unmount and run "
-				"chkdsk.", (int)delta);
-		NVolSetErrors(vol);
-	}
-	up_write(&vol->lcnbmp_lock);
-	ntfs_error(vol->sb, "Aborting (error %i).", err);
-	return err;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
deleted file mode 100644
index 1589a6d8434b..000000000000
--- a/fs/ntfs/lcnalloc.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation.  Part of the
- *		Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_LCNALLOC_H
-#define _LINUX_NTFS_LCNALLOC_H
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-
-#include "attrib.h"
-#include "types.h"
-#include "inode.h"
-#include "runlist.h"
-#include "volume.h"
-
-typedef enum {
-	FIRST_ZONE	= 0,	/* For sanity checking. */
-	MFT_ZONE	= 0,	/* Allocate from $MFT zone. */
-	DATA_ZONE	= 1,	/* Allocate from $DATA zone. */
-	LAST_ZONE	= 1,	/* For sanity checking. */
-} NTFS_CLUSTER_ALLOCATION_ZONES;
-
-extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
-		const VCN start_vcn, const s64 count, const LCN start_lcn,
-		const NTFS_CLUSTER_ALLOCATION_ZONES zone,
-		const bool is_extension);
-
-extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
-		s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback);
-
-/**
- * ntfs_cluster_free - free clusters on an ntfs volume
- * @ni:		ntfs inode whose runlist describes the clusters to free
- * @start_vcn:	vcn in the runlist of @ni at which to start freeing clusters
- * @count:	number of clusters to free or -1 for all clusters
- * @ctx:	active attribute search context if present or NULL if not
- *
- * Free @count clusters starting at the cluster @start_vcn in the runlist
- * described by the ntfs inode @ni.
- *
- * If @count is -1, all clusters from @start_vcn to the end of the runlist are
- * deallocated.  Thus, to completely free all clusters in a runlist, use
- * @start_vcn = 0 and @count = -1.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record.  This is needed when ntfs_cluster_free() encounters unmapped runlist
- * fragments and allows their mapping.  If you do not have the mft record
- * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform
- * the necessary mapping and unmapping.
- *
- * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it
- * before returning.  Thus, @ctx will be left pointing to the same attribute on
- * return as on entry.  However, the actual pointers in @ctx may point to
- * different memory locations on return, so you must remember to reset any
- * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(),
- * you will probably want to do:
- *	m = ctx->mrec;
- *	a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- *
- * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove
- * from the runlist or mark sparse the freed runs later.
- *
- * Return the number of deallocated clusters (not counting sparse ones) on
- * success and -errno on error.
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- *	    is no longer valid, i.e. you need to either call
- *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
- *	    why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- *	      and is locked on return.  Note the runlist may be modified when
- *	      needed runlist fragments need to be mapped.
- *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
- *	      on return.
- *	    - This function takes the volume lcn bitmap lock for writing and
- *	      modifies the bitmap contents.
- *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
- *	      entry and it will be left unmapped on return.
- *	    - If @ctx is not NULL, the base mft record must be mapped on entry
- *	      and it will be left mapped on return.
- */
-static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
-		s64 count, ntfs_attr_search_ctx *ctx)
-{
-	return __ntfs_cluster_free(ni, start_vcn, count, ctx, false);
-}
-
-extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
-		const runlist_element *rl);
-
-/**
- * ntfs_cluster_free_from_rl - free clusters from runlist
- * @vol:	mounted ntfs volume on which to free the clusters
- * @rl:		runlist describing the clusters to free
- *
- * Free all the clusters described by the runlist @rl on the volume @vol.  In
- * the case of an error being returned, at least some of the clusters were not
- * freed.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - This function takes the volume lcn bitmap lock for writing and
- *	      modifies the bitmap contents.
- *	    - The caller must have locked the runlist @rl for reading or
- *	      writing.
- */
-static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol,
-		const runlist_element *rl)
-{
-	int ret;
-
-	down_write(&vol->lcnbmp_lock);
-	ret = ntfs_cluster_free_from_rl_nolock(vol, rl);
-	up_write(&vol->lcnbmp_lock);
-	return ret;
-}
-
-#endif /* NTFS_RW */
-
-#endif /* defined _LINUX_NTFS_LCNALLOC_H */
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
deleted file mode 100644
index 6ce60ffc6ac0..000000000000
--- a/fs/ntfs/logfile.c
+++ /dev/null
@@ -1,849 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2002-2007 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/highmem.h>
-#include <linux/buffer_head.h>
-#include <linux/bitops.h>
-#include <linux/log2.h>
-#include <linux/bio.h>
-
-#include "attrib.h"
-#include "aops.h"
-#include "debug.h"
-#include "logfile.h"
-#include "malloc.h"
-#include "volume.h"
-#include "ntfs.h"
-
-/**
- * ntfs_check_restart_page_header - check the page header for consistency
- * @vi:		$LogFile inode to which the restart page header belongs
- * @rp:		restart page header to check
- * @pos:	position in @vi at which the restart page header resides
- *
- * Check the restart page header @rp for consistency and return 'true' if it is
- * consistent and 'false' otherwise.
- *
- * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
- * require the full restart page.
- */
-static bool ntfs_check_restart_page_header(struct inode *vi,
-		RESTART_PAGE_HEADER *rp, s64 pos)
-{
-	u32 logfile_system_page_size, logfile_log_page_size;
-	u16 ra_ofs, usa_count, usa_ofs, usa_end = 0;
-	bool have_usa = true;
-
-	ntfs_debug("Entering.");
-	/*
-	 * If the system or log page sizes are smaller than the ntfs block size
-	 * or either is not a power of 2 we cannot handle this log file.
-	 */
-	logfile_system_page_size = le32_to_cpu(rp->system_page_size);
-	logfile_log_page_size = le32_to_cpu(rp->log_page_size);
-	if (logfile_system_page_size < NTFS_BLOCK_SIZE ||
-			logfile_log_page_size < NTFS_BLOCK_SIZE ||
-			logfile_system_page_size &
-			(logfile_system_page_size - 1) ||
-			!is_power_of_2(logfile_log_page_size)) {
-		ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
-		return false;
-	}
-	/*
-	 * We must be either at !pos (1st restart page) or at pos = system page
-	 * size (2nd restart page).
-	 */
-	if (pos && pos != logfile_system_page_size) {
-		ntfs_error(vi->i_sb, "Found restart area in incorrect "
-				"position in $LogFile.");
-		return false;
-	}
-	/* We only know how to handle version 1.1. */
-	if (sle16_to_cpu(rp->major_ver) != 1 ||
-			sle16_to_cpu(rp->minor_ver) != 1) {
-		ntfs_error(vi->i_sb, "$LogFile version %i.%i is not "
-				"supported.  (This driver supports version "
-				"1.1 only.)", (int)sle16_to_cpu(rp->major_ver),
-				(int)sle16_to_cpu(rp->minor_ver));
-		return false;
-	}
-	/*
-	 * If chkdsk has been run the restart page may not be protected by an
-	 * update sequence array.
-	 */
-	if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) {
-		have_usa = false;
-		goto skip_usa_checks;
-	}
-	/* Verify the size of the update sequence array. */
-	usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS);
-	if (usa_count != le16_to_cpu(rp->usa_count)) {
-		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
-				"inconsistent update sequence array count.");
-		return false;
-	}
-	/* Verify the position of the update sequence array. */
-	usa_ofs = le16_to_cpu(rp->usa_ofs);
-	usa_end = usa_ofs + usa_count * sizeof(u16);
-	if (usa_ofs < sizeof(RESTART_PAGE_HEADER) ||
-			usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) {
-		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
-				"inconsistent update sequence array offset.");
-		return false;
-	}
-skip_usa_checks:
-	/*
-	 * Verify the position of the restart area.  It must be:
-	 *	- aligned to 8-byte boundary,
-	 *	- after the update sequence array, and
-	 *	- within the system page size.
-	 */
-	ra_ofs = le16_to_cpu(rp->restart_area_offset);
-	if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end :
-			ra_ofs < sizeof(RESTART_PAGE_HEADER)) ||
-			ra_ofs > logfile_system_page_size) {
-		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
-				"inconsistent restart area offset.");
-		return false;
-	}
-	/*
-	 * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn
-	 * set.
-	 */
-	if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) {
-		ntfs_error(vi->i_sb, "$LogFile restart page is not modified "
-				"by chkdsk but a chkdsk LSN is specified.");
-		return false;
-	}
-	ntfs_debug("Done.");
-	return true;
-}
-
-/**
- * ntfs_check_restart_area - check the restart area for consistency
- * @vi:		$LogFile inode to which the restart page belongs
- * @rp:		restart page whose restart area to check
- *
- * Check the restart area of the restart page @rp for consistency and return
- * 'true' if it is consistent and 'false' otherwise.
- *
- * This function assumes that the restart page header has already been
- * consistency checked.
- *
- * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
- * require the full restart page.
- */
-static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
-{
-	u64 file_size;
-	RESTART_AREA *ra;
-	u16 ra_ofs, ra_len, ca_ofs;
-	u8 fs_bits;
-
-	ntfs_debug("Entering.");
-	ra_ofs = le16_to_cpu(rp->restart_area_offset);
-	ra = (RESTART_AREA*)((u8*)rp + ra_ofs);
-	/*
-	 * Everything before ra->file_size must be before the first word
-	 * protected by an update sequence number.  This ensures that it is
-	 * safe to access ra->client_array_offset.
-	 */
-	if (ra_ofs + offsetof(RESTART_AREA, file_size) >
-			NTFS_BLOCK_SIZE - sizeof(u16)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent file offset.");
-		return false;
-	}
-	/*
-	 * Now that we can access ra->client_array_offset, make sure everything
-	 * up to the log client array is before the first word protected by an
-	 * update sequence number.  This ensures we can access all of the
-	 * restart area elements safely.  Also, the client array offset must be
-	 * aligned to an 8-byte boundary.
-	 */
-	ca_ofs = le16_to_cpu(ra->client_array_offset);
-	if (((ca_ofs + 7) & ~7) != ca_ofs ||
-			ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent client array offset.");
-		return false;
-	}
-	/*
-	 * The restart area must end within the system page size both when
-	 * calculated manually and as specified by ra->restart_area_length.
-	 * Also, the calculated length must not exceed the specified length.
-	 */
-	ra_len = ca_ofs + le16_to_cpu(ra->log_clients) *
-			sizeof(LOG_CLIENT_RECORD);
-	if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) ||
-			ra_ofs + le16_to_cpu(ra->restart_area_length) >
-			le32_to_cpu(rp->system_page_size) ||
-			ra_len > le16_to_cpu(ra->restart_area_length)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds "
-				"of the system page size specified by the "
-				"restart page header and/or the specified "
-				"restart area length is inconsistent.");
-		return false;
-	}
-	/*
-	 * The ra->client_free_list and ra->client_in_use_list must be either
-	 * LOGFILE_NO_CLIENT or less than ra->log_clients or they are
-	 * overflowing the client array.
-	 */
-	if ((ra->client_free_list != LOGFILE_NO_CLIENT &&
-			le16_to_cpu(ra->client_free_list) >=
-			le16_to_cpu(ra->log_clients)) ||
-			(ra->client_in_use_list != LOGFILE_NO_CLIENT &&
-			le16_to_cpu(ra->client_in_use_list) >=
-			le16_to_cpu(ra->log_clients))) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"overflowing client free and/or in use lists.");
-		return false;
-	}
-	/*
-	 * Check ra->seq_number_bits against ra->file_size for consistency.
-	 * We cannot just use ffs() because the file size is not a power of 2.
-	 */
-	file_size = (u64)sle64_to_cpu(ra->file_size);
-	fs_bits = 0;
-	while (file_size) {
-		file_size >>= 1;
-		fs_bits++;
-	}
-	if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent sequence number bits.");
-		return false;
-	}
-	/* The log record header length must be a multiple of 8. */
-	if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) !=
-			le16_to_cpu(ra->log_record_header_length)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent log record header length.");
-		return false;
-	}
-	/* Dito for the log page data offset. */
-	if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) !=
-			le16_to_cpu(ra->log_page_data_offset)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent log page data offset.");
-		return false;
-	}
-	ntfs_debug("Done.");
-	return true;
-}
-
-/**
- * ntfs_check_log_client_array - check the log client array for consistency
- * @vi:		$LogFile inode to which the restart page belongs
- * @rp:		restart page whose log client array to check
- *
- * Check the log client array of the restart page @rp for consistency and
- * return 'true' if it is consistent and 'false' otherwise.
- *
- * This function assumes that the restart page header and the restart area have
- * already been consistency checked.
- *
- * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this
- * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full
- * restart page and the page must be multi sector transfer deprotected.
- */
-static bool ntfs_check_log_client_array(struct inode *vi,
-		RESTART_PAGE_HEADER *rp)
-{
-	RESTART_AREA *ra;
-	LOG_CLIENT_RECORD *ca, *cr;
-	u16 nr_clients, idx;
-	bool in_free_list, idx_is_first;
-
-	ntfs_debug("Entering.");
-	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
-	ca = (LOG_CLIENT_RECORD*)((u8*)ra +
-			le16_to_cpu(ra->client_array_offset));
-	/*
-	 * Check the ra->client_free_list first and then check the
-	 * ra->client_in_use_list.  Check each of the log client records in
-	 * each of the lists and check that the array does not overflow the
-	 * ra->log_clients value.  Also keep track of the number of records
-	 * visited as there cannot be more than ra->log_clients records and
-	 * that way we detect eventual loops in within a list.
-	 */
-	nr_clients = le16_to_cpu(ra->log_clients);
-	idx = le16_to_cpu(ra->client_free_list);
-	in_free_list = true;
-check_list:
-	for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--,
-			idx = le16_to_cpu(cr->next_client)) {
-		if (!nr_clients || idx >= le16_to_cpu(ra->log_clients))
-			goto err_out;
-		/* Set @cr to the current log client record. */
-		cr = ca + idx;
-		/* The first log client record must not have a prev_client. */
-		if (idx_is_first) {
-			if (cr->prev_client != LOGFILE_NO_CLIENT)
-				goto err_out;
-			idx_is_first = false;
-		}
-	}
-	/* Switch to and check the in use list if we just did the free list. */
-	if (in_free_list) {
-		in_free_list = false;
-		idx = le16_to_cpu(ra->client_in_use_list);
-		goto check_list;
-	}
-	ntfs_debug("Done.");
-	return true;
-err_out:
-	ntfs_error(vi->i_sb, "$LogFile log client array is corrupt.");
-	return false;
-}
-
-/**
- * ntfs_check_and_load_restart_page - check the restart page for consistency
- * @vi:		$LogFile inode to which the restart page belongs
- * @rp:		restart page to check
- * @pos:	position in @vi at which the restart page resides
- * @wrp:	[OUT] copy of the multi sector transfer deprotected restart page
- * @lsn:	[OUT] set to the current logfile lsn on success
- *
- * Check the restart page @rp for consistency and return 0 if it is consistent
- * and -errno otherwise.  The restart page may have been modified by chkdsk in
- * which case its magic is CHKD instead of RSTR.
- *
- * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
- * require the full restart page.
- *
- * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a
- * copy of the complete multi sector transfer deprotected page.  On failure,
- * *@wrp is undefined.
- *
- * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current
- * logfile lsn according to this restart page.  On failure, *@lsn is undefined.
- *
- * The following error codes are defined:
- *	-EINVAL	- The restart page is inconsistent.
- *	-ENOMEM	- Not enough memory to load the restart page.
- *	-EIO	- Failed to reading from $LogFile.
- */
-static int ntfs_check_and_load_restart_page(struct inode *vi,
-		RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp,
-		LSN *lsn)
-{
-	RESTART_AREA *ra;
-	RESTART_PAGE_HEADER *trp;
-	int size, err;
-
-	ntfs_debug("Entering.");
-	/* Check the restart page header for consistency. */
-	if (!ntfs_check_restart_page_header(vi, rp, pos)) {
-		/* Error output already done inside the function. */
-		return -EINVAL;
-	}
-	/* Check the restart area for consistency. */
-	if (!ntfs_check_restart_area(vi, rp)) {
-		/* Error output already done inside the function. */
-		return -EINVAL;
-	}
-	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
-	/*
-	 * Allocate a buffer to store the whole restart page so we can multi
-	 * sector transfer deprotect it.
-	 */
-	trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size));
-	if (!trp) {
-		ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile "
-				"restart page buffer.");
-		return -ENOMEM;
-	}
-	/*
-	 * Read the whole of the restart page into the buffer.  If it fits
-	 * completely inside @rp, just copy it from there.  Otherwise map all
-	 * the required pages and copy the data from them.
-	 */
-	size = PAGE_SIZE - (pos & ~PAGE_MASK);
-	if (size >= le32_to_cpu(rp->system_page_size)) {
-		memcpy(trp, rp, le32_to_cpu(rp->system_page_size));
-	} else {
-		pgoff_t idx;
-		struct page *page;
-		int have_read, to_read;
-
-		/* First copy what we already have in @rp. */
-		memcpy(trp, rp, size);
-		/* Copy the remaining data one page at a time. */
-		have_read = size;
-		to_read = le32_to_cpu(rp->system_page_size) - size;
-		idx = (pos + size) >> PAGE_SHIFT;
-		BUG_ON((pos + size) & ~PAGE_MASK);
-		do {
-			page = ntfs_map_page(vi->i_mapping, idx);
-			if (IS_ERR(page)) {
-				ntfs_error(vi->i_sb, "Error mapping $LogFile "
-						"page (index %lu).", idx);
-				err = PTR_ERR(page);
-				if (err != -EIO && err != -ENOMEM)
-					err = -EIO;
-				goto err_out;
-			}
-			size = min_t(int, to_read, PAGE_SIZE);
-			memcpy((u8*)trp + have_read, page_address(page), size);
-			ntfs_unmap_page(page);
-			have_read += size;
-			to_read -= size;
-			idx++;
-		} while (to_read > 0);
-	}
-	/*
-	 * Perform the multi sector transfer deprotection on the buffer if the
-	 * restart page is protected.
-	 */
-	if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count))
-			&& post_read_mst_fixup((NTFS_RECORD*)trp,
-			le32_to_cpu(rp->system_page_size))) {
-		/*
-		 * A multi sector tranfer error was detected.  We only need to
-		 * abort if the restart page contents exceed the multi sector
-		 * transfer fixup of the first sector.
-		 */
-		if (le16_to_cpu(rp->restart_area_offset) +
-				le16_to_cpu(ra->restart_area_length) >
-				NTFS_BLOCK_SIZE - sizeof(u16)) {
-			ntfs_error(vi->i_sb, "Multi sector transfer error "
-					"detected in $LogFile restart page.");
-			err = -EINVAL;
-			goto err_out;
-		}
-	}
-	/*
-	 * If the restart page is modified by chkdsk or there are no active
-	 * logfile clients, the logfile is consistent.  Otherwise, need to
-	 * check the log client records for consistency, too.
-	 */
-	err = 0;
-	if (ntfs_is_rstr_record(rp->magic) &&
-			ra->client_in_use_list != LOGFILE_NO_CLIENT) {
-		if (!ntfs_check_log_client_array(vi, trp)) {
-			err = -EINVAL;
-			goto err_out;
-		}
-	}
-	if (lsn) {
-		if (ntfs_is_rstr_record(rp->magic))
-			*lsn = sle64_to_cpu(ra->current_lsn);
-		else /* if (ntfs_is_chkd_record(rp->magic)) */
-			*lsn = sle64_to_cpu(rp->chkdsk_lsn);
-	}
-	ntfs_debug("Done.");
-	if (wrp)
-		*wrp = trp;
-	else {
-err_out:
-		ntfs_free(trp);
-	}
-	return err;
-}
-
-/**
- * ntfs_check_logfile - check the journal for consistency
- * @log_vi:	struct inode of loaded journal $LogFile to check
- * @rp:		[OUT] on success this is a copy of the current restart page
- *
- * Check the $LogFile journal for consistency and return 'true' if it is
- * consistent and 'false' if not.  On success, the current restart page is
- * returned in *@rp.  Caller must call ntfs_free(*@rp) when finished with it.
- *
- * At present we only check the two restart pages and ignore the log record
- * pages.
- *
- * Note that the MstProtected flag is not set on the $LogFile inode and hence
- * when reading pages they are not deprotected.  This is because we do not know
- * if the $LogFile was created on a system with a different page size to ours
- * yet and mst deprotection would fail if our page size is smaller.
- */
-bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
-{
-	s64 size, pos;
-	LSN rstr1_lsn, rstr2_lsn;
-	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
-	struct address_space *mapping = log_vi->i_mapping;
-	struct page *page = NULL;
-	u8 *kaddr = NULL;
-	RESTART_PAGE_HEADER *rstr1_ph = NULL;
-	RESTART_PAGE_HEADER *rstr2_ph = NULL;
-	int log_page_size, err;
-	bool logfile_is_empty = true;
-	u8 log_page_bits;
-
-	ntfs_debug("Entering.");
-	/* An empty $LogFile must have been clean before it got emptied. */
-	if (NVolLogFileEmpty(vol))
-		goto is_empty;
-	size = i_size_read(log_vi);
-	/* Make sure the file doesn't exceed the maximum allowed size. */
-	if (size > MaxLogFileSize)
-		size = MaxLogFileSize;
-	/*
-	 * Truncate size to a multiple of the page cache size or the default
-	 * log page size if the page cache size is between the default log page
-	 * log page size if the page cache size is between the default log page
-	 * size and twice that.
-	 */
-	if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <=
-			DefaultLogPageSize * 2)
-		log_page_size = DefaultLogPageSize;
-	else
-		log_page_size = PAGE_SIZE;
-	/*
-	 * Use ntfs_ffs() instead of ffs() to enable the compiler to
-	 * optimize log_page_size and log_page_bits into constants.
-	 */
-	log_page_bits = ntfs_ffs(log_page_size) - 1;
-	size &= ~(s64)(log_page_size - 1);
-	/*
-	 * Ensure the log file is big enough to store at least the two restart
-	 * pages and the minimum number of log record pages.
-	 */
-	if (size < log_page_size * 2 || (size - log_page_size * 2) >>
-			log_page_bits < MinLogRecordPages) {
-		ntfs_error(vol->sb, "$LogFile is too small.");
-		return false;
-	}
-	/*
-	 * Read through the file looking for a restart page.  Since the restart
-	 * page header is at the beginning of a page we only need to search at
-	 * what could be the beginning of a page (for each page size) rather
-	 * than scanning the whole file byte by byte.  If all potential places
-	 * contain empty and uninitialzed records, the log file can be assumed
-	 * to be empty.
-	 */
-	for (pos = 0; pos < size; pos <<= 1) {
-		pgoff_t idx = pos >> PAGE_SHIFT;
-		if (!page || page->index != idx) {
-			if (page)
-				ntfs_unmap_page(page);
-			page = ntfs_map_page(mapping, idx);
-			if (IS_ERR(page)) {
-				ntfs_error(vol->sb, "Error mapping $LogFile "
-						"page (index %lu).", idx);
-				goto err_out;
-			}
-		}
-		kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK);
-		/*
-		 * A non-empty block means the logfile is not empty while an
-		 * empty block after a non-empty block has been encountered
-		 * means we are done.
-		 */
-		if (!ntfs_is_empty_recordp((le32*)kaddr))
-			logfile_is_empty = false;
-		else if (!logfile_is_empty)
-			break;
-		/*
-		 * A log record page means there cannot be a restart page after
-		 * this so no need to continue searching.
-		 */
-		if (ntfs_is_rcrd_recordp((le32*)kaddr))
-			break;
-		/* If not a (modified by chkdsk) restart page, continue. */
-		if (!ntfs_is_rstr_recordp((le32*)kaddr) &&
-				!ntfs_is_chkd_recordp((le32*)kaddr)) {
-			if (!pos)
-				pos = NTFS_BLOCK_SIZE >> 1;
-			continue;
-		}
-		/*
-		 * Check the (modified by chkdsk) restart page for consistency
-		 * and get a copy of the complete multi sector transfer
-		 * deprotected restart page.
-		 */
-		err = ntfs_check_and_load_restart_page(log_vi,
-				(RESTART_PAGE_HEADER*)kaddr, pos,
-				!rstr1_ph ? &rstr1_ph : &rstr2_ph,
-				!rstr1_ph ? &rstr1_lsn : &rstr2_lsn);
-		if (!err) {
-			/*
-			 * If we have now found the first (modified by chkdsk)
-			 * restart page, continue looking for the second one.
-			 */
-			if (!pos) {
-				pos = NTFS_BLOCK_SIZE >> 1;
-				continue;
-			}
-			/*
-			 * We have now found the second (modified by chkdsk)
-			 * restart page, so we can stop looking.
-			 */
-			break;
-		}
-		/*
-		 * Error output already done inside the function.  Note, we do
-		 * not abort if the restart page was invalid as we might still
-		 * find a valid one further in the file.
-		 */
-		if (err != -EINVAL) {
-			ntfs_unmap_page(page);
-			goto err_out;
-		}
-		/* Continue looking. */
-		if (!pos)
-			pos = NTFS_BLOCK_SIZE >> 1;
-	}
-	if (page)
-		ntfs_unmap_page(page);
-	if (logfile_is_empty) {
-		NVolSetLogFileEmpty(vol);
-is_empty:
-		ntfs_debug("Done.  ($LogFile is empty.)");
-		return true;
-	}
-	if (!rstr1_ph) {
-		BUG_ON(rstr2_ph);
-		ntfs_error(vol->sb, "Did not find any restart pages in "
-				"$LogFile and it was not empty.");
-		return false;
-	}
-	/* If both restart pages were found, use the more recent one. */
-	if (rstr2_ph) {
-		/*
-		 * If the second restart area is more recent, switch to it.
-		 * Otherwise just throw it away.
-		 */
-		if (rstr2_lsn > rstr1_lsn) {
-			ntfs_debug("Using second restart page as it is more "
-					"recent.");
-			ntfs_free(rstr1_ph);
-			rstr1_ph = rstr2_ph;
-			/* rstr1_lsn = rstr2_lsn; */
-		} else {
-			ntfs_debug("Using first restart page as it is more "
-					"recent.");
-			ntfs_free(rstr2_ph);
-		}
-		rstr2_ph = NULL;
-	}
-	/* All consistency checks passed. */
-	if (rp)
-		*rp = rstr1_ph;
-	else
-		ntfs_free(rstr1_ph);
-	ntfs_debug("Done.");
-	return true;
-err_out:
-	if (rstr1_ph)
-		ntfs_free(rstr1_ph);
-	return false;
-}
-
-/**
- * ntfs_is_logfile_clean - check in the journal if the volume is clean
- * @log_vi:	struct inode of loaded journal $LogFile to check
- * @rp:		copy of the current restart page
- *
- * Analyze the $LogFile journal and return 'true' if it indicates the volume was
- * shutdown cleanly and 'false' if not.
- *
- * At present we only look at the two restart pages and ignore the log record
- * pages.  This is a little bit crude in that there will be a very small number
- * of cases where we think that a volume is dirty when in fact it is clean.
- * This should only affect volumes that have not been shutdown cleanly but did
- * not have any pending, non-check-pointed i/o, i.e. they were completely idle
- * at least for the five seconds preceding the unclean shutdown.
- *
- * This function assumes that the $LogFile journal has already been consistency
- * checked by a call to ntfs_check_logfile() and in particular if the $LogFile
- * is empty this function requires that NVolLogFileEmpty() is true otherwise an
- * empty volume will be reported as dirty.
- */
-bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
-{
-	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
-	RESTART_AREA *ra;
-
-	ntfs_debug("Entering.");
-	/* An empty $LogFile must have been clean before it got emptied. */
-	if (NVolLogFileEmpty(vol)) {
-		ntfs_debug("Done.  ($LogFile is empty.)");
-		return true;
-	}
-	BUG_ON(!rp);
-	if (!ntfs_is_rstr_record(rp->magic) &&
-			!ntfs_is_chkd_record(rp->magic)) {
-		ntfs_error(vol->sb, "Restart page buffer is invalid.  This is "
-				"probably a bug in that the $LogFile should "
-				"have been consistency checked before calling "
-				"this function.");
-		return false;
-	}
-	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
-	/*
-	 * If the $LogFile has active clients, i.e. it is open, and we do not
-	 * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags,
-	 * we assume there was an unclean shutdown.
-	 */
-	if (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
-			!(ra->flags & RESTART_VOLUME_IS_CLEAN)) {
-		ntfs_debug("Done.  $LogFile indicates a dirty shutdown.");
-		return false;
-	}
-	/* $LogFile indicates a clean shutdown. */
-	ntfs_debug("Done.  $LogFile indicates a clean shutdown.");
-	return true;
-}
-
-/**
- * ntfs_empty_logfile - empty the contents of the $LogFile journal
- * @log_vi:	struct inode of loaded journal $LogFile to empty
- *
- * Empty the contents of the $LogFile journal @log_vi and return 'true' on
- * success and 'false' on error.
- *
- * This function assumes that the $LogFile journal has already been consistency
- * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean()
- * has been used to ensure that the $LogFile is clean.
- */
-bool ntfs_empty_logfile(struct inode *log_vi)
-{
-	VCN vcn, end_vcn;
-	ntfs_inode *log_ni = NTFS_I(log_vi);
-	ntfs_volume *vol = log_ni->vol;
-	struct super_block *sb = vol->sb;
-	runlist_element *rl;
-	unsigned long flags;
-	unsigned block_size, block_size_bits;
-	int err;
-	bool should_wait = true;
-
-	ntfs_debug("Entering.");
-	if (NVolLogFileEmpty(vol)) {
-		ntfs_debug("Done.");
-		return true;
-	}
-	/*
-	 * We cannot use ntfs_attr_set() because we may be still in the middle
-	 * of a mount operation.  Thus we do the emptying by hand by first
-	 * zapping the page cache pages for the $LogFile/$DATA attribute and
-	 * then emptying each of the buffers in each of the clusters specified
-	 * by the runlist by hand.
-	 */
-	block_size = sb->s_blocksize;
-	block_size_bits = sb->s_blocksize_bits;
-	vcn = 0;
-	read_lock_irqsave(&log_ni->size_lock, flags);
-	end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >>
-			vol->cluster_size_bits;
-	read_unlock_irqrestore(&log_ni->size_lock, flags);
-	truncate_inode_pages(log_vi->i_mapping, 0);
-	down_write(&log_ni->runlist.lock);
-	rl = log_ni->runlist.rl;
-	if (unlikely(!rl || vcn < rl->vcn || !rl->length)) {
-map_vcn:
-		err = ntfs_map_runlist_nolock(log_ni, vcn, NULL);
-		if (err) {
-			ntfs_error(sb, "Failed to map runlist fragment (error "
-					"%d).", -err);
-			goto err;
-		}
-		rl = log_ni->runlist.rl;
-		BUG_ON(!rl || vcn < rl->vcn || !rl->length);
-	}
-	/* Seek to the runlist element containing @vcn. */
-	while (rl->length && vcn >= rl[1].vcn)
-		rl++;
-	do {
-		LCN lcn;
-		sector_t block, end_block;
-		s64 len;
-
-		/*
-		 * If this run is not mapped map it now and start again as the
-		 * runlist will have been updated.
-		 */
-		lcn = rl->lcn;
-		if (unlikely(lcn == LCN_RL_NOT_MAPPED)) {
-			vcn = rl->vcn;
-			goto map_vcn;
-		}
-		/* If this run is not valid abort with an error. */
-		if (unlikely(!rl->length || lcn < LCN_HOLE))
-			goto rl_err;
-		/* Skip holes. */
-		if (lcn == LCN_HOLE)
-			continue;
-		block = lcn << vol->cluster_size_bits >> block_size_bits;
-		len = rl->length;
-		if (rl[1].vcn > end_vcn)
-			len = end_vcn - rl->vcn;
-		end_block = (lcn + len) << vol->cluster_size_bits >>
-				block_size_bits;
-		/* Iterate over the blocks in the run and empty them. */
-		do {
-			struct buffer_head *bh;
-
-			/* Obtain the buffer, possibly not uptodate. */
-			bh = sb_getblk(sb, block);
-			BUG_ON(!bh);
-			/* Setup buffer i/o submission. */
-			lock_buffer(bh);
-			bh->b_end_io = end_buffer_write_sync;
-			get_bh(bh);
-			/* Set the entire contents of the buffer to 0xff. */
-			memset(bh->b_data, -1, block_size);
-			if (!buffer_uptodate(bh))
-				set_buffer_uptodate(bh);
-			if (buffer_dirty(bh))
-				clear_buffer_dirty(bh);
-			/*
-			 * Submit the buffer and wait for i/o to complete but
-			 * only for the first buffer so we do not miss really
-			 * serious i/o errors.  Once the first buffer has
-			 * completed ignore errors afterwards as we can assume
-			 * that if one buffer worked all of them will work.
-			 */
-			submit_bh(REQ_OP_WRITE, bh);
-			if (should_wait) {
-				should_wait = false;
-				wait_on_buffer(bh);
-				if (unlikely(!buffer_uptodate(bh)))
-					goto io_err;
-			}
-			brelse(bh);
-		} while (++block < end_block);
-	} while ((++rl)->vcn < end_vcn);
-	up_write(&log_ni->runlist.lock);
-	/*
-	 * Zap the pages again just in case any got instantiated whilst we were
-	 * emptying the blocks by hand.  FIXME: We may not have completed
-	 * writing to all the buffer heads yet so this may happen too early.
-	 * We really should use a kernel thread to do the emptying
-	 * asynchronously and then we can also set the volume dirty and output
-	 * an error message if emptying should fail.
-	 */
-	truncate_inode_pages(log_vi->i_mapping, 0);
-	/* Set the flag so we do not have to do it again on remount. */
-	NVolSetLogFileEmpty(vol);
-	ntfs_debug("Done.");
-	return true;
-io_err:
-	ntfs_error(sb, "Failed to write buffer.  Unmount and run chkdsk.");
-	goto dirty_err;
-rl_err:
-	ntfs_error(sb, "Runlist is corrupt.  Unmount and run chkdsk.");
-dirty_err:
-	NVolSetErrors(vol);
-	err = -EIO;
-err:
-	up_write(&log_ni->runlist.lock);
-	ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).",
-			-err);
-	return false;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
deleted file mode 100644
index 429d4909cc72..000000000000
--- a/fs/ntfs/logfile.h
+++ /dev/null
@@ -1,295 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * logfile.h - Defines for NTFS kernel journal ($LogFile) handling.  Part of
- *	       the Linux-NTFS project.
- *
- * Copyright (c) 2000-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_LOGFILE_H
-#define _LINUX_NTFS_LOGFILE_H
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-
-#include "types.h"
-#include "endian.h"
-#include "layout.h"
-
-/*
- * Journal ($LogFile) organization:
- *
- * Two restart areas present in the first two pages (restart pages, one restart
- * area in each page).  When the volume is dismounted they should be identical,
- * except for the update sequence array which usually has a different update
- * sequence number.
- *
- * These are followed by log records organized in pages headed by a log record
- * header going up to log file size.  Not all pages contain log records when a
- * volume is first formatted, but as the volume ages, all records will be used.
- * When the log file fills up, the records at the beginning are purged (by
- * modifying the oldest_lsn to a higher value presumably) and writing begins
- * at the beginning of the file.  Effectively, the log file is viewed as a
- * circular entity.
- *
- * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept
- * versions <= 1.x, including 0.-1.  (Yes, that is a minus one in there!)  We
- * probably only want to support 1.1 as this seems to be the current version
- * and we don't know how that differs from the older versions.  The only
- * exception is if the journal is clean as marked by the two restart pages
- * then it doesn't matter whether we are on an earlier version.  We can just
- * reinitialize the logfile and start again with version 1.1.
- */
-
-/* Some $LogFile related constants. */
-#define MaxLogFileSize		0x100000000ULL
-#define DefaultLogPageSize	4096
-#define MinLogRecordPages	48
-
-/*
- * Log file restart page header (begins the restart area).
- */
-typedef struct {
-/*Ofs*/
-/*  0	NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
-/*  0*/	NTFS_RECORD_TYPE magic;	/* The magic is "RSTR". */
-/*  4*/	le16 usa_ofs;		/* See NTFS_RECORD definition in layout.h.
-				   When creating, set this to be immediately
-				   after this header structure (without any
-				   alignment). */
-/*  6*/	le16 usa_count;		/* See NTFS_RECORD definition in layout.h. */
-
-/*  8*/	leLSN chkdsk_lsn;	/* The last log file sequence number found by
-				   chkdsk.  Only used when the magic is changed
-				   to "CHKD".  Otherwise this is zero. */
-/* 16*/	le32 system_page_size;	/* Byte size of system pages when the log file
-				   was created, has to be >= 512 and a power of
-				   2.  Use this to calculate the required size
-				   of the usa (usa_count) and add it to usa_ofs.
-				   Then verify that the result is less than the
-				   value of the restart_area_offset. */
-/* 20*/	le32 log_page_size;	/* Byte size of log file pages, has to be >=
-				   512 and a power of 2.  The default is 4096
-				   and is used when the system page size is
-				   between 4096 and 8192.  Otherwise this is
-				   set to the system page size instead. */
-/* 24*/	le16 restart_area_offset;/* Byte offset from the start of this header to
-				   the RESTART_AREA.  Value has to be aligned
-				   to 8-byte boundary.  When creating, set this
-				   to be after the usa. */
-/* 26*/	sle16 minor_ver;	/* Log file minor version.  Only check if major
-				   version is 1. */
-/* 28*/	sle16 major_ver;	/* Log file major version.  We only support
-				   version 1.1. */
-/* sizeof() = 30 (0x1e) bytes */
-} __attribute__ ((__packed__)) RESTART_PAGE_HEADER;
-
-/*
- * Constant for the log client indices meaning that there are no client records
- * in this particular client array.  Also inside the client records themselves,
- * this means that there are no client records preceding or following this one.
- */
-#define LOGFILE_NO_CLIENT	cpu_to_le16(0xffff)
-#define LOGFILE_NO_CLIENT_CPU	0xffff
-
-/*
- * These are the so far known RESTART_AREA_* flags (16-bit) which contain
- * information about the log file in which they are present.
- */
-enum {
-	RESTART_VOLUME_IS_CLEAN	= cpu_to_le16(0x0002),
-	RESTART_SPACE_FILLER	= cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
-} __attribute__ ((__packed__));
-
-typedef le16 RESTART_AREA_FLAGS;
-
-/*
- * Log file restart area record.  The offset of this record is found by adding
- * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found
- * in it.  See notes at restart_area_offset above.
- */
-typedef struct {
-/*Ofs*/
-/*  0*/	leLSN current_lsn;	/* The current, i.e. last LSN inside the log
-				   when the restart area was last written.
-				   This happens often but what is the interval?
-				   Is it just fixed time or is it every time a
-				   check point is written or somethine else?
-				   On create set to 0. */
-/*  8*/	le16 log_clients;	/* Number of log client records in the array of
-				   log client records which follows this
-				   restart area.  Must be 1.  */
-/* 10*/	le16 client_free_list;	/* The index of the first free log client record
-				   in the array of log client records.
-				   LOGFILE_NO_CLIENT means that there are no
-				   free log client records in the array.
-				   If != LOGFILE_NO_CLIENT, check that
-				   log_clients > client_free_list.  On Win2k
-				   and presumably earlier, on a clean volume
-				   this is != LOGFILE_NO_CLIENT, and it should
-				   be 0, i.e. the first (and only) client
-				   record is free and thus the logfile is
-				   closed and hence clean.  A dirty volume
-				   would have left the logfile open and hence
-				   this would be LOGFILE_NO_CLIENT.  On WinXP
-				   and presumably later, the logfile is always
-				   open, even on clean shutdown so this should
-				   always be LOGFILE_NO_CLIENT. */
-/* 12*/	le16 client_in_use_list;/* The index of the first in-use log client
-				   record in the array of log client records.
-				   LOGFILE_NO_CLIENT means that there are no
-				   in-use log client records in the array.  If
-				   != LOGFILE_NO_CLIENT check that log_clients
-				   > client_in_use_list.  On Win2k and
-				   presumably earlier, on a clean volume this
-				   is LOGFILE_NO_CLIENT, i.e. there are no
-				   client records in use and thus the logfile
-				   is closed and hence clean.  A dirty volume
-				   would have left the logfile open and hence
-				   this would be != LOGFILE_NO_CLIENT, and it
-				   should be 0, i.e. the first (and only)
-				   client record is in use.  On WinXP and
-				   presumably later, the logfile is always
-				   open, even on clean shutdown so this should
-				   always be 0. */
-/* 14*/	RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour.  On Win2k
-				   and presumably earlier this is always 0.  On
-				   WinXP and presumably later, if the logfile
-				   was shutdown cleanly, the second bit,
-				   RESTART_VOLUME_IS_CLEAN, is set.  This bit
-				   is cleared when the volume is mounted by
-				   WinXP and set when the volume is dismounted,
-				   thus if the logfile is dirty, this bit is
-				   clear.  Thus we don't need to check the
-				   Windows version to determine if the logfile
-				   is clean.  Instead if the logfile is closed,
-				   we know it must be clean.  If it is open and
-				   this bit is set, we also know it must be
-				   clean.  If on the other hand the logfile is
-				   open and this bit is clear, we can be almost
-				   certain that the logfile is dirty. */
-/* 16*/	le32 seq_number_bits;	/* How many bits to use for the sequence
-				   number.  This is calculated as 67 - the
-				   number of bits required to store the logfile
-				   size in bytes and this can be used in with
-				   the specified file_size as a consistency
-				   check. */
-/* 20*/	le16 restart_area_length;/* Length of the restart area including the
-				   client array.  Following checks required if
-				   version matches.  Otherwise, skip them.
-				   restart_area_offset + restart_area_length
-				   has to be <= system_page_size.  Also,
-				   restart_area_length has to be >=
-				   client_array_offset + (log_clients *
-				   sizeof(log client record)). */
-/* 22*/	le16 client_array_offset;/* Offset from the start of this record to
-				   the first log client record if versions are
-				   matched.  When creating, set this to be
-				   after this restart area structure, aligned
-				   to 8-bytes boundary.  If the versions do not
-				   match, this is ignored and the offset is
-				   assumed to be (sizeof(RESTART_AREA) + 7) &
-				   ~7, i.e. rounded up to first 8-byte
-				   boundary.  Either way, client_array_offset
-				   has to be aligned to an 8-byte boundary.
-				   Also, restart_area_offset +
-				   client_array_offset has to be <= 510.
-				   Finally, client_array_offset + (log_clients
-				   * sizeof(log client record)) has to be <=
-				   system_page_size.  On Win2k and presumably
-				   earlier, this is 0x30, i.e. immediately
-				   following this record.  On WinXP and
-				   presumably later, this is 0x40, i.e. there
-				   are 16 extra bytes between this record and
-				   the client array.  This probably means that
-				   the RESTART_AREA record is actually bigger
-				   in WinXP and later. */
-/* 24*/	sle64 file_size;	/* Usable byte size of the log file.  If the
-				   restart_area_offset + the offset of the
-				   file_size are > 510 then corruption has
-				   occurred.  This is the very first check when
-				   starting with the restart_area as if it
-				   fails it means that some of the above values
-				   will be corrupted by the multi sector
-				   transfer protection.  The file_size has to
-				   be rounded down to be a multiple of the
-				   log_page_size in the RESTART_PAGE_HEADER and
-				   then it has to be at least big enough to
-				   store the two restart pages and 48 (0x30)
-				   log record pages. */
-/* 32*/	le32 last_lsn_data_length;/* Length of data of last LSN, not including
-				   the log record header.  On create set to
-				   0. */
-/* 36*/	le16 log_record_header_length;/* Byte size of the log record header.
-				   If the version matches then check that the
-				   value of log_record_header_length is a
-				   multiple of 8, i.e.
-				   (log_record_header_length + 7) & ~7 ==
-				   log_record_header_length.  When creating set
-				   it to sizeof(LOG_RECORD_HEADER), aligned to
-				   8 bytes. */
-/* 38*/	le16 log_page_data_offset;/* Offset to the start of data in a log record
-				   page.  Must be a multiple of 8.  On create
-				   set it to immediately after the update
-				   sequence array of the log record page. */
-/* 40*/	le32 restart_log_open_count;/* A counter that gets incremented every
-				   time the logfile is restarted which happens
-				   at mount time when the logfile is opened.
-				   When creating set to a random value.  Win2k
-				   sets it to the low 32 bits of the current
-				   system time in NTFS format (see time.h). */
-/* 44*/	le32 reserved;		/* Reserved/alignment to 8-byte boundary. */
-/* sizeof() = 48 (0x30) bytes */
-} __attribute__ ((__packed__)) RESTART_AREA;
-
-/*
- * Log client record.  The offset of this record is found by adding the offset
- * of the RESTART_AREA to the client_array_offset value found in it.
- */
-typedef struct {
-/*Ofs*/
-/*  0*/	leLSN oldest_lsn;	/* Oldest LSN needed by this client.  On create
-				   set to 0. */
-/*  8*/	leLSN client_restart_lsn;/* LSN at which this client needs to restart
-				   the volume, i.e. the current position within
-				   the log file.  At present, if clean this
-				   should = current_lsn in restart area but it
-				   probably also = current_lsn when dirty most
-				   of the time.  At create set to 0. */
-/* 16*/	le16 prev_client;	/* The offset to the previous log client record
-				   in the array of log client records.
-				   LOGFILE_NO_CLIENT means there is no previous
-				   client record, i.e. this is the first one.
-				   This is always LOGFILE_NO_CLIENT. */
-/* 18*/	le16 next_client;	/* The offset to the next log client record in
-				   the array of log client records.
-				   LOGFILE_NO_CLIENT means there are no next
-				   client records, i.e. this is the last one.
-				   This is always LOGFILE_NO_CLIENT. */
-/* 20*/	le16 seq_number;	/* On Win2k and presumably earlier, this is set
-				   to zero every time the logfile is restarted
-				   and it is incremented when the logfile is
-				   closed at dismount time.  Thus it is 0 when
-				   dirty and 1 when clean.  On WinXP and
-				   presumably later, this is always 0. */
-/* 22*/	u8 reserved[6];		/* Reserved/alignment. */
-/* 28*/	le32 client_name_length;/* Length of client name in bytes.  Should
-				   always be 8. */
-/* 32*/	ntfschar client_name[64];/* Name of the client in Unicode.  Should
-				   always be "NTFS" with the remaining bytes
-				   set to 0. */
-/* sizeof() = 160 (0xa0) bytes */
-} __attribute__ ((__packed__)) LOG_CLIENT_RECORD;
-
-extern bool ntfs_check_logfile(struct inode *log_vi,
-		RESTART_PAGE_HEADER **rp);
-
-extern bool ntfs_is_logfile_clean(struct inode *log_vi,
-		const RESTART_PAGE_HEADER *rp);
-
-extern bool ntfs_empty_logfile(struct inode *log_vi);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_LOGFILE_H */
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
deleted file mode 100644
index 7068425735f1..000000000000
--- a/fs/ntfs/malloc.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_MALLOC_H
-#define _LINUX_NTFS_MALLOC_H
-
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-
-/**
- * __ntfs_malloc - allocate memory in multiples of pages
- * @size:	number of bytes to allocate
- * @gfp_mask:	extra flags for the allocator
- *
- * Internal function.  You probably want ntfs_malloc_nofs()...
- *
- * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
- * returns a pointer to the allocated memory.
- *
- * If there was insufficient memory to complete the request, return NULL.
- * Depending on @gfp_mask the allocation may be guaranteed to succeed.
- */
-static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
-{
-	if (likely(size <= PAGE_SIZE)) {
-		BUG_ON(!size);
-		/* kmalloc() has per-CPU caches so is faster for now. */
-		return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
-		/* return (void *)__get_free_page(gfp_mask); */
-	}
-	if (likely((size >> PAGE_SHIFT) < totalram_pages()))
-		return __vmalloc(size, gfp_mask);
-	return NULL;
-}
-
-/**
- * ntfs_malloc_nofs - allocate memory in multiples of pages
- * @size:	number of bytes to allocate
- *
- * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
- * returns a pointer to the allocated memory.
- *
- * If there was insufficient memory to complete the request, return NULL.
- */
-static inline void *ntfs_malloc_nofs(unsigned long size)
-{
-	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM);
-}
-
-/**
- * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages
- * @size:	number of bytes to allocate
- *
- * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
- * returns a pointer to the allocated memory.
- *
- * This function guarantees that the allocation will succeed.  It will sleep
- * for as long as it takes to complete the allocation.
- *
- * If there was insufficient memory to complete the request, return NULL.
- */
-static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
-{
-	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
-}
-
-static inline void ntfs_free(void *addr)
-{
-	kvfree(addr);
-}
-
-#endif /* _LINUX_NTFS_MALLOC_H */
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
deleted file mode 100644
index 6fd1dc4b08c8..000000000000
--- a/fs/ntfs/mft.c
+++ /dev/null
@@ -1,2907 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/buffer_head.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/bio.h>
-
-#include "attrib.h"
-#include "aops.h"
-#include "bitmap.h"
-#include "debug.h"
-#include "dir.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "ntfs.h"
-
-#define MAX_BHS	(PAGE_SIZE / NTFS_BLOCK_SIZE)
-
-/**
- * map_mft_record_page - map the page in which a specific mft record resides
- * @ni:		ntfs inode whose mft record page to map
- *
- * This maps the page in which the mft record of the ntfs inode @ni is situated
- * and returns a pointer to the mft record within the mapped page.
- *
- * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
- * contains the negative error code returned.
- */
-static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
-{
-	loff_t i_size;
-	ntfs_volume *vol = ni->vol;
-	struct inode *mft_vi = vol->mft_ino;
-	struct page *page;
-	unsigned long index, end_index;
-	unsigned ofs;
-
-	BUG_ON(ni->page);
-	/*
-	 * The index into the page cache and the offset within the page cache
-	 * page of the wanted mft record. FIXME: We need to check for
-	 * overflowing the unsigned long, but I don't think we would ever get
-	 * here if the volume was that big...
-	 */
-	index = (u64)ni->mft_no << vol->mft_record_size_bits >>
-			PAGE_SHIFT;
-	ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
-
-	i_size = i_size_read(mft_vi);
-	/* The maximum valid index into the page cache for $MFT's data. */
-	end_index = i_size >> PAGE_SHIFT;
-
-	/* If the wanted index is out of bounds the mft record doesn't exist. */
-	if (unlikely(index >= end_index)) {
-		if (index > end_index || (i_size & ~PAGE_MASK) < ofs +
-				vol->mft_record_size) {
-			page = ERR_PTR(-ENOENT);
-			ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
-					"which is beyond the end of the mft.  "
-					"This is probably a bug in the ntfs "
-					"driver.", ni->mft_no);
-			goto err_out;
-		}
-	}
-	/* Read, map, and pin the page. */
-	page = ntfs_map_page(mft_vi->i_mapping, index);
-	if (!IS_ERR(page)) {
-		/* Catch multi sector transfer fixup errors. */
-		if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
-				ofs)))) {
-			ni->page = page;
-			ni->page_ofs = ofs;
-			return page_address(page) + ofs;
-		}
-		ntfs_error(vol->sb, "Mft record 0x%lx is corrupt.  "
-				"Run chkdsk.", ni->mft_no);
-		ntfs_unmap_page(page);
-		page = ERR_PTR(-EIO);
-		NVolSetErrors(vol);
-	}
-err_out:
-	ni->page = NULL;
-	ni->page_ofs = 0;
-	return (void*)page;
-}
-
-/**
- * map_mft_record - map, pin and lock an mft record
- * @ni:		ntfs inode whose MFT record to map
- *
- * First, take the mrec_lock mutex.  We might now be sleeping, while waiting
- * for the mutex if it was already locked by someone else.
- *
- * The page of the record is mapped using map_mft_record_page() before being
- * returned to the caller.
- *
- * This in turn uses ntfs_map_page() to get the page containing the wanted mft
- * record (it in turn calls read_cache_page() which reads it in from disk if
- * necessary, increments the use count on the page so that it cannot disappear
- * under us and returns a reference to the page cache page).
- *
- * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
- * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
- * and the post-read mst fixups on each mft record in the page have been
- * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
- * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
- * ntfs_map_page() waits for PG_locked to become clear and checks if
- * PG_uptodate is set and returns an error code if not. This provides
- * sufficient protection against races when reading/using the page.
- *
- * However there is the write mapping to think about. Doing the above described
- * checking here will be fine, because when initiating the write we will set
- * PG_locked and clear PG_uptodate making sure nobody is touching the page
- * contents. Doing the locking this way means that the commit to disk code in
- * the page cache code paths is automatically sufficiently locked with us as
- * we will not touch a page that has been locked or is not uptodate. The only
- * locking problem then is them locking the page while we are accessing it.
- *
- * So that code will end up having to own the mrec_lock of all mft
- * records/inodes present in the page before I/O can proceed. In that case we
- * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
- * accessing anything without owning the mrec_lock mutex.  But we do need to
- * use them because of the read_cache_page() invocation and the code becomes so
- * much simpler this way that it is well worth it.
- *
- * The mft record is now ours and we return a pointer to it. You need to check
- * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
- * the error code.
- *
- * NOTE: Caller is responsible for setting the mft record dirty before calling
- * unmap_mft_record(). This is obviously only necessary if the caller really
- * modified the mft record...
- * Q: Do we want to recycle one of the VFS inode state bits instead?
- * A: No, the inode ones mean we want to change the mft record, not we want to
- * write it out.
- */
-MFT_RECORD *map_mft_record(ntfs_inode *ni)
-{
-	MFT_RECORD *m;
-
-	ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
-
-	/* Make sure the ntfs inode doesn't go away. */
-	atomic_inc(&ni->count);
-
-	/* Serialize access to this mft record. */
-	mutex_lock(&ni->mrec_lock);
-
-	m = map_mft_record_page(ni);
-	if (!IS_ERR(m))
-		return m;
-
-	mutex_unlock(&ni->mrec_lock);
-	atomic_dec(&ni->count);
-	ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
-	return m;
-}
-
-/**
- * unmap_mft_record_page - unmap the page in which a specific mft record resides
- * @ni:		ntfs inode whose mft record page to unmap
- *
- * This unmaps the page in which the mft record of the ntfs inode @ni is
- * situated and returns. This is a NOOP if highmem is not configured.
- *
- * The unmap happens via ntfs_unmap_page() which in turn decrements the use
- * count on the page thus releasing it from the pinned state.
- *
- * We do not actually unmap the page from memory of course, as that will be
- * done by the page cache code itself when memory pressure increases or
- * whatever.
- */
-static inline void unmap_mft_record_page(ntfs_inode *ni)
-{
-	BUG_ON(!ni->page);
-
-	// TODO: If dirty, blah...
-	ntfs_unmap_page(ni->page);
-	ni->page = NULL;
-	ni->page_ofs = 0;
-	return;
-}
-
-/**
- * unmap_mft_record - release a mapped mft record
- * @ni:		ntfs inode whose MFT record to unmap
- *
- * We release the page mapping and the mrec_lock mutex which unmaps the mft
- * record and releases it for others to get hold of. We also release the ntfs
- * inode by decrementing the ntfs inode reference count.
- *
- * NOTE: If caller has modified the mft record, it is imperative to set the mft
- * record dirty BEFORE calling unmap_mft_record().
- */
-void unmap_mft_record(ntfs_inode *ni)
-{
-	struct page *page = ni->page;
-
-	BUG_ON(!page);
-
-	ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
-
-	unmap_mft_record_page(ni);
-	mutex_unlock(&ni->mrec_lock);
-	atomic_dec(&ni->count);
-	/*
-	 * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
-	 * ntfs_clear_extent_inode() in the extent inode case, and to the
-	 * caller in the non-extent, yet pure ntfs inode case, to do the actual
-	 * tear down of all structures and freeing of all allocated memory.
-	 */
-	return;
-}
-
-/**
- * map_extent_mft_record - load an extent inode and attach it to its base
- * @base_ni:	base ntfs inode
- * @mref:	mft reference of the extent inode to load
- * @ntfs_ino:	on successful return, pointer to the ntfs_inode structure
- *
- * Load the extent mft record @mref and attach it to its base inode @base_ni.
- * Return the mapped extent mft record if IS_ERR(result) is false.  Otherwise
- * PTR_ERR(result) gives the negative error code.
- *
- * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
- * structure of the mapped extent inode.
- */
-MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
-		ntfs_inode **ntfs_ino)
-{
-	MFT_RECORD *m;
-	ntfs_inode *ni = NULL;
-	ntfs_inode **extent_nis = NULL;
-	int i;
-	unsigned long mft_no = MREF(mref);
-	u16 seq_no = MSEQNO(mref);
-	bool destroy_ni = false;
-
-	ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
-			mft_no, base_ni->mft_no);
-	/* Make sure the base ntfs inode doesn't go away. */
-	atomic_inc(&base_ni->count);
-	/*
-	 * Check if this extent inode has already been added to the base inode,
-	 * in which case just return it. If not found, add it to the base
-	 * inode before returning it.
-	 */
-	mutex_lock(&base_ni->extent_lock);
-	if (base_ni->nr_extents > 0) {
-		extent_nis = base_ni->ext.extent_ntfs_inos;
-		for (i = 0; i < base_ni->nr_extents; i++) {
-			if (mft_no != extent_nis[i]->mft_no)
-				continue;
-			ni = extent_nis[i];
-			/* Make sure the ntfs inode doesn't go away. */
-			atomic_inc(&ni->count);
-			break;
-		}
-	}
-	if (likely(ni != NULL)) {
-		mutex_unlock(&base_ni->extent_lock);
-		atomic_dec(&base_ni->count);
-		/* We found the record; just have to map and return it. */
-		m = map_mft_record(ni);
-		/* map_mft_record() has incremented this on success. */
-		atomic_dec(&ni->count);
-		if (!IS_ERR(m)) {
-			/* Verify the sequence number. */
-			if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
-				ntfs_debug("Done 1.");
-				*ntfs_ino = ni;
-				return m;
-			}
-			unmap_mft_record(ni);
-			ntfs_error(base_ni->vol->sb, "Found stale extent mft "
-					"reference! Corrupt filesystem. "
-					"Run chkdsk.");
-			return ERR_PTR(-EIO);
-		}
-map_err_out:
-		ntfs_error(base_ni->vol->sb, "Failed to map extent "
-				"mft record, error code %ld.", -PTR_ERR(m));
-		return m;
-	}
-	/* Record wasn't there. Get a new ntfs inode and initialize it. */
-	ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
-	if (unlikely(!ni)) {
-		mutex_unlock(&base_ni->extent_lock);
-		atomic_dec(&base_ni->count);
-		return ERR_PTR(-ENOMEM);
-	}
-	ni->vol = base_ni->vol;
-	ni->seq_no = seq_no;
-	ni->nr_extents = -1;
-	ni->ext.base_ntfs_ino = base_ni;
-	/* Now map the record. */
-	m = map_mft_record(ni);
-	if (IS_ERR(m)) {
-		mutex_unlock(&base_ni->extent_lock);
-		atomic_dec(&base_ni->count);
-		ntfs_clear_extent_inode(ni);
-		goto map_err_out;
-	}
-	/* Verify the sequence number if it is present. */
-	if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
-		ntfs_error(base_ni->vol->sb, "Found stale extent mft "
-				"reference! Corrupt filesystem. Run chkdsk.");
-		destroy_ni = true;
-		m = ERR_PTR(-EIO);
-		goto unm_err_out;
-	}
-	/* Attach extent inode to base inode, reallocating memory if needed. */
-	if (!(base_ni->nr_extents & 3)) {
-		ntfs_inode **tmp;
-		int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
-
-		tmp = kmalloc(new_size, GFP_NOFS);
-		if (unlikely(!tmp)) {
-			ntfs_error(base_ni->vol->sb, "Failed to allocate "
-					"internal buffer.");
-			destroy_ni = true;
-			m = ERR_PTR(-ENOMEM);
-			goto unm_err_out;
-		}
-		if (base_ni->nr_extents) {
-			BUG_ON(!base_ni->ext.extent_ntfs_inos);
-			memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
-					4 * sizeof(ntfs_inode *));
-			kfree(base_ni->ext.extent_ntfs_inos);
-		}
-		base_ni->ext.extent_ntfs_inos = tmp;
-	}
-	base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
-	mutex_unlock(&base_ni->extent_lock);
-	atomic_dec(&base_ni->count);
-	ntfs_debug("Done 2.");
-	*ntfs_ino = ni;
-	return m;
-unm_err_out:
-	unmap_mft_record(ni);
-	mutex_unlock(&base_ni->extent_lock);
-	atomic_dec(&base_ni->count);
-	/*
-	 * If the extent inode was not attached to the base inode we need to
-	 * release it or we will leak memory.
-	 */
-	if (destroy_ni)
-		ntfs_clear_extent_inode(ni);
-	return m;
-}
-
-#ifdef NTFS_RW
-
-/**
- * __mark_mft_record_dirty - set the mft record and the page containing it dirty
- * @ni:		ntfs inode describing the mapped mft record
- *
- * Internal function.  Users should call mark_mft_record_dirty() instead.
- *
- * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
- * as well as the page containing the mft record, dirty.  Also, mark the base
- * vfs inode dirty.  This ensures that any changes to the mft record are
- * written out to disk.
- *
- * NOTE:  We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
- * on the base vfs inode, because even though file data may have been modified,
- * it is dirty in the inode meta data rather than the data page cache of the
- * inode, and thus there are no data pages that need writing out.  Therefore, a
- * full mark_inode_dirty() is overkill.  A mark_inode_dirty_sync(), on the
- * other hand, is not sufficient, because ->write_inode needs to be called even
- * in case of fdatasync. This needs to happen or the file data would not
- * necessarily hit the device synchronously, even though the vfs inode has the
- * O_SYNC flag set.  Also, I_DIRTY_DATASYNC simply "feels" better than just
- * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
- * which is not what I_DIRTY_SYNC on its own would suggest.
- */
-void __mark_mft_record_dirty(ntfs_inode *ni)
-{
-	ntfs_inode *base_ni;
-
-	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
-	BUG_ON(NInoAttr(ni));
-	mark_ntfs_record_dirty(ni->page, ni->page_ofs);
-	/* Determine the base vfs inode and mark it dirty, too. */
-	mutex_lock(&ni->extent_lock);
-	if (likely(ni->nr_extents >= 0))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	mutex_unlock(&ni->extent_lock);
-	__mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC);
-}
-
-static const char *ntfs_please_email = "Please email "
-		"linux-ntfs-dev@lists.sourceforge.net and say that you saw "
-		"this message.  Thank you.";
-
-/**
- * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror
- * @vol:	ntfs volume on which the mft record to synchronize resides
- * @mft_no:	mft record number of mft record to synchronize
- * @m:		mapped, mst protected (extent) mft record to synchronize
- *
- * Write the mapped, mst protected (extent) mft record @m with mft record
- * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol,
- * bypassing the page cache and the $MFTMirr inode itself.
- *
- * This function is only for use at umount time when the mft mirror inode has
- * already been disposed off.  We BUG() if we are called while the mft mirror
- * inode is still attached to the volume.
- *
- * On success return 0.  On error return -errno.
- *
- * NOTE:  This function is not implemented yet as I am not convinced it can
- * actually be triggered considering the sequence of commits we do in super.c::
- * ntfs_put_super().  But just in case we provide this place holder as the
- * alternative would be either to BUG() or to get a NULL pointer dereference
- * and Oops.
- */
-static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol,
-		const unsigned long mft_no, MFT_RECORD *m)
-{
-	BUG_ON(vol->mftmirr_ino);
-	ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
-			"implemented yet.  %s", ntfs_please_email);
-	return -EOPNOTSUPP;
-}
-
-/**
- * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
- * @vol:	ntfs volume on which the mft record to synchronize resides
- * @mft_no:	mft record number of mft record to synchronize
- * @m:		mapped, mst protected (extent) mft record to synchronize
- * @sync:	if true, wait for i/o completion
- *
- * Write the mapped, mst protected (extent) mft record @m with mft record
- * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
- *
- * On success return 0.  On error return -errno and set the volume errors flag
- * in the ntfs volume @vol.
- *
- * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
- *
- * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
- * schedule i/o via ->writepage or do it via kntfsd or whatever.
- */
-int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
-		MFT_RECORD *m, int sync)
-{
-	struct page *page;
-	unsigned int blocksize = vol->sb->s_blocksize;
-	int max_bhs = vol->mft_record_size / blocksize;
-	struct buffer_head *bhs[MAX_BHS];
-	struct buffer_head *bh, *head;
-	u8 *kmirr;
-	runlist_element *rl;
-	unsigned int block_start, block_end, m_start, m_end, page_ofs;
-	int i_bhs, nr_bhs, err = 0;
-	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
-
-	ntfs_debug("Entering for inode 0x%lx.", mft_no);
-	BUG_ON(!max_bhs);
-	if (WARN_ON(max_bhs > MAX_BHS))
-		return -EINVAL;
-	if (unlikely(!vol->mftmirr_ino)) {
-		/* This could happen during umount... */
-		err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
-		if (likely(!err))
-			return err;
-		goto err_out;
-	}
-	/* Get the page containing the mirror copy of the mft record @m. */
-	page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
-			(PAGE_SHIFT - vol->mft_record_size_bits));
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Failed to map mft mirror page.");
-		err = PTR_ERR(page);
-		goto err_out;
-	}
-	lock_page(page);
-	BUG_ON(!PageUptodate(page));
-	ClearPageUptodate(page);
-	/* Offset of the mft mirror record inside the page. */
-	page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
-	/* The address in the page of the mirror copy of the mft record @m. */
-	kmirr = page_address(page) + page_ofs;
-	/* Copy the mst protected mft record to the mirror. */
-	memcpy(kmirr, m, vol->mft_record_size);
-	/* Create uptodate buffers if not present. */
-	if (unlikely(!page_has_buffers(page))) {
-		struct buffer_head *tail;
-
-		bh = head = alloc_page_buffers(page, blocksize, true);
-		do {
-			set_buffer_uptodate(bh);
-			tail = bh;
-			bh = bh->b_this_page;
-		} while (bh);
-		tail->b_this_page = head;
-		attach_page_private(page, head);
-	}
-	bh = head = page_buffers(page);
-	BUG_ON(!bh);
-	rl = NULL;
-	nr_bhs = 0;
-	block_start = 0;
-	m_start = kmirr - (u8*)page_address(page);
-	m_end = m_start + vol->mft_record_size;
-	do {
-		block_end = block_start + blocksize;
-		/* If the buffer is outside the mft record, skip it. */
-		if (block_end <= m_start)
-			continue;
-		if (unlikely(block_start >= m_end))
-			break;
-		/* Need to map the buffer if it is not mapped already. */
-		if (unlikely(!buffer_mapped(bh))) {
-			VCN vcn;
-			LCN lcn;
-			unsigned int vcn_ofs;
-
-			bh->b_bdev = vol->sb->s_bdev;
-			/* Obtain the vcn and offset of the current block. */
-			vcn = ((VCN)mft_no << vol->mft_record_size_bits) +
-					(block_start - m_start);
-			vcn_ofs = vcn & vol->cluster_size_mask;
-			vcn >>= vol->cluster_size_bits;
-			if (!rl) {
-				down_read(&NTFS_I(vol->mftmirr_ino)->
-						runlist.lock);
-				rl = NTFS_I(vol->mftmirr_ino)->runlist.rl;
-				/*
-				 * $MFTMirr always has the whole of its runlist
-				 * in memory.
-				 */
-				BUG_ON(!rl);
-			}
-			/* Seek to element containing target vcn. */
-			while (rl->length && rl[1].vcn <= vcn)
-				rl++;
-			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-			/* For $MFTMirr, only lcn >= 0 is a successful remap. */
-			if (likely(lcn >= 0)) {
-				/* Setup buffer head to correct block. */
-				bh->b_blocknr = ((lcn <<
-						vol->cluster_size_bits) +
-						vcn_ofs) >> blocksize_bits;
-				set_buffer_mapped(bh);
-			} else {
-				bh->b_blocknr = -1;
-				ntfs_error(vol->sb, "Cannot write mft mirror "
-						"record 0x%lx because its "
-						"location on disk could not "
-						"be determined (error code "
-						"%lli).", mft_no,
-						(long long)lcn);
-				err = -EIO;
-			}
-		}
-		BUG_ON(!buffer_uptodate(bh));
-		BUG_ON(!nr_bhs && (m_start != block_start));
-		BUG_ON(nr_bhs >= max_bhs);
-		bhs[nr_bhs++] = bh;
-		BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
-	} while (block_start = block_end, (bh = bh->b_this_page) != head);
-	if (unlikely(rl))
-		up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock);
-	if (likely(!err)) {
-		/* Lock buffers and start synchronous write i/o on them. */
-		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
-			struct buffer_head *tbh = bhs[i_bhs];
-
-			if (!trylock_buffer(tbh))
-				BUG();
-			BUG_ON(!buffer_uptodate(tbh));
-			clear_buffer_dirty(tbh);
-			get_bh(tbh);
-			tbh->b_end_io = end_buffer_write_sync;
-			submit_bh(REQ_OP_WRITE, tbh);
-		}
-		/* Wait on i/o completion of buffers. */
-		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
-			struct buffer_head *tbh = bhs[i_bhs];
-
-			wait_on_buffer(tbh);
-			if (unlikely(!buffer_uptodate(tbh))) {
-				err = -EIO;
-				/*
-				 * Set the buffer uptodate so the page and
-				 * buffer states do not become out of sync.
-				 */
-				set_buffer_uptodate(tbh);
-			}
-		}
-	} else /* if (unlikely(err)) */ {
-		/* Clean the buffers. */
-		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
-			clear_buffer_dirty(bhs[i_bhs]);
-	}
-	/* Current state: all buffers are clean, unlocked, and uptodate. */
-	/* Remove the mst protection fixups again. */
-	post_write_mst_fixup((NTFS_RECORD*)kmirr);
-	flush_dcache_page(page);
-	SetPageUptodate(page);
-	unlock_page(page);
-	ntfs_unmap_page(page);
-	if (likely(!err)) {
-		ntfs_debug("Done.");
-	} else {
-		ntfs_error(vol->sb, "I/O error while writing mft mirror "
-				"record 0x%lx!", mft_no);
-err_out:
-		ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error "
-				"code %i).  Volume will be left marked dirty "
-				"on umount.  Run ntfsfix on the partition "
-				"after umounting to correct this.", -err);
-		NVolSetErrors(vol);
-	}
-	return err;
-}
-
-/**
- * write_mft_record_nolock - write out a mapped (extent) mft record
- * @ni:		ntfs inode describing the mapped (extent) mft record
- * @m:		mapped (extent) mft record to write
- * @sync:	if true, wait for i/o completion
- *
- * Write the mapped (extent) mft record @m described by the (regular or extent)
- * ntfs inode @ni to backing store.  If the mft record @m has a counterpart in
- * the mft mirror, that is also updated.
- *
- * We only write the mft record if the ntfs inode @ni is dirty and the first
- * buffer belonging to its mft record is dirty, too.  We ignore the dirty state
- * of subsequent buffers because we could have raced with
- * fs/ntfs/aops.c::mark_ntfs_record_dirty().
- *
- * On success, clean the mft record and return 0.  On error, leave the mft
- * record dirty and return -errno.
- *
- * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
- * However, if the mft record has a counterpart in the mft mirror and @sync is
- * true, we write the mft record, wait for i/o completion, and only then write
- * the mft mirror copy.  This ensures that if the system crashes either the mft
- * or the mft mirror will contain a self-consistent mft record @m.  If @sync is
- * false on the other hand, we start i/o on both and then wait for completion
- * on them.  This provides a speedup but no longer guarantees that you will end
- * up with a self-consistent mft record in the case of a crash but if you asked
- * for asynchronous writing you probably do not care about that anyway.
- *
- * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
- * schedule i/o via ->writepage or do it via kntfsd or whatever.
- */
-int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
-{
-	ntfs_volume *vol = ni->vol;
-	struct page *page = ni->page;
-	unsigned int blocksize = vol->sb->s_blocksize;
-	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
-	int max_bhs = vol->mft_record_size / blocksize;
-	struct buffer_head *bhs[MAX_BHS];
-	struct buffer_head *bh, *head;
-	runlist_element *rl;
-	unsigned int block_start, block_end, m_start, m_end;
-	int i_bhs, nr_bhs, err = 0;
-
-	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
-	BUG_ON(NInoAttr(ni));
-	BUG_ON(!max_bhs);
-	BUG_ON(!PageLocked(page));
-	if (WARN_ON(max_bhs > MAX_BHS)) {
-		err = -EINVAL;
-		goto err_out;
-	}
-	/*
-	 * If the ntfs_inode is clean no need to do anything.  If it is dirty,
-	 * mark it as clean now so that it can be redirtied later on if needed.
-	 * There is no danger of races since the caller is holding the locks
-	 * for the mft record @m and the page it is in.
-	 */
-	if (!NInoTestClearDirty(ni))
-		goto done;
-	bh = head = page_buffers(page);
-	BUG_ON(!bh);
-	rl = NULL;
-	nr_bhs = 0;
-	block_start = 0;
-	m_start = ni->page_ofs;
-	m_end = m_start + vol->mft_record_size;
-	do {
-		block_end = block_start + blocksize;
-		/* If the buffer is outside the mft record, skip it. */
-		if (block_end <= m_start)
-			continue;
-		if (unlikely(block_start >= m_end))
-			break;
-		/*
-		 * If this block is not the first one in the record, we ignore
-		 * the buffer's dirty state because we could have raced with a
-		 * parallel mark_ntfs_record_dirty().
-		 */
-		if (block_start == m_start) {
-			/* This block is the first one in the record. */
-			if (!buffer_dirty(bh)) {
-				BUG_ON(nr_bhs);
-				/* Clean records are not written out. */
-				break;
-			}
-		}
-		/* Need to map the buffer if it is not mapped already. */
-		if (unlikely(!buffer_mapped(bh))) {
-			VCN vcn;
-			LCN lcn;
-			unsigned int vcn_ofs;
-
-			bh->b_bdev = vol->sb->s_bdev;
-			/* Obtain the vcn and offset of the current block. */
-			vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) +
-					(block_start - m_start);
-			vcn_ofs = vcn & vol->cluster_size_mask;
-			vcn >>= vol->cluster_size_bits;
-			if (!rl) {
-				down_read(&NTFS_I(vol->mft_ino)->runlist.lock);
-				rl = NTFS_I(vol->mft_ino)->runlist.rl;
-				BUG_ON(!rl);
-			}
-			/* Seek to element containing target vcn. */
-			while (rl->length && rl[1].vcn <= vcn)
-				rl++;
-			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-			/* For $MFT, only lcn >= 0 is a successful remap. */
-			if (likely(lcn >= 0)) {
-				/* Setup buffer head to correct block. */
-				bh->b_blocknr = ((lcn <<
-						vol->cluster_size_bits) +
-						vcn_ofs) >> blocksize_bits;
-				set_buffer_mapped(bh);
-			} else {
-				bh->b_blocknr = -1;
-				ntfs_error(vol->sb, "Cannot write mft record "
-						"0x%lx because its location "
-						"on disk could not be "
-						"determined (error code %lli).",
-						ni->mft_no, (long long)lcn);
-				err = -EIO;
-			}
-		}
-		BUG_ON(!buffer_uptodate(bh));
-		BUG_ON(!nr_bhs && (m_start != block_start));
-		BUG_ON(nr_bhs >= max_bhs);
-		bhs[nr_bhs++] = bh;
-		BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
-	} while (block_start = block_end, (bh = bh->b_this_page) != head);
-	if (unlikely(rl))
-		up_read(&NTFS_I(vol->mft_ino)->runlist.lock);
-	if (!nr_bhs)
-		goto done;
-	if (unlikely(err))
-		goto cleanup_out;
-	/* Apply the mst protection fixups. */
-	err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
-	if (err) {
-		ntfs_error(vol->sb, "Failed to apply mst fixups!");
-		goto cleanup_out;
-	}
-	flush_dcache_mft_record_page(ni);
-	/* Lock buffers and start synchronous write i/o on them. */
-	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
-		struct buffer_head *tbh = bhs[i_bhs];
-
-		if (!trylock_buffer(tbh))
-			BUG();
-		BUG_ON(!buffer_uptodate(tbh));
-		clear_buffer_dirty(tbh);
-		get_bh(tbh);
-		tbh->b_end_io = end_buffer_write_sync;
-		submit_bh(REQ_OP_WRITE, tbh);
-	}
-	/* Synchronize the mft mirror now if not @sync. */
-	if (!sync && ni->mft_no < vol->mftmirr_size)
-		ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
-	/* Wait on i/o completion of buffers. */
-	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
-		struct buffer_head *tbh = bhs[i_bhs];
-
-		wait_on_buffer(tbh);
-		if (unlikely(!buffer_uptodate(tbh))) {
-			err = -EIO;
-			/*
-			 * Set the buffer uptodate so the page and buffer
-			 * states do not become out of sync.
-			 */
-			if (PageUptodate(page))
-				set_buffer_uptodate(tbh);
-		}
-	}
-	/* If @sync, now synchronize the mft mirror. */
-	if (sync && ni->mft_no < vol->mftmirr_size)
-		ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
-	/* Remove the mst protection fixups again. */
-	post_write_mst_fixup((NTFS_RECORD*)m);
-	flush_dcache_mft_record_page(ni);
-	if (unlikely(err)) {
-		/* I/O error during writing.  This is really bad! */
-		ntfs_error(vol->sb, "I/O error while writing mft record "
-				"0x%lx!  Marking base inode as bad.  You "
-				"should unmount the volume and run chkdsk.",
-				ni->mft_no);
-		goto err_out;
-	}
-done:
-	ntfs_debug("Done.");
-	return 0;
-cleanup_out:
-	/* Clean the buffers. */
-	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
-		clear_buffer_dirty(bhs[i_bhs]);
-err_out:
-	/*
-	 * Current state: all buffers are clean, unlocked, and uptodate.
-	 * The caller should mark the base inode as bad so that no more i/o
-	 * happens.  ->clear_inode() will still be invoked so all extent inodes
-	 * and other allocated memory will be freed.
-	 */
-	if (err == -ENOMEM) {
-		ntfs_error(vol->sb, "Not enough memory to write mft record.  "
-				"Redirtying so the write is retried later.");
-		mark_mft_record_dirty(ni);
-		err = 0;
-	} else
-		NVolSetErrors(vol);
-	return err;
-}
-
-/**
- * ntfs_may_write_mft_record - check if an mft record may be written out
- * @vol:	[IN]  ntfs volume on which the mft record to check resides
- * @mft_no:	[IN]  mft record number of the mft record to check
- * @m:		[IN]  mapped mft record to check
- * @locked_ni:	[OUT] caller has to unlock this ntfs inode if one is returned
- *
- * Check if the mapped (base or extent) mft record @m with mft record number
- * @mft_no belonging to the ntfs volume @vol may be written out.  If necessary
- * and possible the ntfs inode of the mft record is locked and the base vfs
- * inode is pinned.  The locked ntfs inode is then returned in @locked_ni.  The
- * caller is responsible for unlocking the ntfs inode and unpinning the base
- * vfs inode.
- *
- * Return 'true' if the mft record may be written out and 'false' if not.
- *
- * The caller has locked the page and cleared the uptodate flag on it which
- * means that we can safely write out any dirty mft records that do not have
- * their inodes in icache as determined by ilookup5() as anyone
- * opening/creating such an inode would block when attempting to map the mft
- * record in read_cache_page() until we are finished with the write out.
- *
- * Here is a description of the tests we perform:
- *
- * If the inode is found in icache we know the mft record must be a base mft
- * record.  If it is dirty, we do not write it and return 'false' as the vfs
- * inode write paths will result in the access times being updated which would
- * cause the base mft record to be redirtied and written out again.  (We know
- * the access time update will modify the base mft record because Windows
- * chkdsk complains if the standard information attribute is not in the base
- * mft record.)
- *
- * If the inode is in icache and not dirty, we attempt to lock the mft record
- * and if we find the lock was already taken, it is not safe to write the mft
- * record and we return 'false'.
- *
- * If we manage to obtain the lock we have exclusive access to the mft record,
- * which also allows us safe writeout of the mft record.  We then set
- * @locked_ni to the locked ntfs inode and return 'true'.
- *
- * Note we cannot just lock the mft record and sleep while waiting for the lock
- * because this would deadlock due to lock reversal (normally the mft record is
- * locked before the page is locked but we already have the page locked here
- * when we try to lock the mft record).
- *
- * If the inode is not in icache we need to perform further checks.
- *
- * If the mft record is not a FILE record or it is a base mft record, we can
- * safely write it and return 'true'.
- *
- * We now know the mft record is an extent mft record.  We check if the inode
- * corresponding to its base mft record is in icache and obtain a reference to
- * it if it is.  If it is not, we can safely write it and return 'true'.
- *
- * We now have the base inode for the extent mft record.  We check if it has an
- * ntfs inode for the extent mft record attached and if not it is safe to write
- * the extent mft record and we return 'true'.
- *
- * The ntfs inode for the extent mft record is attached to the base inode so we
- * attempt to lock the extent mft record and if we find the lock was already
- * taken, it is not safe to write the extent mft record and we return 'false'.
- *
- * If we manage to obtain the lock we have exclusive access to the extent mft
- * record, which also allows us safe writeout of the extent mft record.  We
- * set the ntfs inode of the extent mft record clean and then set @locked_ni to
- * the now locked ntfs inode and return 'true'.
- *
- * Note, the reason for actually writing dirty mft records here and not just
- * relying on the vfs inode dirty code paths is that we can have mft records
- * modified without them ever having actual inodes in memory.  Also we can have
- * dirty mft records with clean ntfs inodes in memory.  None of the described
- * cases would result in the dirty mft records being written out if we only
- * relied on the vfs inode dirty code paths.  And these cases can really occur
- * during allocation of new mft records and in particular when the
- * initialized_size of the $MFT/$DATA attribute is extended and the new space
- * is initialized using ntfs_mft_record_format().  The clean inode can then
- * appear if the mft record is reused for a new inode before it got written
- * out.
- */
-bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
-		const MFT_RECORD *m, ntfs_inode **locked_ni)
-{
-	struct super_block *sb = vol->sb;
-	struct inode *mft_vi = vol->mft_ino;
-	struct inode *vi;
-	ntfs_inode *ni, *eni, **extent_nis;
-	int i;
-	ntfs_attr na;
-
-	ntfs_debug("Entering for inode 0x%lx.", mft_no);
-	/*
-	 * Normally we do not return a locked inode so set @locked_ni to NULL.
-	 */
-	BUG_ON(!locked_ni);
-	*locked_ni = NULL;
-	/*
-	 * Check if the inode corresponding to this mft record is in the VFS
-	 * inode cache and obtain a reference to it if it is.
-	 */
-	ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
-	na.mft_no = mft_no;
-	na.name = NULL;
-	na.name_len = 0;
-	na.type = AT_UNUSED;
-	/*
-	 * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
-	 * we get here for it rather often.
-	 */
-	if (!mft_no) {
-		/* Balance the below iput(). */
-		vi = igrab(mft_vi);
-		BUG_ON(vi != mft_vi);
-	} else {
-		/*
-		 * Have to use ilookup5_nowait() since ilookup5() waits for the
-		 * inode lock which causes ntfs to deadlock when a concurrent
-		 * inode write via the inode dirty code paths and the page
-		 * dirty code path of the inode dirty code path when writing
-		 * $MFT occurs.
-		 */
-		vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na);
-	}
-	if (vi) {
-		ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
-		/* The inode is in icache. */
-		ni = NTFS_I(vi);
-		/* Take a reference to the ntfs inode. */
-		atomic_inc(&ni->count);
-		/* If the inode is dirty, do not write this record. */
-		if (NInoDirty(ni)) {
-			ntfs_debug("Inode 0x%lx is dirty, do not write it.",
-					mft_no);
-			atomic_dec(&ni->count);
-			iput(vi);
-			return false;
-		}
-		ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
-		/* The inode is not dirty, try to take the mft record lock. */
-		if (unlikely(!mutex_trylock(&ni->mrec_lock))) {
-			ntfs_debug("Mft record 0x%lx is already locked, do "
-					"not write it.", mft_no);
-			atomic_dec(&ni->count);
-			iput(vi);
-			return false;
-		}
-		ntfs_debug("Managed to lock mft record 0x%lx, write it.",
-				mft_no);
-		/*
-		 * The write has to occur while we hold the mft record lock so
-		 * return the locked ntfs inode.
-		 */
-		*locked_ni = ni;
-		return true;
-	}
-	ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
-	/* The inode is not in icache. */
-	/* Write the record if it is not a mft record (type "FILE"). */
-	if (!ntfs_is_mft_record(m->magic)) {
-		ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
-				mft_no);
-		return true;
-	}
-	/* Write the mft record if it is a base inode. */
-	if (!m->base_mft_record) {
-		ntfs_debug("Mft record 0x%lx is a base record, write it.",
-				mft_no);
-		return true;
-	}
-	/*
-	 * This is an extent mft record.  Check if the inode corresponding to
-	 * its base mft record is in icache and obtain a reference to it if it
-	 * is.
-	 */
-	na.mft_no = MREF_LE(m->base_mft_record);
-	ntfs_debug("Mft record 0x%lx is an extent record.  Looking for base "
-			"inode 0x%lx in icache.", mft_no, na.mft_no);
-	if (!na.mft_no) {
-		/* Balance the below iput(). */
-		vi = igrab(mft_vi);
-		BUG_ON(vi != mft_vi);
-	} else
-		vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode,
-				&na);
-	if (!vi) {
-		/*
-		 * The base inode is not in icache, write this extent mft
-		 * record.
-		 */
-		ntfs_debug("Base inode 0x%lx is not in icache, write the "
-				"extent record.", na.mft_no);
-		return true;
-	}
-	ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
-	/*
-	 * The base inode is in icache.  Check if it has the extent inode
-	 * corresponding to this extent mft record attached.
-	 */
-	ni = NTFS_I(vi);
-	mutex_lock(&ni->extent_lock);
-	if (ni->nr_extents <= 0) {
-		/*
-		 * The base inode has no attached extent inodes, write this
-		 * extent mft record.
-		 */
-		mutex_unlock(&ni->extent_lock);
-		iput(vi);
-		ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
-				"write the extent record.", na.mft_no);
-		return true;
-	}
-	/* Iterate over the attached extent inodes. */
-	extent_nis = ni->ext.extent_ntfs_inos;
-	for (eni = NULL, i = 0; i < ni->nr_extents; ++i) {
-		if (mft_no == extent_nis[i]->mft_no) {
-			/*
-			 * Found the extent inode corresponding to this extent
-			 * mft record.
-			 */
-			eni = extent_nis[i];
-			break;
-		}
-	}
-	/*
-	 * If the extent inode was not attached to the base inode, write this
-	 * extent mft record.
-	 */
-	if (!eni) {
-		mutex_unlock(&ni->extent_lock);
-		iput(vi);
-		ntfs_debug("Extent inode 0x%lx is not attached to its base "
-				"inode 0x%lx, write the extent record.",
-				mft_no, na.mft_no);
-		return true;
-	}
-	ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
-			mft_no, na.mft_no);
-	/* Take a reference to the extent ntfs inode. */
-	atomic_inc(&eni->count);
-	mutex_unlock(&ni->extent_lock);
-	/*
-	 * Found the extent inode coresponding to this extent mft record.
-	 * Try to take the mft record lock.
-	 */
-	if (unlikely(!mutex_trylock(&eni->mrec_lock))) {
-		atomic_dec(&eni->count);
-		iput(vi);
-		ntfs_debug("Extent mft record 0x%lx is already locked, do "
-				"not write it.", mft_no);
-		return false;
-	}
-	ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
-			mft_no);
-	if (NInoTestClearDirty(eni))
-		ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.",
-				mft_no);
-	/*
-	 * The write has to occur while we hold the mft record lock so return
-	 * the locked extent ntfs inode.
-	 */
-	*locked_ni = eni;
-	return true;
-}
-
-static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
-		"chkdsk.";
-
-/**
- * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name
- * @vol:	volume on which to search for a free mft record
- * @base_ni:	open base inode if allocating an extent mft record or NULL
- *
- * Search for a free mft record in the mft bitmap attribute on the ntfs volume
- * @vol.
- *
- * If @base_ni is NULL start the search at the default allocator position.
- *
- * If @base_ni is not NULL start the search at the mft record after the base
- * mft record @base_ni.
- *
- * Return the free mft record on success and -errno on error.  An error code of
- * -ENOSPC means that there are no free mft records in the currently
- * initialized mft bitmap.
- *
- * Locking: Caller must hold vol->mftbmp_lock for writing.
- */
-static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
-		ntfs_inode *base_ni)
-{
-	s64 pass_end, ll, data_pos, pass_start, ofs, bit;
-	unsigned long flags;
-	struct address_space *mftbmp_mapping;
-	u8 *buf, *byte;
-	struct page *page;
-	unsigned int page_ofs, size;
-	u8 pass, b;
-
-	ntfs_debug("Searching for free mft record in the currently "
-			"initialized mft bitmap.");
-	mftbmp_mapping = vol->mftbmp_ino->i_mapping;
-	/*
-	 * Set the end of the pass making sure we do not overflow the mft
-	 * bitmap.
-	 */
-	read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags);
-	pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
-			vol->mft_record_size_bits;
-	read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags);
-	read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
-	ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3;
-	read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
-	if (pass_end > ll)
-		pass_end = ll;
-	pass = 1;
-	if (!base_ni)
-		data_pos = vol->mft_data_pos;
-	else
-		data_pos = base_ni->mft_no + 1;
-	if (data_pos < 24)
-		data_pos = 24;
-	if (data_pos >= pass_end) {
-		data_pos = 24;
-		pass = 2;
-		/* This happens on a freshly formatted volume. */
-		if (data_pos >= pass_end)
-			return -ENOSPC;
-	}
-	pass_start = data_pos;
-	ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, "
-			"pass_end 0x%llx, data_pos 0x%llx.", pass,
-			(long long)pass_start, (long long)pass_end,
-			(long long)data_pos);
-	/* Loop until a free mft record is found. */
-	for (; pass <= 2;) {
-		/* Cap size to pass_end. */
-		ofs = data_pos >> 3;
-		page_ofs = ofs & ~PAGE_MASK;
-		size = PAGE_SIZE - page_ofs;
-		ll = ((pass_end + 7) >> 3) - ofs;
-		if (size > ll)
-			size = ll;
-		size <<= 3;
-		/*
-		 * If we are still within the active pass, search the next page
-		 * for a zero bit.
-		 */
-		if (size) {
-			page = ntfs_map_page(mftbmp_mapping,
-					ofs >> PAGE_SHIFT);
-			if (IS_ERR(page)) {
-				ntfs_error(vol->sb, "Failed to read mft "
-						"bitmap, aborting.");
-				return PTR_ERR(page);
-			}
-			buf = (u8*)page_address(page) + page_ofs;
-			bit = data_pos & 7;
-			data_pos &= ~7ull;
-			ntfs_debug("Before inner for loop: size 0x%x, "
-					"data_pos 0x%llx, bit 0x%llx", size,
-					(long long)data_pos, (long long)bit);
-			for (; bit < size && data_pos + bit < pass_end;
-					bit &= ~7ull, bit += 8) {
-				byte = buf + (bit >> 3);
-				if (*byte == 0xff)
-					continue;
-				b = ffz((unsigned long)*byte);
-				if (b < 8 && b >= (bit & 7)) {
-					ll = data_pos + (bit & ~7ull) + b;
-					if (unlikely(ll > (1ll << 32))) {
-						ntfs_unmap_page(page);
-						return -ENOSPC;
-					}
-					*byte |= 1 << b;
-					flush_dcache_page(page);
-					set_page_dirty(page);
-					ntfs_unmap_page(page);
-					ntfs_debug("Done.  (Found and "
-							"allocated mft record "
-							"0x%llx.)",
-							(long long)ll);
-					return ll;
-				}
-			}
-			ntfs_debug("After inner for loop: size 0x%x, "
-					"data_pos 0x%llx, bit 0x%llx", size,
-					(long long)data_pos, (long long)bit);
-			data_pos += size;
-			ntfs_unmap_page(page);
-			/*
-			 * If the end of the pass has not been reached yet,
-			 * continue searching the mft bitmap for a zero bit.
-			 */
-			if (data_pos < pass_end)
-				continue;
-		}
-		/* Do the next pass. */
-		if (++pass == 2) {
-			/*
-			 * Starting the second pass, in which we scan the first
-			 * part of the zone which we omitted earlier.
-			 */
-			pass_end = pass_start;
-			data_pos = pass_start = 24;
-			ntfs_debug("pass %i, pass_start 0x%llx, pass_end "
-					"0x%llx.", pass, (long long)pass_start,
-					(long long)pass_end);
-			if (data_pos >= pass_end)
-				break;
-		}
-	}
-	/* No free mft records in currently initialized mft bitmap. */
-	ntfs_debug("Done.  (No free mft records left in currently initialized "
-			"mft bitmap.)");
-	return -ENOSPC;
-}
-
-/**
- * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
- * @vol:	volume on which to extend the mft bitmap attribute
- *
- * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster.
- *
- * Note: Only changes allocated_size, i.e. does not touch initialized_size or
- * data_size.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - Caller must hold vol->mftbmp_lock for writing.
- *	    - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
- *	      writing and releases it before returning.
- *	    - This function takes vol->lcnbmp_lock for writing and releases it
- *	      before returning.
- */
-static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
-{
-	LCN lcn;
-	s64 ll;
-	unsigned long flags;
-	struct page *page;
-	ntfs_inode *mft_ni, *mftbmp_ni;
-	runlist_element *rl, *rl2 = NULL;
-	ntfs_attr_search_ctx *ctx = NULL;
-	MFT_RECORD *mrec;
-	ATTR_RECORD *a = NULL;
-	int ret, mp_size;
-	u32 old_alen = 0;
-	u8 *b, tb;
-	struct {
-		u8 added_cluster:1;
-		u8 added_run:1;
-		u8 mp_rebuilt:1;
-	} status = { 0, 0, 0 };
-
-	ntfs_debug("Extending mft bitmap allocation.");
-	mft_ni = NTFS_I(vol->mft_ino);
-	mftbmp_ni = NTFS_I(vol->mftbmp_ino);
-	/*
-	 * Determine the last lcn of the mft bitmap.  The allocated size of the
-	 * mft bitmap cannot be zero so we are ok to do this.
-	 */
-	down_write(&mftbmp_ni->runlist.lock);
-	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	ll = mftbmp_ni->allocated_size;
-	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
-			(ll - 1) >> vol->cluster_size_bits, NULL);
-	if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
-		up_write(&mftbmp_ni->runlist.lock);
-		ntfs_error(vol->sb, "Failed to determine last allocated "
-				"cluster of mft bitmap attribute.");
-		if (!IS_ERR(rl))
-			ret = -EIO;
-		else
-			ret = PTR_ERR(rl);
-		return ret;
-	}
-	lcn = rl->lcn + rl->length;
-	ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
-			(long long)lcn);
-	/*
-	 * Attempt to get the cluster following the last allocated cluster by
-	 * hand as it may be in the MFT zone so the allocator would not give it
-	 * to us.
-	 */
-	ll = lcn >> 3;
-	page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
-			ll >> PAGE_SHIFT);
-	if (IS_ERR(page)) {
-		up_write(&mftbmp_ni->runlist.lock);
-		ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
-		return PTR_ERR(page);
-	}
-	b = (u8*)page_address(page) + (ll & ~PAGE_MASK);
-	tb = 1 << (lcn & 7ull);
-	down_write(&vol->lcnbmp_lock);
-	if (*b != 0xff && !(*b & tb)) {
-		/* Next cluster is free, allocate it. */
-		*b |= tb;
-		flush_dcache_page(page);
-		set_page_dirty(page);
-		up_write(&vol->lcnbmp_lock);
-		ntfs_unmap_page(page);
-		/* Update the mft bitmap runlist. */
-		rl->length++;
-		rl[1].vcn++;
-		status.added_cluster = 1;
-		ntfs_debug("Appending one cluster to mft bitmap.");
-	} else {
-		up_write(&vol->lcnbmp_lock);
-		ntfs_unmap_page(page);
-		/* Allocate a cluster from the DATA_ZONE. */
-		rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
-				true);
-		if (IS_ERR(rl2)) {
-			up_write(&mftbmp_ni->runlist.lock);
-			ntfs_error(vol->sb, "Failed to allocate a cluster for "
-					"the mft bitmap.");
-			return PTR_ERR(rl2);
-		}
-		rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2);
-		if (IS_ERR(rl)) {
-			up_write(&mftbmp_ni->runlist.lock);
-			ntfs_error(vol->sb, "Failed to merge runlists for mft "
-					"bitmap.");
-			if (ntfs_cluster_free_from_rl(vol, rl2)) {
-				ntfs_error(vol->sb, "Failed to deallocate "
-						"allocated cluster.%s", es);
-				NVolSetErrors(vol);
-			}
-			ntfs_free(rl2);
-			return PTR_ERR(rl);
-		}
-		mftbmp_ni->runlist.rl = rl;
-		status.added_run = 1;
-		ntfs_debug("Adding one run to mft bitmap.");
-		/* Find the last run in the new runlist. */
-		for (; rl[1].length; rl++)
-			;
-	}
-	/*
-	 * Update the attribute record as well.  Note: @rl is the last
-	 * (non-terminator) runlist element of mft bitmap.
-	 */
-	mrec = map_mft_record(mft_ni);
-	if (IS_ERR(mrec)) {
-		ntfs_error(vol->sb, "Failed to map mft record.");
-		ret = PTR_ERR(mrec);
-		goto undo_alloc;
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.");
-		ret = -ENOMEM;
-		goto undo_alloc;
-	}
-	ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
-			mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
-			0, ctx);
-	if (unlikely(ret)) {
-		ntfs_error(vol->sb, "Failed to find last attribute extent of "
-				"mft bitmap attribute.");
-		if (ret == -ENOENT)
-			ret = -EIO;
-		goto undo_alloc;
-	}
-	a = ctx->attr;
-	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
-	/* Search back for the previous last allocated cluster of mft bitmap. */
-	for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
-		if (ll >= rl2->vcn)
-			break;
-	}
-	BUG_ON(ll < rl2->vcn);
-	BUG_ON(ll >= rl2->vcn + rl2->length);
-	/* Get the size for the new mapping pairs array for this extent. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
-	if (unlikely(mp_size <= 0)) {
-		ntfs_error(vol->sb, "Get size for mapping pairs failed for "
-				"mft bitmap attribute extent.");
-		ret = mp_size;
-		if (!ret)
-			ret = -EIO;
-		goto undo_alloc;
-	}
-	/* Expand the attribute record if necessary. */
-	old_alen = le32_to_cpu(a->length);
-	ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
-	if (unlikely(ret)) {
-		if (ret != -ENOSPC) {
-			ntfs_error(vol->sb, "Failed to resize attribute "
-					"record for mft bitmap attribute.");
-			goto undo_alloc;
-		}
-		// TODO: Deal with this by moving this extent to a new mft
-		// record or by starting a new extent in a new mft record or by
-		// moving other attributes out of this mft record.
-		// Note: It will need to be a special mft record and if none of
-		// those are available it gets rather complicated...
-		ntfs_error(vol->sb, "Not enough space in this mft record to "
-				"accommodate extended mft bitmap attribute "
-				"extent.  Cannot handle this yet.");
-		ret = -EOPNOTSUPP;
-		goto undo_alloc;
-	}
-	status.mp_rebuilt = 1;
-	/* Generate the mapping pairs array directly into the attr record. */
-	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, rl2, ll, -1, NULL);
-	if (unlikely(ret)) {
-		ntfs_error(vol->sb, "Failed to build mapping pairs array for "
-				"mft bitmap attribute.");
-		goto undo_alloc;
-	}
-	/* Update the highest_vcn. */
-	a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
-	/*
-	 * We now have extended the mft bitmap allocated_size by one cluster.
-	 * Reflect this in the ntfs_inode structure and the attribute record.
-	 */
-	if (a->data.non_resident.lowest_vcn) {
-		/*
-		 * We are not in the first attribute extent, switch to it, but
-		 * first ensure the changes will make it to disk later.
-		 */
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		ntfs_attr_reinit_search_ctx(ctx);
-		ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
-				mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL,
-				0, ctx);
-		if (unlikely(ret)) {
-			ntfs_error(vol->sb, "Failed to find first attribute "
-					"extent of mft bitmap attribute.");
-			goto restore_undo_alloc;
-		}
-		a = ctx->attr;
-	}
-	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	mftbmp_ni->allocated_size += vol->cluster_size;
-	a->data.non_resident.allocated_size =
-			cpu_to_sle64(mftbmp_ni->allocated_size);
-	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	/* Ensure the changes make it to disk. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(mft_ni);
-	up_write(&mftbmp_ni->runlist.lock);
-	ntfs_debug("Done.");
-	return 0;
-restore_undo_alloc:
-	ntfs_attr_reinit_search_ctx(ctx);
-	if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
-			mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
-			0, ctx)) {
-		ntfs_error(vol->sb, "Failed to find last attribute extent of "
-				"mft bitmap attribute.%s", es);
-		write_lock_irqsave(&mftbmp_ni->size_lock, flags);
-		mftbmp_ni->allocated_size += vol->cluster_size;
-		write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(mft_ni);
-		up_write(&mftbmp_ni->runlist.lock);
-		/*
-		 * The only thing that is now wrong is ->allocated_size of the
-		 * base attribute extent which chkdsk should be able to fix.
-		 */
-		NVolSetErrors(vol);
-		return ret;
-	}
-	a = ctx->attr;
-	a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2);
-undo_alloc:
-	if (status.added_cluster) {
-		/* Truncate the last run in the runlist by one cluster. */
-		rl->length--;
-		rl[1].vcn--;
-	} else if (status.added_run) {
-		lcn = rl->lcn;
-		/* Remove the last run from the runlist. */
-		rl->lcn = rl[1].lcn;
-		rl->length = 0;
-	}
-	/* Deallocate the cluster. */
-	down_write(&vol->lcnbmp_lock);
-	if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
-		ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
-		NVolSetErrors(vol);
-	}
-	up_write(&vol->lcnbmp_lock);
-	if (status.mp_rebuilt) {
-		if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset),
-				old_alen - le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset),
-				rl2, ll, -1, NULL)) {
-			ntfs_error(vol->sb, "Failed to restore mapping pairs "
-					"array.%s", es);
-			NVolSetErrors(vol);
-		}
-		if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
-			ntfs_error(vol->sb, "Failed to restore attribute "
-					"record.%s", es);
-			NVolSetErrors(vol);
-		}
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-	}
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (!IS_ERR(mrec))
-		unmap_mft_record(mft_ni);
-	up_write(&mftbmp_ni->runlist.lock);
-	return ret;
-}
-
-/**
- * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
- * @vol:	volume on which to extend the mft bitmap attribute
- *
- * Extend the initialized portion of the mft bitmap attribute on the ntfs
- * volume @vol by 8 bytes.
- *
- * Note:  Only changes initialized_size and data_size, i.e. requires that
- * allocated_size is big enough to fit the new initialized_size.
- *
- * Return 0 on success and -error on error.
- *
- * Locking: Caller must hold vol->mftbmp_lock for writing.
- */
-static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
-{
-	s64 old_data_size, old_initialized_size;
-	unsigned long flags;
-	struct inode *mftbmp_vi;
-	ntfs_inode *mft_ni, *mftbmp_ni;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *mrec;
-	ATTR_RECORD *a;
-	int ret;
-
-	ntfs_debug("Extending mft bitmap initiailized (and data) size.");
-	mft_ni = NTFS_I(vol->mft_ino);
-	mftbmp_vi = vol->mftbmp_ino;
-	mftbmp_ni = NTFS_I(mftbmp_vi);
-	/* Get the attribute record. */
-	mrec = map_mft_record(mft_ni);
-	if (IS_ERR(mrec)) {
-		ntfs_error(vol->sb, "Failed to map mft record.");
-		return PTR_ERR(mrec);
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.");
-		ret = -ENOMEM;
-		goto unm_err_out;
-	}
-	ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
-			mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(ret)) {
-		ntfs_error(vol->sb, "Failed to find first attribute extent of "
-				"mft bitmap attribute.");
-		if (ret == -ENOENT)
-			ret = -EIO;
-		goto put_err_out;
-	}
-	a = ctx->attr;
-	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	old_data_size = i_size_read(mftbmp_vi);
-	old_initialized_size = mftbmp_ni->initialized_size;
-	/*
-	 * We can simply update the initialized_size before filling the space
-	 * with zeroes because the caller is holding the mft bitmap lock for
-	 * writing which ensures that no one else is trying to access the data.
-	 */
-	mftbmp_ni->initialized_size += 8;
-	a->data.non_resident.initialized_size =
-			cpu_to_sle64(mftbmp_ni->initialized_size);
-	if (mftbmp_ni->initialized_size > old_data_size) {
-		i_size_write(mftbmp_vi, mftbmp_ni->initialized_size);
-		a->data.non_resident.data_size =
-				cpu_to_sle64(mftbmp_ni->initialized_size);
-	}
-	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	/* Ensure the changes make it to disk. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(mft_ni);
-	/* Initialize the mft bitmap attribute value with zeroes. */
-	ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
-	if (likely(!ret)) {
-		ntfs_debug("Done.  (Wrote eight initialized bytes to mft "
-				"bitmap.");
-		return 0;
-	}
-	ntfs_error(vol->sb, "Failed to write to mft bitmap.");
-	/* Try to recover from the error. */
-	mrec = map_mft_record(mft_ni);
-	if (IS_ERR(mrec)) {
-		ntfs_error(vol->sb, "Failed to map mft record.%s", es);
-		NVolSetErrors(vol);
-		return ret;
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.%s", es);
-		NVolSetErrors(vol);
-		goto unm_err_out;
-	}
-	if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
-			mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
-		ntfs_error(vol->sb, "Failed to find first attribute extent of "
-				"mft bitmap attribute.%s", es);
-		NVolSetErrors(vol);
-put_err_out:
-		ntfs_attr_put_search_ctx(ctx);
-unm_err_out:
-		unmap_mft_record(mft_ni);
-		goto err_out;
-	}
-	a = ctx->attr;
-	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	mftbmp_ni->initialized_size = old_initialized_size;
-	a->data.non_resident.initialized_size =
-			cpu_to_sle64(old_initialized_size);
-	if (i_size_read(mftbmp_vi) != old_data_size) {
-		i_size_write(mftbmp_vi, old_data_size);
-		a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
-	}
-	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(mft_ni);
-#ifdef DEBUG
-	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
-			"data_size 0x%llx, initialized_size 0x%llx.",
-			(long long)mftbmp_ni->allocated_size,
-			(long long)i_size_read(mftbmp_vi),
-			(long long)mftbmp_ni->initialized_size);
-	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-#endif /* DEBUG */
-err_out:
-	return ret;
-}
-
-/**
- * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
- * @vol:	volume on which to extend the mft data attribute
- *
- * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
- * worth of clusters or if not enough space for this by one mft record worth
- * of clusters.
- *
- * Note:  Only changes allocated_size, i.e. does not touch initialized_size or
- * data_size.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - Caller must hold vol->mftbmp_lock for writing.
- *	    - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
- *	      writing and releases it before returning.
- *	    - This function calls functions which take vol->lcnbmp_lock for
- *	      writing and release it before returning.
- */
-static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
-{
-	LCN lcn;
-	VCN old_last_vcn;
-	s64 min_nr, nr, ll;
-	unsigned long flags;
-	ntfs_inode *mft_ni;
-	runlist_element *rl, *rl2;
-	ntfs_attr_search_ctx *ctx = NULL;
-	MFT_RECORD *mrec;
-	ATTR_RECORD *a = NULL;
-	int ret, mp_size;
-	u32 old_alen = 0;
-	bool mp_rebuilt = false;
-
-	ntfs_debug("Extending mft data allocation.");
-	mft_ni = NTFS_I(vol->mft_ino);
-	/*
-	 * Determine the preferred allocation location, i.e. the last lcn of
-	 * the mft data attribute.  The allocated size of the mft data
-	 * attribute cannot be zero so we are ok to do this.
-	 */
-	down_write(&mft_ni->runlist.lock);
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	ll = mft_ni->allocated_size;
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	rl = ntfs_attr_find_vcn_nolock(mft_ni,
-			(ll - 1) >> vol->cluster_size_bits, NULL);
-	if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
-		up_write(&mft_ni->runlist.lock);
-		ntfs_error(vol->sb, "Failed to determine last allocated "
-				"cluster of mft data attribute.");
-		if (!IS_ERR(rl))
-			ret = -EIO;
-		else
-			ret = PTR_ERR(rl);
-		return ret;
-	}
-	lcn = rl->lcn + rl->length;
-	ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn);
-	/* Minimum allocation is one mft record worth of clusters. */
-	min_nr = vol->mft_record_size >> vol->cluster_size_bits;
-	if (!min_nr)
-		min_nr = 1;
-	/* Want to allocate 16 mft records worth of clusters. */
-	nr = vol->mft_record_size << 4 >> vol->cluster_size_bits;
-	if (!nr)
-		nr = min_nr;
-	/* Ensure we do not go above 2^32-1 mft records. */
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	ll = mft_ni->allocated_size;
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
-			vol->mft_record_size_bits >= (1ll << 32))) {
-		nr = min_nr;
-		if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
-				vol->mft_record_size_bits >= (1ll << 32))) {
-			ntfs_warning(vol->sb, "Cannot allocate mft record "
-					"because the maximum number of inodes "
-					"(2^32) has already been reached.");
-			up_write(&mft_ni->runlist.lock);
-			return -ENOSPC;
-		}
-	}
-	ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
-			nr > min_nr ? "default" : "minimal", (long long)nr);
-	old_last_vcn = rl[1].vcn;
-	do {
-		rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
-				true);
-		if (!IS_ERR(rl2))
-			break;
-		if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
-			ntfs_error(vol->sb, "Failed to allocate the minimal "
-					"number of clusters (%lli) for the "
-					"mft data attribute.", (long long)nr);
-			up_write(&mft_ni->runlist.lock);
-			return PTR_ERR(rl2);
-		}
-		/*
-		 * There is not enough space to do the allocation, but there
-		 * might be enough space to do a minimal allocation so try that
-		 * before failing.
-		 */
-		nr = min_nr;
-		ntfs_debug("Retrying mft data allocation with minimal cluster "
-				"count %lli.", (long long)nr);
-	} while (1);
-	rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2);
-	if (IS_ERR(rl)) {
-		up_write(&mft_ni->runlist.lock);
-		ntfs_error(vol->sb, "Failed to merge runlists for mft data "
-				"attribute.");
-		if (ntfs_cluster_free_from_rl(vol, rl2)) {
-			ntfs_error(vol->sb, "Failed to deallocate clusters "
-					"from the mft data attribute.%s", es);
-			NVolSetErrors(vol);
-		}
-		ntfs_free(rl2);
-		return PTR_ERR(rl);
-	}
-	mft_ni->runlist.rl = rl;
-	ntfs_debug("Allocated %lli clusters.", (long long)nr);
-	/* Find the last run in the new runlist. */
-	for (; rl[1].length; rl++)
-		;
-	/* Update the attribute record as well. */
-	mrec = map_mft_record(mft_ni);
-	if (IS_ERR(mrec)) {
-		ntfs_error(vol->sb, "Failed to map mft record.");
-		ret = PTR_ERR(mrec);
-		goto undo_alloc;
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.");
-		ret = -ENOMEM;
-		goto undo_alloc;
-	}
-	ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
-			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
-	if (unlikely(ret)) {
-		ntfs_error(vol->sb, "Failed to find last attribute extent of "
-				"mft data attribute.");
-		if (ret == -ENOENT)
-			ret = -EIO;
-		goto undo_alloc;
-	}
-	a = ctx->attr;
-	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
-	/* Search back for the previous last allocated cluster of mft bitmap. */
-	for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
-		if (ll >= rl2->vcn)
-			break;
-	}
-	BUG_ON(ll < rl2->vcn);
-	BUG_ON(ll >= rl2->vcn + rl2->length);
-	/* Get the size for the new mapping pairs array for this extent. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
-	if (unlikely(mp_size <= 0)) {
-		ntfs_error(vol->sb, "Get size for mapping pairs failed for "
-				"mft data attribute extent.");
-		ret = mp_size;
-		if (!ret)
-			ret = -EIO;
-		goto undo_alloc;
-	}
-	/* Expand the attribute record if necessary. */
-	old_alen = le32_to_cpu(a->length);
-	ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
-	if (unlikely(ret)) {
-		if (ret != -ENOSPC) {
-			ntfs_error(vol->sb, "Failed to resize attribute "
-					"record for mft data attribute.");
-			goto undo_alloc;
-		}
-		// TODO: Deal with this by moving this extent to a new mft
-		// record or by starting a new extent in a new mft record or by
-		// moving other attributes out of this mft record.
-		// Note: Use the special reserved mft records and ensure that
-		// this extent is not required to find the mft record in
-		// question.  If no free special records left we would need to
-		// move an existing record away, insert ours in its place, and
-		// then place the moved record into the newly allocated space
-		// and we would then need to update all references to this mft
-		// record appropriately.  This is rather complicated...
-		ntfs_error(vol->sb, "Not enough space in this mft record to "
-				"accommodate extended mft data attribute "
-				"extent.  Cannot handle this yet.");
-		ret = -EOPNOTSUPP;
-		goto undo_alloc;
-	}
-	mp_rebuilt = true;
-	/* Generate the mapping pairs array directly into the attr record. */
-	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, rl2, ll, -1, NULL);
-	if (unlikely(ret)) {
-		ntfs_error(vol->sb, "Failed to build mapping pairs array of "
-				"mft data attribute.");
-		goto undo_alloc;
-	}
-	/* Update the highest_vcn. */
-	a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
-	/*
-	 * We now have extended the mft data allocated_size by nr clusters.
-	 * Reflect this in the ntfs_inode structure and the attribute record.
-	 * @rl is the last (non-terminator) runlist element of mft data
-	 * attribute.
-	 */
-	if (a->data.non_resident.lowest_vcn) {
-		/*
-		 * We are not in the first attribute extent, switch to it, but
-		 * first ensure the changes will make it to disk later.
-		 */
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		ntfs_attr_reinit_search_ctx(ctx);
-		ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
-				mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0,
-				ctx);
-		if (unlikely(ret)) {
-			ntfs_error(vol->sb, "Failed to find first attribute "
-					"extent of mft data attribute.");
-			goto restore_undo_alloc;
-		}
-		a = ctx->attr;
-	}
-	write_lock_irqsave(&mft_ni->size_lock, flags);
-	mft_ni->allocated_size += nr << vol->cluster_size_bits;
-	a->data.non_resident.allocated_size =
-			cpu_to_sle64(mft_ni->allocated_size);
-	write_unlock_irqrestore(&mft_ni->size_lock, flags);
-	/* Ensure the changes make it to disk. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(mft_ni);
-	up_write(&mft_ni->runlist.lock);
-	ntfs_debug("Done.");
-	return 0;
-restore_undo_alloc:
-	ntfs_attr_reinit_search_ctx(ctx);
-	if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
-			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
-		ntfs_error(vol->sb, "Failed to find last attribute extent of "
-				"mft data attribute.%s", es);
-		write_lock_irqsave(&mft_ni->size_lock, flags);
-		mft_ni->allocated_size += nr << vol->cluster_size_bits;
-		write_unlock_irqrestore(&mft_ni->size_lock, flags);
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(mft_ni);
-		up_write(&mft_ni->runlist.lock);
-		/*
-		 * The only thing that is now wrong is ->allocated_size of the
-		 * base attribute extent which chkdsk should be able to fix.
-		 */
-		NVolSetErrors(vol);
-		return ret;
-	}
-	ctx->attr->data.non_resident.highest_vcn =
-			cpu_to_sle64(old_last_vcn - 1);
-undo_alloc:
-	if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
-		ntfs_error(vol->sb, "Failed to free clusters from mft data "
-				"attribute.%s", es);
-		NVolSetErrors(vol);
-	}
-
-	if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
-		ntfs_error(vol->sb, "Failed to truncate mft data attribute "
-				"runlist.%s", es);
-		NVolSetErrors(vol);
-	}
-	if (ctx) {
-		a = ctx->attr;
-		if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
-			if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset),
-				old_alen - le16_to_cpu(
-					a->data.non_resident.mapping_pairs_offset),
-				rl2, ll, -1, NULL)) {
-				ntfs_error(vol->sb, "Failed to restore mapping pairs "
-					"array.%s", es);
-				NVolSetErrors(vol);
-			}
-			if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
-				ntfs_error(vol->sb, "Failed to restore attribute "
-					"record.%s", es);
-				NVolSetErrors(vol);
-			}
-			flush_dcache_mft_record_page(ctx->ntfs_ino);
-			mark_mft_record_dirty(ctx->ntfs_ino);
-		} else if (IS_ERR(ctx->mrec)) {
-			ntfs_error(vol->sb, "Failed to restore attribute search "
-				"context.%s", es);
-			NVolSetErrors(vol);
-		}
-		ntfs_attr_put_search_ctx(ctx);
-	}
-	if (!IS_ERR(mrec))
-		unmap_mft_record(mft_ni);
-	up_write(&mft_ni->runlist.lock);
-	return ret;
-}
-
-/**
- * ntfs_mft_record_layout - layout an mft record into a memory buffer
- * @vol:	volume to which the mft record will belong
- * @mft_no:	mft reference specifying the mft record number
- * @m:		destination buffer of size >= @vol->mft_record_size bytes
- *
- * Layout an empty, unused mft record with the mft record number @mft_no into
- * the buffer @m.  The volume @vol is needed because the mft record structure
- * was modified in NTFS 3.1 so we need to know which volume version this mft
- * record will be used on.
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
-		MFT_RECORD *m)
-{
-	ATTR_RECORD *a;
-
-	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
-	if (mft_no >= (1ll << 32)) {
-		ntfs_error(vol->sb, "Mft record number 0x%llx exceeds "
-				"maximum of 2^32.", (long long)mft_no);
-		return -ERANGE;
-	}
-	/* Start by clearing the whole mft record to gives us a clean slate. */
-	memset(m, 0, vol->mft_record_size);
-	/* Aligned to 2-byte boundary. */
-	if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver))
-		m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1);
-	else {
-		m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
-		/*
-		 * Set the NTFS 3.1+ specific fields while we know that the
-		 * volume version is 3.1+.
-		 */
-		m->reserved = 0;
-		m->mft_record_number = cpu_to_le32((u32)mft_no);
-	}
-	m->magic = magic_FILE;
-	if (vol->mft_record_size >= NTFS_BLOCK_SIZE)
-		m->usa_count = cpu_to_le16(vol->mft_record_size /
-				NTFS_BLOCK_SIZE + 1);
-	else {
-		m->usa_count = cpu_to_le16(1);
-		ntfs_warning(vol->sb, "Sector size is bigger than mft record "
-				"size.  Setting usa_count to 1.  If chkdsk "
-				"reports this as corruption, please email "
-				"linux-ntfs-dev@lists.sourceforge.net stating "
-				"that you saw this message and that the "
-				"modified filesystem created was corrupt.  "
-				"Thank you.");
-	}
-	/* Set the update sequence number to 1. */
-	*(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1);
-	m->lsn = 0;
-	m->sequence_number = cpu_to_le16(1);
-	m->link_count = 0;
-	/*
-	 * Place the attributes straight after the update sequence array,
-	 * aligned to 8-byte boundary.
-	 */
-	m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
-			(le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
-	m->flags = 0;
-	/*
-	 * Using attrs_offset plus eight bytes (for the termination attribute).
-	 * attrs_offset is already aligned to 8-byte boundary, so no need to
-	 * align again.
-	 */
-	m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8);
-	m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
-	m->base_mft_record = 0;
-	m->next_attr_instance = 0;
-	/* Add the termination attribute. */
-	a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
-	a->type = AT_END;
-	a->length = 0;
-	ntfs_debug("Done.");
-	return 0;
-}
-
-/**
- * ntfs_mft_record_format - format an mft record on an ntfs volume
- * @vol:	volume on which to format the mft record
- * @mft_no:	mft record number to format
- *
- * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
- * mft record into the appropriate place of the mft data attribute.  This is
- * used when extending the mft data attribute.
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
-{
-	loff_t i_size;
-	struct inode *mft_vi = vol->mft_ino;
-	struct page *page;
-	MFT_RECORD *m;
-	pgoff_t index, end_index;
-	unsigned int ofs;
-	int err;
-
-	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
-	/*
-	 * The index into the page cache and the offset within the page cache
-	 * page of the wanted mft record.
-	 */
-	index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT;
-	ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
-	/* The maximum valid index into the page cache for $MFT's data. */
-	i_size = i_size_read(mft_vi);
-	end_index = i_size >> PAGE_SHIFT;
-	if (unlikely(index >= end_index)) {
-		if (unlikely(index > end_index || ofs + vol->mft_record_size >=
-				(i_size & ~PAGE_MASK))) {
-			ntfs_error(vol->sb, "Tried to format non-existing mft "
-					"record 0x%llx.", (long long)mft_no);
-			return -ENOENT;
-		}
-	}
-	/* Read, map, and pin the page containing the mft record. */
-	page = ntfs_map_page(mft_vi->i_mapping, index);
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Failed to map page containing mft record "
-				"to format 0x%llx.", (long long)mft_no);
-		return PTR_ERR(page);
-	}
-	lock_page(page);
-	BUG_ON(!PageUptodate(page));
-	ClearPageUptodate(page);
-	m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
-	err = ntfs_mft_record_layout(vol, mft_no, m);
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
-				(long long)mft_no);
-		SetPageUptodate(page);
-		unlock_page(page);
-		ntfs_unmap_page(page);
-		return err;
-	}
-	flush_dcache_page(page);
-	SetPageUptodate(page);
-	unlock_page(page);
-	/*
-	 * Make sure the mft record is written out to disk.  We could use
-	 * ilookup5() to check if an inode is in icache and so on but this is
-	 * unnecessary as ntfs_writepage() will write the dirty record anyway.
-	 */
-	mark_ntfs_record_dirty(page, ofs);
-	ntfs_unmap_page(page);
-	ntfs_debug("Done.");
-	return 0;
-}
-
-/**
- * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
- * @vol:	[IN]  volume on which to allocate the mft record
- * @mode:	[IN]  mode if want a file or directory, i.e. base inode or 0
- * @base_ni:	[IN]  open base inode if allocating an extent mft record or NULL
- * @mrec:	[OUT] on successful return this is the mapped mft record
- *
- * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
- *
- * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
- * direvctory inode, and allocate it at the default allocator position.  In
- * this case @mode is the file mode as given to us by the caller.  We in
- * particular use @mode to distinguish whether a file or a directory is being
- * created (S_IFDIR(mode) and S_IFREG(mode), respectively).
- *
- * If @base_ni is not NULL make the allocated mft record an extent record,
- * allocate it starting at the mft record after the base mft record and attach
- * the allocated and opened ntfs inode to the base inode @base_ni.  In this
- * case @mode must be 0 as it is meaningless for extent inodes.
- *
- * You need to check the return value with IS_ERR().  If false, the function
- * was successful and the return value is the now opened ntfs inode of the
- * allocated mft record.  *@mrec is then set to the allocated, mapped, pinned,
- * and locked mft record.  If IS_ERR() is true, the function failed and the
- * error code is obtained from PTR_ERR(return value).  *@mrec is undefined in
- * this case.
- *
- * Allocation strategy:
- *
- * To find a free mft record, we scan the mft bitmap for a zero bit.  To
- * optimize this we start scanning at the place specified by @base_ni or if
- * @base_ni is NULL we start where we last stopped and we perform wrap around
- * when we reach the end.  Note, we do not try to allocate mft records below
- * number 24 because numbers 0 to 15 are the defined system files anyway and 16
- * to 24 are special in that they are used for storing extension mft records
- * for the $DATA attribute of $MFT.  This is required to avoid the possibility
- * of creating a runlist with a circular dependency which once written to disk
- * can never be read in again.  Windows will only use records 16 to 24 for
- * normal files if the volume is completely out of space.  We never use them
- * which means that when the volume is really out of space we cannot create any
- * more files while Windows can still create up to 8 small files.  We can start
- * doing this at some later time, it does not matter much for now.
- *
- * When scanning the mft bitmap, we only search up to the last allocated mft
- * record.  If there are no free records left in the range 24 to number of
- * allocated mft records, then we extend the $MFT/$DATA attribute in order to
- * create free mft records.  We extend the allocated size of $MFT/$DATA by 16
- * records at a time or one cluster, if cluster size is above 16kiB.  If there
- * is not sufficient space to do this, we try to extend by a single mft record
- * or one cluster, if cluster size is above the mft record size.
- *
- * No matter how many mft records we allocate, we initialize only the first
- * allocated mft record, incrementing mft data size and initialized size
- * accordingly, open an ntfs_inode for it and return it to the caller, unless
- * there are less than 24 mft records, in which case we allocate and initialize
- * mft records until we reach record 24 which we consider as the first free mft
- * record for use by normal files.
- *
- * If during any stage we overflow the initialized data in the mft bitmap, we
- * extend the initialized size (and data size) by 8 bytes, allocating another
- * cluster if required.  The bitmap data size has to be at least equal to the
- * number of mft records in the mft, but it can be bigger, in which case the
- * superflous bits are padded with zeroes.
- *
- * Thus, when we return successfully (IS_ERR() is false), we will have:
- *	- initialized / extended the mft bitmap if necessary,
- *	- initialized / extended the mft data if necessary,
- *	- set the bit corresponding to the mft record being allocated in the
- *	  mft bitmap,
- *	- opened an ntfs_inode for the allocated mft record, and we will have
- *	- returned the ntfs_inode as well as the allocated mapped, pinned, and
- *	  locked mft record.
- *
- * On error, the volume will be left in a consistent state and no record will
- * be allocated.  If rolling back a partial operation fails, we may leave some
- * inconsistent metadata in which case we set NVolErrors() so the volume is
- * left dirty when unmounted.
- *
- * Note, this function cannot make use of most of the normal functions, like
- * for example for attribute resizing, etc, because when the run list overflows
- * the base mft record and an attribute list is used, it is very important that
- * the extension mft records used to store the $DATA attribute of $MFT can be
- * reached without having to read the information contained inside them, as
- * this would make it impossible to find them in the first place after the
- * volume is unmounted.  $MFT/$BITMAP probably does not need to follow this
- * rule because the bitmap is not essential for finding the mft records, but on
- * the other hand, handling the bitmap in this special way would make life
- * easier because otherwise there might be circular invocations of functions
- * when reading the bitmap.
- */
-ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
-		ntfs_inode *base_ni, MFT_RECORD **mrec)
-{
-	s64 ll, bit, old_data_initialized, old_data_size;
-	unsigned long flags;
-	struct inode *vi;
-	struct page *page;
-	ntfs_inode *mft_ni, *mftbmp_ni, *ni;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	pgoff_t index;
-	unsigned int ofs;
-	int err;
-	le16 seq_no, usn;
-	bool record_formatted = false;
-
-	if (base_ni) {
-		ntfs_debug("Entering (allocating an extent mft record for "
-				"base mft record 0x%llx).",
-				(long long)base_ni->mft_no);
-		/* @mode and @base_ni are mutually exclusive. */
-		BUG_ON(mode);
-	} else
-		ntfs_debug("Entering (allocating a base mft record).");
-	if (mode) {
-		/* @mode and @base_ni are mutually exclusive. */
-		BUG_ON(base_ni);
-		/* We only support creation of normal files and directories. */
-		if (!S_ISREG(mode) && !S_ISDIR(mode))
-			return ERR_PTR(-EOPNOTSUPP);
-	}
-	BUG_ON(!mrec);
-	mft_ni = NTFS_I(vol->mft_ino);
-	mftbmp_ni = NTFS_I(vol->mftbmp_ino);
-	down_write(&vol->mftbmp_lock);
-	bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
-	if (bit >= 0) {
-		ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
-				(long long)bit);
-		goto have_alloc_rec;
-	}
-	if (bit != -ENOSPC) {
-		up_write(&vol->mftbmp_lock);
-		return ERR_PTR(bit);
-	}
-	/*
-	 * No free mft records left.  If the mft bitmap already covers more
-	 * than the currently used mft records, the next records are all free,
-	 * so we can simply allocate the first unused mft record.
-	 * Note: We also have to make sure that the mft bitmap at least covers
-	 * the first 24 mft records as they are special and whilst they may not
-	 * be in use, we do not allocate from them.
-	 */
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	old_data_initialized = mftbmp_ni->initialized_size;
-	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	if (old_data_initialized << 3 > ll && old_data_initialized > 3) {
-		bit = ll;
-		if (bit < 24)
-			bit = 24;
-		if (unlikely(bit >= (1ll << 32)))
-			goto max_err_out;
-		ntfs_debug("Found free record (#2), bit 0x%llx.",
-				(long long)bit);
-		goto found_free_rec;
-	}
-	/*
-	 * The mft bitmap needs to be expanded until it covers the first unused
-	 * mft record that we can allocate.
-	 * Note: The smallest mft record we allocate is mft record 24.
-	 */
-	bit = old_data_initialized << 3;
-	if (unlikely(bit >= (1ll << 32)))
-		goto max_err_out;
-	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	old_data_size = mftbmp_ni->allocated_size;
-	ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
-			"data_size 0x%llx, initialized_size 0x%llx.",
-			(long long)old_data_size,
-			(long long)i_size_read(vol->mftbmp_ino),
-			(long long)old_data_initialized);
-	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	if (old_data_initialized + 8 > old_data_size) {
-		/* Need to extend bitmap by one more cluster. */
-		ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
-		err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
-		if (unlikely(err)) {
-			up_write(&vol->mftbmp_lock);
-			goto err_out;
-		}
-#ifdef DEBUG
-		read_lock_irqsave(&mftbmp_ni->size_lock, flags);
-		ntfs_debug("Status of mftbmp after allocation extension: "
-				"allocated_size 0x%llx, data_size 0x%llx, "
-				"initialized_size 0x%llx.",
-				(long long)mftbmp_ni->allocated_size,
-				(long long)i_size_read(vol->mftbmp_ino),
-				(long long)mftbmp_ni->initialized_size);
-		read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-#endif /* DEBUG */
-	}
-	/*
-	 * We now have sufficient allocated space, extend the initialized_size
-	 * as well as the data_size if necessary and fill the new space with
-	 * zeroes.
-	 */
-	err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
-	if (unlikely(err)) {
-		up_write(&vol->mftbmp_lock);
-		goto err_out;
-	}
-#ifdef DEBUG
-	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	ntfs_debug("Status of mftbmp after initialized extension: "
-			"allocated_size 0x%llx, data_size 0x%llx, "
-			"initialized_size 0x%llx.",
-			(long long)mftbmp_ni->allocated_size,
-			(long long)i_size_read(vol->mftbmp_ino),
-			(long long)mftbmp_ni->initialized_size);
-	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-#endif /* DEBUG */
-	ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
-found_free_rec:
-	/* @bit is the found free mft record, allocate it in the mft bitmap. */
-	ntfs_debug("At found_free_rec.");
-	err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit);
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
-		up_write(&vol->mftbmp_lock);
-		goto err_out;
-	}
-	ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
-have_alloc_rec:
-	/*
-	 * The mft bitmap is now uptodate.  Deal with mft data attribute now.
-	 * Note, we keep hold of the mft bitmap lock for writing until all
-	 * modifications to the mft data attribute are complete, too, as they
-	 * will impact decisions for mft bitmap and mft record allocation done
-	 * by a parallel allocation and if the lock is not maintained a
-	 * parallel allocation could allocate the same mft record as this one.
-	 */
-	ll = (bit + 1) << vol->mft_record_size_bits;
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	old_data_initialized = mft_ni->initialized_size;
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	if (ll <= old_data_initialized) {
-		ntfs_debug("Allocated mft record already initialized.");
-		goto mft_rec_already_initialized;
-	}
-	ntfs_debug("Initializing allocated mft record.");
-	/*
-	 * The mft record is outside the initialized data.  Extend the mft data
-	 * attribute until it covers the allocated record.  The loop is only
-	 * actually traversed more than once when a freshly formatted volume is
-	 * first written to so it optimizes away nicely in the common case.
-	 */
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	ntfs_debug("Status of mft data before extension: "
-			"allocated_size 0x%llx, data_size 0x%llx, "
-			"initialized_size 0x%llx.",
-			(long long)mft_ni->allocated_size,
-			(long long)i_size_read(vol->mft_ino),
-			(long long)mft_ni->initialized_size);
-	while (ll > mft_ni->allocated_size) {
-		read_unlock_irqrestore(&mft_ni->size_lock, flags);
-		err = ntfs_mft_data_extend_allocation_nolock(vol);
-		if (unlikely(err)) {
-			ntfs_error(vol->sb, "Failed to extend mft data "
-					"allocation.");
-			goto undo_mftbmp_alloc_nolock;
-		}
-		read_lock_irqsave(&mft_ni->size_lock, flags);
-		ntfs_debug("Status of mft data after allocation extension: "
-				"allocated_size 0x%llx, data_size 0x%llx, "
-				"initialized_size 0x%llx.",
-				(long long)mft_ni->allocated_size,
-				(long long)i_size_read(vol->mft_ino),
-				(long long)mft_ni->initialized_size);
-	}
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	/*
-	 * Extend mft data initialized size (and data size of course) to reach
-	 * the allocated mft record, formatting the mft records allong the way.
-	 * Note: We only modify the ntfs_inode structure as that is all that is
-	 * needed by ntfs_mft_record_format().  We will update the attribute
-	 * record itself in one fell swoop later on.
-	 */
-	write_lock_irqsave(&mft_ni->size_lock, flags);
-	old_data_initialized = mft_ni->initialized_size;
-	old_data_size = vol->mft_ino->i_size;
-	while (ll > mft_ni->initialized_size) {
-		s64 new_initialized_size, mft_no;
-		
-		new_initialized_size = mft_ni->initialized_size +
-				vol->mft_record_size;
-		mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
-		if (new_initialized_size > i_size_read(vol->mft_ino))
-			i_size_write(vol->mft_ino, new_initialized_size);
-		write_unlock_irqrestore(&mft_ni->size_lock, flags);
-		ntfs_debug("Initializing mft record 0x%llx.",
-				(long long)mft_no);
-		err = ntfs_mft_record_format(vol, mft_no);
-		if (unlikely(err)) {
-			ntfs_error(vol->sb, "Failed to format mft record.");
-			goto undo_data_init;
-		}
-		write_lock_irqsave(&mft_ni->size_lock, flags);
-		mft_ni->initialized_size = new_initialized_size;
-	}
-	write_unlock_irqrestore(&mft_ni->size_lock, flags);
-	record_formatted = true;
-	/* Update the mft data attribute record to reflect the new sizes. */
-	m = map_mft_record(mft_ni);
-	if (IS_ERR(m)) {
-		ntfs_error(vol->sb, "Failed to map mft record.");
-		err = PTR_ERR(m);
-		goto undo_data_init;
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, m);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.");
-		err = -ENOMEM;
-		unmap_mft_record(mft_ni);
-		goto undo_data_init;
-	}
-	err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Failed to find first attribute extent of "
-				"mft data attribute.");
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(mft_ni);
-		goto undo_data_init;
-	}
-	a = ctx->attr;
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	a->data.non_resident.initialized_size =
-			cpu_to_sle64(mft_ni->initialized_size);
-	a->data.non_resident.data_size =
-			cpu_to_sle64(i_size_read(vol->mft_ino));
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	/* Ensure the changes make it to disk. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(mft_ni);
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	ntfs_debug("Status of mft data after mft record initialization: "
-			"allocated_size 0x%llx, data_size 0x%llx, "
-			"initialized_size 0x%llx.",
-			(long long)mft_ni->allocated_size,
-			(long long)i_size_read(vol->mft_ino),
-			(long long)mft_ni->initialized_size);
-	BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size);
-	BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino));
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-mft_rec_already_initialized:
-	/*
-	 * We can finally drop the mft bitmap lock as the mft data attribute
-	 * has been fully updated.  The only disparity left is that the
-	 * allocated mft record still needs to be marked as in use to match the
-	 * set bit in the mft bitmap but this is actually not a problem since
-	 * this mft record is not referenced from anywhere yet and the fact
-	 * that it is allocated in the mft bitmap means that no-one will try to
-	 * allocate it either.
-	 */
-	up_write(&vol->mftbmp_lock);
-	/*
-	 * We now have allocated and initialized the mft record.  Calculate the
-	 * index of and the offset within the page cache page the record is in.
-	 */
-	index = bit << vol->mft_record_size_bits >> PAGE_SHIFT;
-	ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK;
-	/* Read, map, and pin the page containing the mft record. */
-	page = ntfs_map_page(vol->mft_ino->i_mapping, index);
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Failed to map page containing allocated "
-				"mft record 0x%llx.", (long long)bit);
-		err = PTR_ERR(page);
-		goto undo_mftbmp_alloc;
-	}
-	lock_page(page);
-	BUG_ON(!PageUptodate(page));
-	ClearPageUptodate(page);
-	m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
-	/* If we just formatted the mft record no need to do it again. */
-	if (!record_formatted) {
-		/* Sanity check that the mft record is really not in use. */
-		if (ntfs_is_file_record(m->magic) &&
-				(m->flags & MFT_RECORD_IN_USE)) {
-			ntfs_error(vol->sb, "Mft record 0x%llx was marked "
-					"free in mft bitmap but is marked "
-					"used itself.  Corrupt filesystem.  "
-					"Unmount and run chkdsk.",
-					(long long)bit);
-			err = -EIO;
-			SetPageUptodate(page);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			NVolSetErrors(vol);
-			goto undo_mftbmp_alloc;
-		}
-		/*
-		 * We need to (re-)format the mft record, preserving the
-		 * sequence number if it is not zero as well as the update
-		 * sequence number if it is not zero or -1 (0xffff).  This
-		 * means we do not need to care whether or not something went
-		 * wrong with the previous mft record.
-		 */
-		seq_no = m->sequence_number;
-		usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs));
-		err = ntfs_mft_record_layout(vol, bit, m);
-		if (unlikely(err)) {
-			ntfs_error(vol->sb, "Failed to layout allocated mft "
-					"record 0x%llx.", (long long)bit);
-			SetPageUptodate(page);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			goto undo_mftbmp_alloc;
-		}
-		if (seq_no)
-			m->sequence_number = seq_no;
-		if (usn && le16_to_cpu(usn) != 0xffff)
-			*(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn;
-	}
-	/* Set the mft record itself in use. */
-	m->flags |= MFT_RECORD_IN_USE;
-	if (S_ISDIR(mode))
-		m->flags |= MFT_RECORD_IS_DIRECTORY;
-	flush_dcache_page(page);
-	SetPageUptodate(page);
-	if (base_ni) {
-		MFT_RECORD *m_tmp;
-
-		/*
-		 * Setup the base mft record in the extent mft record.  This
-		 * completes initialization of the allocated extent mft record
-		 * and we can simply use it with map_extent_mft_record().
-		 */
-		m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
-				base_ni->seq_no);
-		/*
-		 * Allocate an extent inode structure for the new mft record,
-		 * attach it to the base inode @base_ni and map, pin, and lock
-		 * its, i.e. the allocated, mft record.
-		 */
-		m_tmp = map_extent_mft_record(base_ni, bit, &ni);
-		if (IS_ERR(m_tmp)) {
-			ntfs_error(vol->sb, "Failed to map allocated extent "
-					"mft record 0x%llx.", (long long)bit);
-			err = PTR_ERR(m_tmp);
-			/* Set the mft record itself not in use. */
-			m->flags &= cpu_to_le16(
-					~le16_to_cpu(MFT_RECORD_IN_USE));
-			flush_dcache_page(page);
-			/* Make sure the mft record is written out to disk. */
-			mark_ntfs_record_dirty(page, ofs);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			goto undo_mftbmp_alloc;
-		}
-		BUG_ON(m != m_tmp);
-		/*
-		 * Make sure the allocated mft record is written out to disk.
-		 * No need to set the inode dirty because the caller is going
-		 * to do that anyway after finishing with the new extent mft
-		 * record (e.g. at a minimum a new attribute will be added to
-		 * the mft record.
-		 */
-		mark_ntfs_record_dirty(page, ofs);
-		unlock_page(page);
-		/*
-		 * Need to unmap the page since map_extent_mft_record() mapped
-		 * it as well so we have it mapped twice at the moment.
-		 */
-		ntfs_unmap_page(page);
-	} else {
-		/*
-		 * Allocate a new VFS inode and set it up.  NOTE: @vi->i_nlink
-		 * is set to 1 but the mft record->link_count is 0.  The caller
-		 * needs to bear this in mind.
-		 */
-		vi = new_inode(vol->sb);
-		if (unlikely(!vi)) {
-			err = -ENOMEM;
-			/* Set the mft record itself not in use. */
-			m->flags &= cpu_to_le16(
-					~le16_to_cpu(MFT_RECORD_IN_USE));
-			flush_dcache_page(page);
-			/* Make sure the mft record is written out to disk. */
-			mark_ntfs_record_dirty(page, ofs);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			goto undo_mftbmp_alloc;
-		}
-		vi->i_ino = bit;
-
-		/* The owner and group come from the ntfs volume. */
-		vi->i_uid = vol->uid;
-		vi->i_gid = vol->gid;
-
-		/* Initialize the ntfs specific part of @vi. */
-		ntfs_init_big_inode(vi);
-		ni = NTFS_I(vi);
-		/*
-		 * Set the appropriate mode, attribute type, and name.  For
-		 * directories, also setup the index values to the defaults.
-		 */
-		if (S_ISDIR(mode)) {
-			vi->i_mode = S_IFDIR | S_IRWXUGO;
-			vi->i_mode &= ~vol->dmask;
-
-			NInoSetMstProtected(ni);
-			ni->type = AT_INDEX_ALLOCATION;
-			ni->name = I30;
-			ni->name_len = 4;
-
-			ni->itype.index.block_size = 4096;
-			ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1;
-			ni->itype.index.collation_rule = COLLATION_FILE_NAME;
-			if (vol->cluster_size <= ni->itype.index.block_size) {
-				ni->itype.index.vcn_size = vol->cluster_size;
-				ni->itype.index.vcn_size_bits =
-						vol->cluster_size_bits;
-			} else {
-				ni->itype.index.vcn_size = vol->sector_size;
-				ni->itype.index.vcn_size_bits =
-						vol->sector_size_bits;
-			}
-		} else {
-			vi->i_mode = S_IFREG | S_IRWXUGO;
-			vi->i_mode &= ~vol->fmask;
-
-			ni->type = AT_DATA;
-			ni->name = NULL;
-			ni->name_len = 0;
-		}
-		if (IS_RDONLY(vi))
-			vi->i_mode &= ~S_IWUGO;
-
-		/* Set the inode times to the current time. */
-		simple_inode_init_ts(vi);
-		/*
-		 * Set the file size to 0, the ntfs inode sizes are set to 0 by
-		 * the call to ntfs_init_big_inode() below.
-		 */
-		vi->i_size = 0;
-		vi->i_blocks = 0;
-
-		/* Set the sequence number. */
-		vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
-		/*
-		 * Manually map, pin, and lock the mft record as we already
-		 * have its page mapped and it is very easy to do.
-		 */
-		atomic_inc(&ni->count);
-		mutex_lock(&ni->mrec_lock);
-		ni->page = page;
-		ni->page_ofs = ofs;
-		/*
-		 * Make sure the allocated mft record is written out to disk.
-		 * NOTE: We do not set the ntfs inode dirty because this would
-		 * fail in ntfs_write_inode() because the inode does not have a
-		 * standard information attribute yet.  Also, there is no need
-		 * to set the inode dirty because the caller is going to do
-		 * that anyway after finishing with the new mft record (e.g. at
-		 * a minimum some new attributes will be added to the mft
-		 * record.
-		 */
-		mark_ntfs_record_dirty(page, ofs);
-		unlock_page(page);
-
-		/* Add the inode to the inode hash for the superblock. */
-		insert_inode_hash(vi);
-
-		/* Update the default mft allocation position. */
-		vol->mft_data_pos = bit + 1;
-	}
-	/*
-	 * Return the opened, allocated inode of the allocated mft record as
-	 * well as the mapped, pinned, and locked mft record.
-	 */
-	ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
-			base_ni ? "extent " : "", (long long)bit);
-	*mrec = m;
-	return ni;
-undo_data_init:
-	write_lock_irqsave(&mft_ni->size_lock, flags);
-	mft_ni->initialized_size = old_data_initialized;
-	i_size_write(vol->mft_ino, old_data_size);
-	write_unlock_irqrestore(&mft_ni->size_lock, flags);
-	goto undo_mftbmp_alloc_nolock;
-undo_mftbmp_alloc:
-	down_write(&vol->mftbmp_lock);
-undo_mftbmp_alloc_nolock:
-	if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
-		ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
-		NVolSetErrors(vol);
-	}
-	up_write(&vol->mftbmp_lock);
-err_out:
-	return ERR_PTR(err);
-max_err_out:
-	ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum "
-			"number of inodes (2^32) has already been reached.");
-	up_write(&vol->mftbmp_lock);
-	return ERR_PTR(-ENOSPC);
-}
-
-/**
- * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
- * @ni:		ntfs inode of the mapped extent mft record to free
- * @m:		mapped extent mft record of the ntfs inode @ni
- *
- * Free the mapped extent mft record @m of the extent ntfs inode @ni.
- *
- * Note that this function unmaps the mft record and closes and destroys @ni
- * internally and hence you cannot use either @ni nor @m any more after this
- * function returns success.
- *
- * On success return 0 and on error return -errno.  @ni and @m are still valid
- * in this case and have not been freed.
- *
- * For some errors an error message is displayed and the success code 0 is
- * returned and the volume is then left dirty on umount.  This makes sense in
- * case we could not rollback the changes that were already done since the
- * caller no longer wants to reference this mft record so it does not matter to
- * the caller if something is wrong with it as long as it is properly detached
- * from the base inode.
- */
-int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
-{
-	unsigned long mft_no = ni->mft_no;
-	ntfs_volume *vol = ni->vol;
-	ntfs_inode *base_ni;
-	ntfs_inode **extent_nis;
-	int i, err;
-	le16 old_seq_no;
-	u16 seq_no;
-	
-	BUG_ON(NInoAttr(ni));
-	BUG_ON(ni->nr_extents != -1);
-
-	mutex_lock(&ni->extent_lock);
-	base_ni = ni->ext.base_ntfs_ino;
-	mutex_unlock(&ni->extent_lock);
-
-	BUG_ON(base_ni->nr_extents <= 0);
-
-	ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n",
-			mft_no, base_ni->mft_no);
-
-	mutex_lock(&base_ni->extent_lock);
-
-	/* Make sure we are holding the only reference to the extent inode. */
-	if (atomic_read(&ni->count) > 2) {
-		ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
-				"not freeing.", base_ni->mft_no);
-		mutex_unlock(&base_ni->extent_lock);
-		return -EBUSY;
-	}
-
-	/* Dissociate the ntfs inode from the base inode. */
-	extent_nis = base_ni->ext.extent_ntfs_inos;
-	err = -ENOENT;
-	for (i = 0; i < base_ni->nr_extents; i++) {
-		if (ni != extent_nis[i])
-			continue;
-		extent_nis += i;
-		base_ni->nr_extents--;
-		memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) *
-				sizeof(ntfs_inode*));
-		err = 0;
-		break;
-	}
-
-	mutex_unlock(&base_ni->extent_lock);
-
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
-				"its base inode 0x%lx.", mft_no,
-				base_ni->mft_no);
-		BUG();
-	}
-
-	/*
-	 * The extent inode is no longer attached to the base inode so no one
-	 * can get a reference to it any more.
-	 */
-
-	/* Mark the mft record as not in use. */
-	m->flags &= ~MFT_RECORD_IN_USE;
-
-	/* Increment the sequence number, skipping zero, if it is not zero. */
-	old_seq_no = m->sequence_number;
-	seq_no = le16_to_cpu(old_seq_no);
-	if (seq_no == 0xffff)
-		seq_no = 1;
-	else if (seq_no)
-		seq_no++;
-	m->sequence_number = cpu_to_le16(seq_no);
-
-	/*
-	 * Set the ntfs inode dirty and write it out.  We do not need to worry
-	 * about the base inode here since whatever caused the extent mft
-	 * record to be freed is guaranteed to do it already.
-	 */
-	NInoSetDirty(ni);
-	err = write_mft_record(ni, m, 0);
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not "
-				"freeing.", mft_no);
-		goto rollback;
-	}
-rollback_error:
-	/* Unmap and throw away the now freed extent inode. */
-	unmap_extent_mft_record(ni);
-	ntfs_clear_extent_inode(ni);
-
-	/* Clear the bit in the $MFT/$BITMAP corresponding to this record. */
-	down_write(&vol->mftbmp_lock);
-	err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
-	up_write(&vol->mftbmp_lock);
-	if (unlikely(err)) {
-		/*
-		 * The extent inode is gone but we failed to deallocate it in
-		 * the mft bitmap.  Just emit a warning and leave the volume
-		 * dirty on umount.
-		 */
-		ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
-		NVolSetErrors(vol);
-	}
-	return 0;
-rollback:
-	/* Rollback what we did... */
-	mutex_lock(&base_ni->extent_lock);
-	extent_nis = base_ni->ext.extent_ntfs_inos;
-	if (!(base_ni->nr_extents & 3)) {
-		int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
-
-		extent_nis = kmalloc(new_size, GFP_NOFS);
-		if (unlikely(!extent_nis)) {
-			ntfs_error(vol->sb, "Failed to allocate internal "
-					"buffer during rollback.%s", es);
-			mutex_unlock(&base_ni->extent_lock);
-			NVolSetErrors(vol);
-			goto rollback_error;
-		}
-		if (base_ni->nr_extents) {
-			BUG_ON(!base_ni->ext.extent_ntfs_inos);
-			memcpy(extent_nis, base_ni->ext.extent_ntfs_inos,
-					new_size - 4 * sizeof(ntfs_inode*));
-			kfree(base_ni->ext.extent_ntfs_inos);
-		}
-		base_ni->ext.extent_ntfs_inos = extent_nis;
-	}
-	m->flags |= MFT_RECORD_IN_USE;
-	m->sequence_number = old_seq_no;
-	extent_nis[base_ni->nr_extents++] = ni;
-	mutex_unlock(&base_ni->extent_lock);
-	mark_mft_record_dirty(ni);
-	return err;
-}
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h
deleted file mode 100644
index 49c001af16ed..000000000000
--- a/fs/ntfs/mft.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * mft.h - Defines for mft record handling in NTFS Linux kernel driver.
- *	   Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_MFT_H
-#define _LINUX_NTFS_MFT_H
-
-#include <linux/fs.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-
-#include "inode.h"
-
-extern MFT_RECORD *map_mft_record(ntfs_inode *ni);
-extern void unmap_mft_record(ntfs_inode *ni);
-
-extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
-		ntfs_inode **ntfs_ino);
-
-static inline void unmap_extent_mft_record(ntfs_inode *ni)
-{
-	unmap_mft_record(ni);
-	return;
-}
-
-#ifdef NTFS_RW
-
-/**
- * flush_dcache_mft_record_page - flush_dcache_page() for mft records
- * @ni:		ntfs inode structure of mft record
- *
- * Call flush_dcache_page() for the page in which an mft record resides.
- *
- * This must be called every time an mft record is modified, just after the
- * modification.
- */
-static inline void flush_dcache_mft_record_page(ntfs_inode *ni)
-{
-	flush_dcache_page(ni->page);
-}
-
-extern void __mark_mft_record_dirty(ntfs_inode *ni);
-
-/**
- * mark_mft_record_dirty - set the mft record and the page containing it dirty
- * @ni:		ntfs inode describing the mapped mft record
- *
- * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
- * as well as the page containing the mft record, dirty.  Also, mark the base
- * vfs inode dirty.  This ensures that any changes to the mft record are
- * written out to disk.
- *
- * NOTE:  Do not do anything if the mft record is already marked dirty.
- */
-static inline void mark_mft_record_dirty(ntfs_inode *ni)
-{
-	if (!NInoTestSetDirty(ni))
-		__mark_mft_record_dirty(ni);
-}
-
-extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
-		MFT_RECORD *m, int sync);
-
-extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync);
-
-/**
- * write_mft_record - write out a mapped (extent) mft record
- * @ni:		ntfs inode describing the mapped (extent) mft record
- * @m:		mapped (extent) mft record to write
- * @sync:	if true, wait for i/o completion
- *
- * This is just a wrapper for write_mft_record_nolock() (see mft.c), which
- * locks the page for the duration of the write.  This ensures that there are
- * no race conditions between writing the mft record via the dirty inode code
- * paths and via the page cache write back code paths or between writing
- * neighbouring mft records residing in the same page.
- *
- * Locking the page also serializes us against ->read_folio() if the page is not
- * uptodate.
- *
- * On success, clean the mft record and return 0.  On error, leave the mft
- * record dirty and return -errno.
- */
-static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
-{
-	struct page *page = ni->page;
-	int err;
-
-	BUG_ON(!page);
-	lock_page(page);
-	err = write_mft_record_nolock(ni, m, sync);
-	unlock_page(page);
-	return err;
-}
-
-extern bool ntfs_may_write_mft_record(ntfs_volume *vol,
-		const unsigned long mft_no, const MFT_RECORD *m,
-		ntfs_inode **locked_ni);
-
-extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
-		ntfs_inode *base_ni, MFT_RECORD **mrec);
-extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_MFT_H */
diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c
deleted file mode 100644
index 16b3c884abfc..000000000000
--- a/fs/ntfs/mst.c
+++ /dev/null
@@ -1,189 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * mst.c - NTFS multi sector transfer protection handling code. Part of the
- *	   Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#include "ntfs.h"
-
-/**
- * post_read_mst_fixup - deprotect multi sector transfer protected data
- * @b:		pointer to the data to deprotect
- * @size:	size in bytes of @b
- *
- * Perform the necessary post read multi sector transfer fixup and detect the
- * presence of incomplete multi sector transfers. - In that case, overwrite the
- * magic of the ntfs record header being processed with "BAAD" (in memory only!)
- * and abort processing.
- *
- * Return 0 on success and -EINVAL on error ("BAAD" magic will be present).
- *
- * NOTE: We consider the absence / invalidity of an update sequence array to
- * mean that the structure is not protected at all and hence doesn't need to
- * be fixed up. Thus, we return success and not failure in this case. This is
- * in contrast to pre_write_mst_fixup(), see below.
- */
-int post_read_mst_fixup(NTFS_RECORD *b, const u32 size)
-{
-	u16 usa_ofs, usa_count, usn;
-	u16 *usa_pos, *data_pos;
-
-	/* Setup the variables. */
-	usa_ofs = le16_to_cpu(b->usa_ofs);
-	/* Decrement usa_count to get number of fixups. */
-	usa_count = le16_to_cpu(b->usa_count) - 1;
-	/* Size and alignment checks. */
-	if ( size & (NTFS_BLOCK_SIZE - 1)	||
-	     usa_ofs & 1			||
-	     usa_ofs + (usa_count * 2) > size	||
-	     (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
-		return 0;
-	/* Position of usn in update sequence array. */
-	usa_pos = (u16*)b + usa_ofs/sizeof(u16);
-	/*
-	 * The update sequence number which has to be equal to each of the
-	 * u16 values before they are fixed up. Note no need to care for
-	 * endianness since we are comparing and moving data for on disk
-	 * structures which means the data is consistent. - If it is
-	 * consistenty the wrong endianness it doesn't make any difference.
-	 */
-	usn = *usa_pos;
-	/*
-	 * Position in protected data of first u16 that needs fixing up.
-	 */
-	data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
-	/*
-	 * Check for incomplete multi sector transfer(s).
-	 */
-	while (usa_count--) {
-		if (*data_pos != usn) {
-			/*
-			 * Incomplete multi sector transfer detected! )-:
-			 * Set the magic to "BAAD" and return failure.
-			 * Note that magic_BAAD is already converted to le32.
-			 */
-			b->magic = magic_BAAD;
-			return -EINVAL;
-		}
-		data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
-	}
-	/* Re-setup the variables. */
-	usa_count = le16_to_cpu(b->usa_count) - 1;
-	data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
-	/* Fixup all sectors. */
-	while (usa_count--) {
-		/*
-		 * Increment position in usa and restore original data from
-		 * the usa into the data buffer.
-		 */
-		*data_pos = *(++usa_pos);
-		/* Increment position in data as well. */
-		data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
-	}
-	return 0;
-}
-
-/**
- * pre_write_mst_fixup - apply multi sector transfer protection
- * @b:		pointer to the data to protect
- * @size:	size in bytes of @b
- *
- * Perform the necessary pre write multi sector transfer fixup on the data
- * pointer to by @b of @size.
- *
- * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed
- * (assumed not needed). This is in contrast to post_read_mst_fixup() above.
- *
- * NOTE: We consider the absence / invalidity of an update sequence array to
- * mean that the structure is not subject to protection and hence doesn't need
- * to be fixed up. This means that you have to create a valid update sequence
- * array header in the ntfs record before calling this function, otherwise it
- * will fail (the header needs to contain the position of the update sequence
- * array together with the number of elements in the array). You also need to
- * initialise the update sequence number before calling this function
- * otherwise a random word will be used (whatever was in the record at that
- * position at that time).
- */
-int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size)
-{
-	le16 *usa_pos, *data_pos;
-	u16 usa_ofs, usa_count, usn;
-	le16 le_usn;
-
-	/* Sanity check + only fixup if it makes sense. */
-	if (!b || ntfs_is_baad_record(b->magic) ||
-			ntfs_is_hole_record(b->magic))
-		return -EINVAL;
-	/* Setup the variables. */
-	usa_ofs = le16_to_cpu(b->usa_ofs);
-	/* Decrement usa_count to get number of fixups. */
-	usa_count = le16_to_cpu(b->usa_count) - 1;
-	/* Size and alignment checks. */
-	if ( size & (NTFS_BLOCK_SIZE - 1)	||
-	     usa_ofs & 1			||
-	     usa_ofs + (usa_count * 2) > size	||
-	     (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
-		return -EINVAL;
-	/* Position of usn in update sequence array. */
-	usa_pos = (le16*)((u8*)b + usa_ofs);
-	/*
-	 * Cyclically increment the update sequence number
-	 * (skipping 0 and -1, i.e. 0xffff).
-	 */
-	usn = le16_to_cpup(usa_pos) + 1;
-	if (usn == 0xffff || !usn)
-		usn = 1;
-	le_usn = cpu_to_le16(usn);
-	*usa_pos = le_usn;
-	/* Position in data of first u16 that needs fixing up. */
-	data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
-	/* Fixup all sectors. */
-	while (usa_count--) {
-		/*
-		 * Increment the position in the usa and save the
-		 * original data from the data buffer into the usa.
-		 */
-		*(++usa_pos) = *data_pos;
-		/* Apply fixup to data. */
-		*data_pos = le_usn;
-		/* Increment position in data as well. */
-		data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
-	}
-	return 0;
-}
-
-/**
- * post_write_mst_fixup - fast deprotect multi sector transfer protected data
- * @b:		pointer to the data to deprotect
- *
- * Perform the necessary post write multi sector transfer fixup, not checking
- * for any errors, because we assume we have just used pre_write_mst_fixup(),
- * thus the data will be fine or we would never have gotten here.
- */
-void post_write_mst_fixup(NTFS_RECORD *b)
-{
-	le16 *usa_pos, *data_pos;
-
-	u16 usa_ofs = le16_to_cpu(b->usa_ofs);
-	u16 usa_count = le16_to_cpu(b->usa_count) - 1;
-
-	/* Position of usn in update sequence array. */
-	usa_pos = (le16*)b + usa_ofs/sizeof(le16);
-
-	/* Position in protected data of first u16 that needs fixing up. */
-	data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
-
-	/* Fixup all sectors. */
-	while (usa_count--) {
-		/*
-		 * Increment position in usa and restore original data from
-		 * the usa into the data buffer.
-		 */
-		*data_pos = *(++usa_pos);
-
-		/* Increment position in data as well. */
-		data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
-	}
-}
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
deleted file mode 100644
index d7498ddc4a72..000000000000
--- a/fs/ntfs/namei.c
+++ /dev/null
@@ -1,392 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS
- *	     project.
- *
- * Copyright (c) 2001-2006 Anton Altaparmakov
- */
-
-#include <linux/dcache.h>
-#include <linux/exportfs.h>
-#include <linux/security.h>
-#include <linux/slab.h>
-
-#include "attrib.h"
-#include "debug.h"
-#include "dir.h"
-#include "mft.h"
-#include "ntfs.h"
-
-/**
- * ntfs_lookup - find the inode represented by a dentry in a directory inode
- * @dir_ino:	directory inode in which to look for the inode
- * @dent:	dentry representing the inode to look for
- * @flags:	lookup flags
- *
- * In short, ntfs_lookup() looks for the inode represented by the dentry @dent
- * in the directory inode @dir_ino and if found attaches the inode to the
- * dentry @dent.
- *
- * In more detail, the dentry @dent specifies which inode to look for by
- * supplying the name of the inode in @dent->d_name.name. ntfs_lookup()
- * converts the name to Unicode and walks the contents of the directory inode
- * @dir_ino looking for the converted Unicode name. If the name is found in the
- * directory, the corresponding inode is loaded by calling ntfs_iget() on its
- * inode number and the inode is associated with the dentry @dent via a call to
- * d_splice_alias().
- *
- * If the name is not found in the directory, a NULL inode is inserted into the
- * dentry @dent via a call to d_add(). The dentry is then termed a negative
- * dentry.
- *
- * Only if an actual error occurs, do we return an error via ERR_PTR().
- *
- * In order to handle the case insensitivity issues of NTFS with regards to the
- * dcache and the dcache requiring only one dentry per directory, we deal with
- * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining
- * a case sensitive dcache. This means that we get the full benefit of dcache
- * speed when the file/directory is looked up with the same case as returned by
- * ->ntfs_readdir() but that a lookup for any other case (or for the short file
- * name) will not find anything in dcache and will enter ->ntfs_lookup()
- * instead, where we search the directory for a fully matching file name
- * (including case) and if that is not found, we search for a file name that
- * matches with different case and if that has non-POSIX semantics we return
- * that. We actually do only one search (case sensitive) and keep tabs on
- * whether we have found a case insensitive match in the process.
- *
- * To simplify matters for us, we do not treat the short vs long filenames as
- * two hard links but instead if the lookup matches a short filename, we
- * return the dentry for the corresponding long filename instead.
- *
- * There are three cases we need to distinguish here:
- *
- * 1) @dent perfectly matches (i.e. including case) a directory entry with a
- *    file name in the WIN32 or POSIX namespaces. In this case
- *    ntfs_lookup_inode_by_name() will return with name set to NULL and we
- *    just d_splice_alias() @dent.
- * 2) @dent matches (not including case) a directory entry with a file name in
- *    the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return
- *    with name set to point to a kmalloc()ed ntfs_name structure containing
- *    the properly cased little endian Unicode name. We convert the name to the
- *    current NLS code page, search if a dentry with this name already exists
- *    and if so return that instead of @dent.  At this point things are
- *    complicated by the possibility of 'disconnected' dentries due to NFS
- *    which we deal with appropriately (see the code comments).  The VFS will
- *    then destroy the old @dent and use the one we returned.  If a dentry is
- *    not found, we allocate a new one, d_splice_alias() it, and return it as
- *    above.
- * 3) @dent matches either perfectly or not (i.e. we don't care about case) a
- *    directory entry with a file name in the DOS namespace. In this case
- *    ntfs_lookup_inode_by_name() will return with name set to point to a
- *    kmalloc()ed ntfs_name structure containing the mft reference (cpu endian)
- *    of the inode. We use the mft reference to read the inode and to find the
- *    file name in the WIN32 namespace corresponding to the matched short file
- *    name. We then convert the name to the current NLS code page, and proceed
- *    searching for a dentry with this name, etc, as in case 2), above.
- *
- * Locking: Caller must hold i_mutex on the directory.
- */
-static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
-		unsigned int flags)
-{
-	ntfs_volume *vol = NTFS_SB(dir_ino->i_sb);
-	struct inode *dent_inode;
-	ntfschar *uname;
-	ntfs_name *name = NULL;
-	MFT_REF mref;
-	unsigned long dent_ino;
-	int uname_len;
-
-	ntfs_debug("Looking up %pd in directory inode 0x%lx.",
-			dent, dir_ino->i_ino);
-	/* Convert the name of the dentry to Unicode. */
-	uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len,
-			&uname);
-	if (uname_len < 0) {
-		if (uname_len != -ENAMETOOLONG)
-			ntfs_error(vol->sb, "Failed to convert name to "
-					"Unicode.");
-		return ERR_PTR(uname_len);
-	}
-	mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len,
-			&name);
-	kmem_cache_free(ntfs_name_cache, uname);
-	if (!IS_ERR_MREF(mref)) {
-		dent_ino = MREF(mref);
-		ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino);
-		dent_inode = ntfs_iget(vol->sb, dent_ino);
-		if (!IS_ERR(dent_inode)) {
-			/* Consistency check. */
-			if (is_bad_inode(dent_inode) || MSEQNO(mref) ==
-					NTFS_I(dent_inode)->seq_no ||
-					dent_ino == FILE_MFT) {
-				/* Perfect WIN32/POSIX match. -- Case 1. */
-				if (!name) {
-					ntfs_debug("Done.  (Case 1.)");
-					return d_splice_alias(dent_inode, dent);
-				}
-				/*
-				 * We are too indented.  Handle imperfect
-				 * matches and short file names further below.
-				 */
-				goto handle_name;
-			}
-			ntfs_error(vol->sb, "Found stale reference to inode "
-					"0x%lx (reference sequence number = "
-					"0x%x, inode sequence number = 0x%x), "
-					"returning -EIO. Run chkdsk.",
-					dent_ino, MSEQNO(mref),
-					NTFS_I(dent_inode)->seq_no);
-			iput(dent_inode);
-			dent_inode = ERR_PTR(-EIO);
-		} else
-			ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with "
-					"error code %li.", dent_ino,
-					PTR_ERR(dent_inode));
-		kfree(name);
-		/* Return the error code. */
-		return ERR_CAST(dent_inode);
-	}
-	/* It is guaranteed that @name is no longer allocated at this point. */
-	if (MREF_ERR(mref) == -ENOENT) {
-		ntfs_debug("Entry was not found, adding negative dentry.");
-		/* The dcache will handle negative entries. */
-		d_add(dent, NULL);
-		ntfs_debug("Done.");
-		return NULL;
-	}
-	ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error "
-			"code %i.", -MREF_ERR(mref));
-	return ERR_PTR(MREF_ERR(mref));
-	// TODO: Consider moving this lot to a separate function! (AIA)
-handle_name:
-   {
-	MFT_RECORD *m;
-	ntfs_attr_search_ctx *ctx;
-	ntfs_inode *ni = NTFS_I(dent_inode);
-	int err;
-	struct qstr nls_name;
-
-	nls_name.name = NULL;
-	if (name->type != FILE_NAME_DOS) {			/* Case 2. */
-		ntfs_debug("Case 2.");
-		nls_name.len = (unsigned)ntfs_ucstonls(vol,
-				(ntfschar*)&name->name, name->len,
-				(unsigned char**)&nls_name.name, 0);
-		kfree(name);
-	} else /* if (name->type == FILE_NAME_DOS) */ {		/* Case 3. */
-		FILE_NAME_ATTR *fn;
-
-		ntfs_debug("Case 3.");
-		kfree(name);
-
-		/* Find the WIN32 name corresponding to the matched DOS name. */
-		ni = NTFS_I(dent_inode);
-		m = map_mft_record(ni);
-		if (IS_ERR(m)) {
-			err = PTR_ERR(m);
-			m = NULL;
-			ctx = NULL;
-			goto err_out;
-		}
-		ctx = ntfs_attr_get_search_ctx(ni, m);
-		if (unlikely(!ctx)) {
-			err = -ENOMEM;
-			goto err_out;
-		}
-		do {
-			ATTR_RECORD *a;
-			u32 val_len;
-
-			err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0,
-					NULL, 0, ctx);
-			if (unlikely(err)) {
-				ntfs_error(vol->sb, "Inode corrupt: No WIN32 "
-						"namespace counterpart to DOS "
-						"file name. Run chkdsk.");
-				if (err == -ENOENT)
-					err = -EIO;
-				goto err_out;
-			}
-			/* Consistency checks. */
-			a = ctx->attr;
-			if (a->non_resident || a->flags)
-				goto eio_err_out;
-			val_len = le32_to_cpu(a->data.resident.value_length);
-			if (le16_to_cpu(a->data.resident.value_offset) +
-					val_len > le32_to_cpu(a->length))
-				goto eio_err_out;
-			fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu(
-					ctx->attr->data.resident.value_offset));
-			if ((u32)(fn->file_name_length * sizeof(ntfschar) +
-					sizeof(FILE_NAME_ATTR)) > val_len)
-				goto eio_err_out;
-		} while (fn->file_name_type != FILE_NAME_WIN32);
-
-		/* Convert the found WIN32 name to current NLS code page. */
-		nls_name.len = (unsigned)ntfs_ucstonls(vol,
-				(ntfschar*)&fn->file_name, fn->file_name_length,
-				(unsigned char**)&nls_name.name, 0);
-
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(ni);
-	}
-	m = NULL;
-	ctx = NULL;
-
-	/* Check if a conversion error occurred. */
-	if ((signed)nls_name.len < 0) {
-		err = (signed)nls_name.len;
-		goto err_out;
-	}
-	nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len);
-
-	dent = d_add_ci(dent, dent_inode, &nls_name);
-	kfree(nls_name.name);
-	return dent;
-
-eio_err_out:
-	ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
-	err = -EIO;
-err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(ni);
-	iput(dent_inode);
-	ntfs_error(vol->sb, "Failed, returning error code %i.", err);
-	return ERR_PTR(err);
-   }
-}
-
-/*
- * Inode operations for directories.
- */
-const struct inode_operations ntfs_dir_inode_ops = {
-	.lookup	= ntfs_lookup,	/* VFS: Lookup directory. */
-};
-
-/**
- * ntfs_get_parent - find the dentry of the parent of a given directory dentry
- * @child_dent:		dentry of the directory whose parent directory to find
- *
- * Find the dentry for the parent directory of the directory specified by the
- * dentry @child_dent.  This function is called from
- * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the
- * default ->decode_fh() which is export_decode_fh() in the same file.
- *
- * The code is based on the ext3 ->get_parent() implementation found in
- * fs/ext3/namei.c::ext3_get_parent().
- *
- * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down.
- *
- * Return the dentry of the parent directory on success or the error code on
- * error (IS_ERR() is true).
- */
-static struct dentry *ntfs_get_parent(struct dentry *child_dent)
-{
-	struct inode *vi = d_inode(child_dent);
-	ntfs_inode *ni = NTFS_I(vi);
-	MFT_RECORD *mrec;
-	ntfs_attr_search_ctx *ctx;
-	ATTR_RECORD *attr;
-	FILE_NAME_ATTR *fn;
-	unsigned long parent_ino;
-	int err;
-
-	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
-	/* Get the mft record of the inode belonging to the child dentry. */
-	mrec = map_mft_record(ni);
-	if (IS_ERR(mrec))
-		return ERR_CAST(mrec);
-	/* Find the first file name attribute in the mft record. */
-	ctx = ntfs_attr_get_search_ctx(ni, mrec);
-	if (unlikely(!ctx)) {
-		unmap_mft_record(ni);
-		return ERR_PTR(-ENOMEM);
-	}
-try_next:
-	err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL,
-			0, ctx);
-	if (unlikely(err)) {
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(ni);
-		if (err == -ENOENT)
-			ntfs_error(vi->i_sb, "Inode 0x%lx does not have a "
-					"file name attribute.  Run chkdsk.",
-					vi->i_ino);
-		return ERR_PTR(err);
-	}
-	attr = ctx->attr;
-	if (unlikely(attr->non_resident))
-		goto try_next;
-	fn = (FILE_NAME_ATTR *)((u8 *)attr +
-			le16_to_cpu(attr->data.resident.value_offset));
-	if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) >
-			(u8*)attr + le32_to_cpu(attr->length)))
-		goto try_next;
-	/* Get the inode number of the parent directory. */
-	parent_ino = MREF_LE(fn->parent_directory);
-	/* Release the search context and the mft record of the child. */
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(ni);
-
-	return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino));
-}
-
-static struct inode *ntfs_nfs_get_inode(struct super_block *sb,
-		u64 ino, u32 generation)
-{
-	struct inode *inode;
-
-	inode = ntfs_iget(sb, ino);
-	if (!IS_ERR(inode)) {
-		if (is_bad_inode(inode) || inode->i_generation != generation) {
-			iput(inode);
-			inode = ERR_PTR(-ESTALE);
-		}
-	}
-
-	return inode;
-}
-
-static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
-{
-	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
-				    ntfs_nfs_get_inode);
-}
-
-static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
-{
-	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
-				    ntfs_nfs_get_inode);
-}
-
-/*
- * Export operations allowing NFS exporting of mounted NTFS partitions.
- *
- * We use the default ->encode_fh() for now.  Note that they
- * use 32 bits to store the inode number which is an unsigned long so on 64-bit
- * architectures is usually 64 bits so it would all fail horribly on huge
- * volumes.  I guess we need to define our own encode and decode fh functions
- * that store 64-bit inode numbers at some point but for now we will ignore the
- * problem...
- *
- * We also use the default ->get_name() helper (used by ->decode_fh() via
- * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs
- * independent.
- *
- * The default ->get_parent() just returns -EACCES so we have to provide our
- * own and the default ->get_dentry() is incompatible with NTFS due to not
- * allowing the inode number 0 which is used in NTFS for the system file $MFT
- * and due to using iget() whereas NTFS needs ntfs_iget().
- */
-const struct export_operations ntfs_export_ops = {
-	.encode_fh	= generic_encode_ino32_fh,
-	.get_parent	= ntfs_get_parent,	/* Find the parent of a given
-						   directory. */
-	.fh_to_dentry	= ntfs_fh_to_dentry,
-	.fh_to_parent	= ntfs_fh_to_parent,
-};
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
deleted file mode 100644
index e81376ea9152..000000000000
--- a/fs/ntfs/ntfs.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * ntfs.h - Defines for NTFS Linux kernel driver.
- *
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
- * Copyright (C) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_H
-#define _LINUX_NTFS_H
-
-#include <linux/stddef.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
-#include <linux/nls.h>
-#include <linux/smp.h>
-#include <linux/pagemap.h>
-
-#include "types.h"
-#include "volume.h"
-#include "layout.h"
-
-typedef enum {
-	NTFS_BLOCK_SIZE		= 512,
-	NTFS_BLOCK_SIZE_BITS	= 9,
-	NTFS_SB_MAGIC		= 0x5346544e,	/* 'NTFS' */
-	NTFS_MAX_NAME_LEN	= 255,
-	NTFS_MAX_ATTR_NAME_LEN	= 255,
-	NTFS_MAX_CLUSTER_SIZE	= 64 * 1024,	/* 64kiB */
-	NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE,
-} NTFS_CONSTANTS;
-
-/* Global variables. */
-
-/* Slab caches (from super.c). */
-extern struct kmem_cache *ntfs_name_cache;
-extern struct kmem_cache *ntfs_inode_cache;
-extern struct kmem_cache *ntfs_big_inode_cache;
-extern struct kmem_cache *ntfs_attr_ctx_cache;
-extern struct kmem_cache *ntfs_index_ctx_cache;
-
-/* The various operations structs defined throughout the driver files. */
-extern const struct address_space_operations ntfs_normal_aops;
-extern const struct address_space_operations ntfs_compressed_aops;
-extern const struct address_space_operations ntfs_mst_aops;
-
-extern const struct  file_operations ntfs_file_ops;
-extern const struct inode_operations ntfs_file_inode_ops;
-
-extern const struct  file_operations ntfs_dir_ops;
-extern const struct inode_operations ntfs_dir_inode_ops;
-
-extern const struct  file_operations ntfs_empty_file_ops;
-extern const struct inode_operations ntfs_empty_inode_ops;
-
-extern const struct export_operations ntfs_export_ops;
-
-/**
- * NTFS_SB - return the ntfs volume given a vfs super block
- * @sb:		VFS super block
- *
- * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb.
- */
-static inline ntfs_volume *NTFS_SB(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-/* Declarations of functions and global variables. */
-
-/* From fs/ntfs/compress.c */
-extern int ntfs_read_compressed_block(struct page *page);
-extern int allocate_compression_buffers(void);
-extern void free_compression_buffers(void);
-
-/* From fs/ntfs/super.c */
-#define default_upcase_len 0x10000
-extern struct mutex ntfs_lock;
-
-typedef struct {
-	int val;
-	char *str;
-} option_t;
-extern const option_t on_errors_arr[];
-
-/* From fs/ntfs/mst.c */
-extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size);
-extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size);
-extern void post_write_mst_fixup(NTFS_RECORD *b);
-
-/* From fs/ntfs/unistr.c */
-extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
-		const ntfschar *s2, size_t s2_len,
-		const IGNORE_CASE_BOOL ic,
-		const ntfschar *upcase, const u32 upcase_size);
-extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
-		const ntfschar *name2, const u32 name2_len,
-		const int err_val, const IGNORE_CASE_BOOL ic,
-		const ntfschar *upcase, const u32 upcase_len);
-extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n);
-extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
-		const ntfschar *upcase, const u32 upcase_size);
-extern void ntfs_upcase_name(ntfschar *name, u32 name_len,
-		const ntfschar *upcase, const u32 upcase_len);
-extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
-		const ntfschar *upcase, const u32 upcase_len);
-extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
-		FILE_NAME_ATTR *file_name_attr2,
-		const int err_val, const IGNORE_CASE_BOOL ic,
-		const ntfschar *upcase, const u32 upcase_len);
-extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
-		const int ins_len, ntfschar **outs);
-extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
-		const int ins_len, unsigned char **outs, int outs_len);
-
-/* From fs/ntfs/upcase.c */
-extern ntfschar *generate_default_upcase(void);
-
-static inline int ntfs_ffs(int x)
-{
-	int r = 1;
-
-	if (!x)
-		return 0;
-	if (!(x & 0xffff)) {
-		x >>= 16;
-		r += 16;
-	}
-	if (!(x & 0xff)) {
-		x >>= 8;
-		r += 8;
-	}
-	if (!(x & 0xf)) {
-		x >>= 4;
-		r += 4;
-	}
-	if (!(x & 3)) {
-		x >>= 2;
-		r += 2;
-	}
-	if (!(x & 1)) {
-		x >>= 1;
-		r += 1;
-	}
-	return r;
-}
-
-#endif /* _LINUX_NTFS_H */
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
deleted file mode 100644
index 9160480222fd..000000000000
--- a/fs/ntfs/quota.c
+++ /dev/null
@@ -1,103 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * quota.c - NTFS kernel quota ($Quota) handling.  Part of the Linux-NTFS
- *	     project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include "index.h"
-#include "quota.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/**
- * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume
- * @vol:	ntfs volume on which to mark the quotas out of date
- *
- * Mark the quotas out of date on the ntfs volume @vol and return 'true' on
- * success and 'false' on error.
- */
-bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
-{
-	ntfs_index_context *ictx;
-	QUOTA_CONTROL_ENTRY *qce;
-	const le32 qid = QUOTA_DEFAULTS_ID;
-	int err;
-
-	ntfs_debug("Entering.");
-	if (NVolQuotaOutOfDate(vol))
-		goto done;
-	if (!vol->quota_ino || !vol->quota_q_ino) {
-		ntfs_error(vol->sb, "Quota inodes are not open.");
-		return false;
-	}
-	inode_lock(vol->quota_q_ino);
-	ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
-	if (!ictx) {
-		ntfs_error(vol->sb, "Failed to get index context.");
-		goto err_out;
-	}
-	err = ntfs_index_lookup(&qid, sizeof(qid), ictx);
-	if (err) {
-		if (err == -ENOENT)
-			ntfs_error(vol->sb, "Quota defaults entry is not "
-					"present.");
-		else
-			ntfs_error(vol->sb, "Lookup of quota defaults entry "
-					"failed.");
-		goto err_out;
-	}
-	if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) {
-		ntfs_error(vol->sb, "Quota defaults entry size is invalid.  "
-				"Run chkdsk.");
-		goto err_out;
-	}
-	qce = (QUOTA_CONTROL_ENTRY*)ictx->data;
-	if (le32_to_cpu(qce->version) != QUOTA_VERSION) {
-		ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not "
-				"supported.", le32_to_cpu(qce->version));
-		goto err_out;
-	}
-	ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags));
-	/* If quotas are already marked out of date, no need to do anything. */
-	if (qce->flags & QUOTA_FLAG_OUT_OF_DATE)
-		goto set_done;
-	/*
-	 * If quota tracking is neither requested, nor enabled and there are no
-	 * pending deletes, no need to mark the quotas out of date.
-	 */
-	if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED |
-			QUOTA_FLAG_TRACKING_REQUESTED |
-			QUOTA_FLAG_PENDING_DELETES)))
-		goto set_done;
-	/*
-	 * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date.
-	 * This is verified on WinXP to be sufficient to cause windows to
-	 * rescan the volume on boot and update all quota entries.
-	 */
-	qce->flags |= QUOTA_FLAG_OUT_OF_DATE;
-	/* Ensure the modified flags are written to disk. */
-	ntfs_index_entry_flush_dcache_page(ictx);
-	ntfs_index_entry_mark_dirty(ictx);
-set_done:
-	ntfs_index_ctx_put(ictx);
-	inode_unlock(vol->quota_q_ino);
-	/*
-	 * We set the flag so we do not try to mark the quotas out of date
-	 * again on remount.
-	 */
-	NVolSetQuotaOutOfDate(vol);
-done:
-	ntfs_debug("Done.");
-	return true;
-err_out:
-	if (ictx)
-		ntfs_index_ctx_put(ictx);
-	inode_unlock(vol->quota_q_ino);
-	return false;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h
deleted file mode 100644
index fe3132a3d6d2..000000000000
--- a/fs/ntfs/quota.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * quota.h - Defines for NTFS kernel quota ($Quota) handling.  Part of the
- *	     Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_QUOTA_H
-#define _LINUX_NTFS_QUOTA_H
-
-#ifdef NTFS_RW
-
-#include "types.h"
-#include "volume.h"
-
-extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_QUOTA_H */
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
deleted file mode 100644
index 0d448e9881f7..000000000000
--- a/fs/ntfs/runlist.c
+++ /dev/null
@@ -1,1893 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * runlist.c - NTFS runlist handling code.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (c) 2002-2005 Richard Russon
- */
-
-#include "debug.h"
-#include "dir.h"
-#include "endian.h"
-#include "malloc.h"
-#include "ntfs.h"
-
-/**
- * ntfs_rl_mm - runlist memmove
- *
- * It is up to the caller to serialize access to the runlist @base.
- */
-static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
-		int size)
-{
-	if (likely((dst != src) && (size > 0)))
-		memmove(base + dst, base + src, size * sizeof(*base));
-}
-
-/**
- * ntfs_rl_mc - runlist memory copy
- *
- * It is up to the caller to serialize access to the runlists @dstbase and
- * @srcbase.
- */
-static inline void ntfs_rl_mc(runlist_element *dstbase, int dst,
-		runlist_element *srcbase, int src, int size)
-{
-	if (likely(size > 0))
-		memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase));
-}
-
-/**
- * ntfs_rl_realloc - Reallocate memory for runlists
- * @rl:		original runlist
- * @old_size:	number of runlist elements in the original runlist @rl
- * @new_size:	number of runlist elements we need space for
- *
- * As the runlists grow, more memory will be required.  To prevent the
- * kernel having to allocate and reallocate large numbers of small bits of
- * memory, this function returns an entire page of memory.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * N.B.  If the new allocation doesn't require a different number of pages in
- *       memory, the function will return the original pointer.
- *
- * On success, return a pointer to the newly allocated, or recycled, memory.
- * On error, return -errno. The following error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
-		int old_size, int new_size)
-{
-	runlist_element *new_rl;
-
-	old_size = PAGE_ALIGN(old_size * sizeof(*rl));
-	new_size = PAGE_ALIGN(new_size * sizeof(*rl));
-	if (old_size == new_size)
-		return rl;
-
-	new_rl = ntfs_malloc_nofs(new_size);
-	if (unlikely(!new_rl))
-		return ERR_PTR(-ENOMEM);
-
-	if (likely(rl != NULL)) {
-		if (unlikely(old_size > new_size))
-			old_size = new_size;
-		memcpy(new_rl, rl, old_size);
-		ntfs_free(rl);
-	}
-	return new_rl;
-}
-
-/**
- * ntfs_rl_realloc_nofail - Reallocate memory for runlists
- * @rl:		original runlist
- * @old_size:	number of runlist elements in the original runlist @rl
- * @new_size:	number of runlist elements we need space for
- *
- * As the runlists grow, more memory will be required.  To prevent the
- * kernel having to allocate and reallocate large numbers of small bits of
- * memory, this function returns an entire page of memory.
- *
- * This function guarantees that the allocation will succeed.  It will sleep
- * for as long as it takes to complete the allocation.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * N.B.  If the new allocation doesn't require a different number of pages in
- *       memory, the function will return the original pointer.
- *
- * On success, return a pointer to the newly allocated, or recycled, memory.
- * On error, return -errno. The following error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl,
-		int old_size, int new_size)
-{
-	runlist_element *new_rl;
-
-	old_size = PAGE_ALIGN(old_size * sizeof(*rl));
-	new_size = PAGE_ALIGN(new_size * sizeof(*rl));
-	if (old_size == new_size)
-		return rl;
-
-	new_rl = ntfs_malloc_nofs_nofail(new_size);
-	BUG_ON(!new_rl);
-
-	if (likely(rl != NULL)) {
-		if (unlikely(old_size > new_size))
-			old_size = new_size;
-		memcpy(new_rl, rl, old_size);
-		ntfs_free(rl);
-	}
-	return new_rl;
-}
-
-/**
- * ntfs_are_rl_mergeable - test if two runlists can be joined together
- * @dst:	original runlist
- * @src:	new runlist to test for mergeability with @dst
- *
- * Test if two runlists can be joined together. For this, their VCNs and LCNs
- * must be adjacent.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * Return: true   Success, the runlists can be merged.
- *	   false  Failure, the runlists cannot be merged.
- */
-static inline bool ntfs_are_rl_mergeable(runlist_element *dst,
-		runlist_element *src)
-{
-	BUG_ON(!dst);
-	BUG_ON(!src);
-
-	/* We can merge unmapped regions even if they are misaligned. */
-	if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED))
-		return true;
-	/* If the runs are misaligned, we cannot merge them. */
-	if ((dst->vcn + dst->length) != src->vcn)
-		return false;
-	/* If both runs are non-sparse and contiguous, we can merge them. */
-	if ((dst->lcn >= 0) && (src->lcn >= 0) &&
-			((dst->lcn + dst->length) == src->lcn))
-		return true;
-	/* If we are merging two holes, we can merge them. */
-	if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE))
-		return true;
-	/* Cannot merge. */
-	return false;
-}
-
-/**
- * __ntfs_rl_merge - merge two runlists without testing if they can be merged
- * @dst:	original, destination runlist
- * @src:	new runlist to merge with @dst
- *
- * Merge the two runlists, writing into the destination runlist @dst. The
- * caller must make sure the runlists can be merged or this will corrupt the
- * destination runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- */
-static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src)
-{
-	dst->length += src->length;
-}
-
-/**
- * ntfs_rl_append - append a runlist after a given element
- * @dst:	original runlist to be worked on
- * @dsize:	number of elements in @dst (including end marker)
- * @src:	runlist to be inserted into @dst
- * @ssize:	number of elements in @src (excluding end marker)
- * @loc:	append the new runlist @src after this element in @dst
- *
- * Append the runlist @src after element @loc in @dst.  Merge the right end of
- * the new runlist, if necessary. Adjust the size of the hole before the
- * appended runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_append(runlist_element *dst,
-		int dsize, runlist_element *src, int ssize, int loc)
-{
-	bool right = false;	/* Right end of @src needs merging. */
-	int marker;		/* End of the inserted runs. */
-
-	BUG_ON(!dst);
-	BUG_ON(!src);
-
-	/* First, check if the right hand end needs merging. */
-	if ((loc + 1) < dsize)
-		right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
-
-	/* Space required: @dst size + @src size, less one if we merged. */
-	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right);
-	if (IS_ERR(dst))
-		return dst;
-	/*
-	 * We are guaranteed to succeed from here so can start modifying the
-	 * original runlists.
-	 */
-
-	/* First, merge the right hand end, if necessary. */
-	if (right)
-		__ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
-
-	/* First run after the @src runs that have been inserted. */
-	marker = loc + ssize + 1;
-
-	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right));
-	ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
-
-	/* Adjust the size of the preceding hole. */
-	dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
-
-	/* We may have changed the length of the file, so fix the end marker */
-	if (dst[marker].lcn == LCN_ENOENT)
-		dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
-
-	return dst;
-}
-
-/**
- * ntfs_rl_insert - insert a runlist into another
- * @dst:	original runlist to be worked on
- * @dsize:	number of elements in @dst (including end marker)
- * @src:	new runlist to be inserted
- * @ssize:	number of elements in @src (excluding end marker)
- * @loc:	insert the new runlist @src before this element in @dst
- *
- * Insert the runlist @src before element @loc in the runlist @dst. Merge the
- * left end of the new runlist, if necessary. Adjust the size of the hole
- * after the inserted runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
-		int dsize, runlist_element *src, int ssize, int loc)
-{
-	bool left = false;	/* Left end of @src needs merging. */
-	bool disc = false;	/* Discontinuity between @dst and @src. */
-	int marker;		/* End of the inserted runs. */
-
-	BUG_ON(!dst);
-	BUG_ON(!src);
-
-	/*
-	 * disc => Discontinuity between the end of @dst and the start of @src.
-	 *	   This means we might need to insert a "not mapped" run.
-	 */
-	if (loc == 0)
-		disc = (src[0].vcn > 0);
-	else {
-		s64 merged_length;
-
-		left = ntfs_are_rl_mergeable(dst + loc - 1, src);
-
-		merged_length = dst[loc - 1].length;
-		if (left)
-			merged_length += src->length;
-
-		disc = (src[0].vcn > dst[loc - 1].vcn + merged_length);
-	}
-	/*
-	 * Space required: @dst size + @src size, less one if we merged, plus
-	 * one if there was a discontinuity.
-	 */
-	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc);
-	if (IS_ERR(dst))
-		return dst;
-	/*
-	 * We are guaranteed to succeed from here so can start modifying the
-	 * original runlist.
-	 */
-	if (left)
-		__ntfs_rl_merge(dst + loc - 1, src);
-	/*
-	 * First run after the @src runs that have been inserted.
-	 * Nominally,  @marker equals @loc + @ssize, i.e. location + number of
-	 * runs in @src.  However, if @left, then the first run in @src has
-	 * been merged with one in @dst.  And if @disc, then @dst and @src do
-	 * not meet and we need an extra run to fill the gap.
-	 */
-	marker = loc + ssize - left + disc;
-
-	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, marker, loc, dsize - loc);
-	ntfs_rl_mc(dst, loc + disc, src, left, ssize - left);
-
-	/* Adjust the VCN of the first run after the insertion... */
-	dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
-	/* ... and the length. */
-	if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED)
-		dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn;
-
-	/* Writing beyond the end of the file and there is a discontinuity. */
-	if (disc) {
-		if (loc > 0) {
-			dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length;
-			dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
-		} else {
-			dst[loc].vcn = 0;
-			dst[loc].length = dst[loc + 1].vcn;
-		}
-		dst[loc].lcn = LCN_RL_NOT_MAPPED;
-	}
-	return dst;
-}
-
-/**
- * ntfs_rl_replace - overwrite a runlist element with another runlist
- * @dst:	original runlist to be worked on
- * @dsize:	number of elements in @dst (including end marker)
- * @src:	new runlist to be inserted
- * @ssize:	number of elements in @src (excluding end marker)
- * @loc:	index in runlist @dst to overwrite with @src
- *
- * Replace the runlist element @dst at @loc with @src. Merge the left and
- * right ends of the inserted runlist, if necessary.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
-		int dsize, runlist_element *src, int ssize, int loc)
-{
-	signed delta;
-	bool left = false;	/* Left end of @src needs merging. */
-	bool right = false;	/* Right end of @src needs merging. */
-	int tail;		/* Start of tail of @dst. */
-	int marker;		/* End of the inserted runs. */
-
-	BUG_ON(!dst);
-	BUG_ON(!src);
-
-	/* First, see if the left and right ends need merging. */
-	if ((loc + 1) < dsize)
-		right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
-	if (loc > 0)
-		left = ntfs_are_rl_mergeable(dst + loc - 1, src);
-	/*
-	 * Allocate some space.  We will need less if the left, right, or both
-	 * ends get merged.  The -1 accounts for the run being replaced.
-	 */
-	delta = ssize - 1 - left - right;
-	if (delta > 0) {
-		dst = ntfs_rl_realloc(dst, dsize, dsize + delta);
-		if (IS_ERR(dst))
-			return dst;
-	}
-	/*
-	 * We are guaranteed to succeed from here so can start modifying the
-	 * original runlists.
-	 */
-
-	/* First, merge the left and right ends, if necessary. */
-	if (right)
-		__ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
-	if (left)
-		__ntfs_rl_merge(dst + loc - 1, src);
-	/*
-	 * Offset of the tail of @dst.  This needs to be moved out of the way
-	 * to make space for the runs to be copied from @src, i.e. the first
-	 * run of the tail of @dst.
-	 * Nominally, @tail equals @loc + 1, i.e. location, skipping the
-	 * replaced run.  However, if @right, then one of @dst's runs is
-	 * already merged into @src.
-	 */
-	tail = loc + right + 1;
-	/*
-	 * First run after the @src runs that have been inserted, i.e. where
-	 * the tail of @dst needs to be moved to.
-	 * Nominally, @marker equals @loc + @ssize, i.e. location + number of
-	 * runs in @src.  However, if @left, then the first run in @src has
-	 * been merged with one in @dst.
-	 */
-	marker = loc + ssize - left;
-
-	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, marker, tail, dsize - tail);
-	ntfs_rl_mc(dst, loc, src, left, ssize - left);
-
-	/* We may have changed the length of the file, so fix the end marker. */
-	if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT)
-		dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
-	return dst;
-}
-
-/**
- * ntfs_rl_split - insert a runlist into the centre of a hole
- * @dst:	original runlist to be worked on
- * @dsize:	number of elements in @dst (including end marker)
- * @src:	new runlist to be inserted
- * @ssize:	number of elements in @src (excluding end marker)
- * @loc:	index in runlist @dst at which to split and insert @src
- *
- * Split the runlist @dst at @loc into two and insert @new in between the two
- * fragments. No merging of runlists is necessary. Adjust the size of the
- * holes either side.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize,
-		runlist_element *src, int ssize, int loc)
-{
-	BUG_ON(!dst);
-	BUG_ON(!src);
-
-	/* Space required: @dst size + @src size + one new hole. */
-	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1);
-	if (IS_ERR(dst))
-		return dst;
-	/*
-	 * We are guaranteed to succeed from here so can start modifying the
-	 * original runlists.
-	 */
-
-	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc);
-	ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
-
-	/* Adjust the size of the holes either size of @src. */
-	dst[loc].length		= dst[loc+1].vcn       - dst[loc].vcn;
-	dst[loc+ssize+1].vcn    = dst[loc+ssize].vcn   + dst[loc+ssize].length;
-	dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn;
-
-	return dst;
-}
-
-/**
- * ntfs_runlists_merge - merge two runlists into one
- * @drl:	original runlist to be worked on
- * @srl:	new runlist to be merged into @drl
- *
- * First we sanity check the two runlists @srl and @drl to make sure that they
- * are sensible and can be merged. The runlist @srl must be either after the
- * runlist @drl or completely within a hole (or unmapped region) in @drl.
- *
- * It is up to the caller to serialize access to the runlists @drl and @srl.
- *
- * Merging of runlists is necessary in two cases:
- *   1. When attribute lists are used and a further extent is being mapped.
- *   2. When new clusters are allocated to fill a hole or extend a file.
- *
- * There are four possible ways @srl can be merged. It can:
- *	- be inserted at the beginning of a hole,
- *	- split the hole in two and be inserted between the two fragments,
- *	- be appended at the end of a hole, or it can
- *	- replace the whole hole.
- * It can also be appended to the end of the runlist, which is just a variant
- * of the insert case.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @drl and @srl are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- *	-ERANGE	- The runlists overlap and cannot be merged.
- */
-runlist_element *ntfs_runlists_merge(runlist_element *drl,
-		runlist_element *srl)
-{
-	int di, si;		/* Current index into @[ds]rl. */
-	int sstart;		/* First index with lcn > LCN_RL_NOT_MAPPED. */
-	int dins;		/* Index into @drl at which to insert @srl. */
-	int dend, send;		/* Last index into @[ds]rl. */
-	int dfinal, sfinal;	/* The last index into @[ds]rl with
-				   lcn >= LCN_HOLE. */
-	int marker = 0;
-	VCN marker_vcn = 0;
-
-#ifdef DEBUG
-	ntfs_debug("dst:");
-	ntfs_debug_dump_runlist(drl);
-	ntfs_debug("src:");
-	ntfs_debug_dump_runlist(srl);
-#endif
-
-	/* Check for silly calling... */
-	if (unlikely(!srl))
-		return drl;
-	if (IS_ERR(srl) || IS_ERR(drl))
-		return ERR_PTR(-EINVAL);
-
-	/* Check for the case where the first mapping is being done now. */
-	if (unlikely(!drl)) {
-		drl = srl;
-		/* Complete the source runlist if necessary. */
-		if (unlikely(drl[0].vcn)) {
-			/* Scan to the end of the source runlist. */
-			for (dend = 0; likely(drl[dend].length); dend++)
-				;
-			dend++;
-			drl = ntfs_rl_realloc(drl, dend, dend + 1);
-			if (IS_ERR(drl))
-				return drl;
-			/* Insert start element at the front of the runlist. */
-			ntfs_rl_mm(drl, 1, 0, dend);
-			drl[0].vcn = 0;
-			drl[0].lcn = LCN_RL_NOT_MAPPED;
-			drl[0].length = drl[1].vcn;
-		}
-		goto finished;
-	}
-
-	si = di = 0;
-
-	/* Skip any unmapped start element(s) in the source runlist. */
-	while (srl[si].length && srl[si].lcn < LCN_HOLE)
-		si++;
-
-	/* Can't have an entirely unmapped source runlist. */
-	BUG_ON(!srl[si].length);
-
-	/* Record the starting points. */
-	sstart = si;
-
-	/*
-	 * Skip forward in @drl until we reach the position where @srl needs to
-	 * be inserted. If we reach the end of @drl, @srl just needs to be
-	 * appended to @drl.
-	 */
-	for (; drl[di].length; di++) {
-		if (drl[di].vcn + drl[di].length > srl[sstart].vcn)
-			break;
-	}
-	dins = di;
-
-	/* Sanity check for illegal overlaps. */
-	if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) &&
-			(srl[si].lcn >= 0)) {
-		ntfs_error(NULL, "Run lists overlap. Cannot merge!");
-		return ERR_PTR(-ERANGE);
-	}
-
-	/* Scan to the end of both runlists in order to know their sizes. */
-	for (send = si; srl[send].length; send++)
-		;
-	for (dend = di; drl[dend].length; dend++)
-		;
-
-	if (srl[send].lcn == LCN_ENOENT)
-		marker_vcn = srl[marker = send].vcn;
-
-	/* Scan to the last element with lcn >= LCN_HOLE. */
-	for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--)
-		;
-	for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--)
-		;
-
-	{
-	bool start;
-	bool finish;
-	int ds = dend + 1;		/* Number of elements in drl & srl */
-	int ss = sfinal - sstart + 1;
-
-	start  = ((drl[dins].lcn <  LCN_RL_NOT_MAPPED) ||    /* End of file   */
-		  (drl[dins].vcn == srl[sstart].vcn));	     /* Start of hole */
-	finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) &&    /* End of file   */
-		 ((drl[dins].vcn + drl[dins].length) <=      /* End of hole   */
-		  (srl[send - 1].vcn + srl[send - 1].length)));
-
-	/* Or we will lose an end marker. */
-	if (finish && !drl[dins].length)
-		ss++;
-	if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
-		finish = false;
-#if 0
-	ntfs_debug("dfinal = %i, dend = %i", dfinal, dend);
-	ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send);
-	ntfs_debug("start = %i, finish = %i", start, finish);
-	ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins);
-#endif
-	if (start) {
-		if (finish)
-			drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins);
-		else
-			drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins);
-	} else {
-		if (finish)
-			drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins);
-		else
-			drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins);
-	}
-	if (IS_ERR(drl)) {
-		ntfs_error(NULL, "Merge failed.");
-		return drl;
-	}
-	ntfs_free(srl);
-	if (marker) {
-		ntfs_debug("Triggering marker code.");
-		for (ds = dend; drl[ds].length; ds++)
-			;
-		/* We only need to care if @srl ended after @drl. */
-		if (drl[ds].vcn <= marker_vcn) {
-			int slots = 0;
-
-			if (drl[ds].vcn == marker_vcn) {
-				ntfs_debug("Old marker = 0x%llx, replacing "
-						"with LCN_ENOENT.",
-						(unsigned long long)
-						drl[ds].lcn);
-				drl[ds].lcn = LCN_ENOENT;
-				goto finished;
-			}
-			/*
-			 * We need to create an unmapped runlist element in
-			 * @drl or extend an existing one before adding the
-			 * ENOENT terminator.
-			 */
-			if (drl[ds].lcn == LCN_ENOENT) {
-				ds--;
-				slots = 1;
-			}
-			if (drl[ds].lcn != LCN_RL_NOT_MAPPED) {
-				/* Add an unmapped runlist element. */
-				if (!slots) {
-					drl = ntfs_rl_realloc_nofail(drl, ds,
-							ds + 2);
-					slots = 2;
-				}
-				ds++;
-				/* Need to set vcn if it isn't set already. */
-				if (slots != 1)
-					drl[ds].vcn = drl[ds - 1].vcn +
-							drl[ds - 1].length;
-				drl[ds].lcn = LCN_RL_NOT_MAPPED;
-				/* We now used up a slot. */
-				slots--;
-			}
-			drl[ds].length = marker_vcn - drl[ds].vcn;
-			/* Finally add the ENOENT terminator. */
-			ds++;
-			if (!slots)
-				drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1);
-			drl[ds].vcn = marker_vcn;
-			drl[ds].lcn = LCN_ENOENT;
-			drl[ds].length = (s64)0;
-		}
-	}
-	}
-
-finished:
-	/* The merge was completed successfully. */
-	ntfs_debug("Merged runlist:");
-	ntfs_debug_dump_runlist(drl);
-	return drl;
-}
-
-/**
- * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist
- * @vol:	ntfs volume on which the attribute resides
- * @attr:	attribute record whose mapping pairs array to decompress
- * @old_rl:	optional runlist in which to insert @attr's runlist
- *
- * It is up to the caller to serialize access to the runlist @old_rl.
- *
- * Decompress the attribute @attr's mapping pairs array into a runlist. On
- * success, return the decompressed runlist.
- *
- * If @old_rl is not NULL, decompressed runlist is inserted into the
- * appropriate place in @old_rl and the resultant, combined runlist is
- * returned. The original @old_rl is deallocated.
- *
- * On error, return -errno. @old_rl is left unmodified in that case.
- *
- * The following error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EIO	- Corrupt runlist.
- *	-EINVAL	- Invalid parameters were passed in.
- *	-ERANGE	- The two runlists overlap.
- *
- * FIXME: For now we take the conceptionally simplest approach of creating the
- * new runlist disregarding the already existing one and then splicing the
- * two into one, if that is possible (we check for overlap and discard the new
- * runlist if overlap present before returning ERR_PTR(-ERANGE)).
- */
-runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
-		const ATTR_RECORD *attr, runlist_element *old_rl)
-{
-	VCN vcn;		/* Current vcn. */
-	LCN lcn;		/* Current lcn. */
-	s64 deltaxcn;		/* Change in [vl]cn. */
-	runlist_element *rl;	/* The output runlist. */
-	u8 *buf;		/* Current position in mapping pairs array. */
-	u8 *attr_end;		/* End of attribute. */
-	int rlsize;		/* Size of runlist buffer. */
-	u16 rlpos;		/* Current runlist position in units of
-				   runlist_elements. */
-	u8 b;			/* Current byte offset in buf. */
-
-#ifdef DEBUG
-	/* Make sure attr exists and is non-resident. */
-	if (!attr || !attr->non_resident || sle64_to_cpu(
-			attr->data.non_resident.lowest_vcn) < (VCN)0) {
-		ntfs_error(vol->sb, "Invalid arguments.");
-		return ERR_PTR(-EINVAL);
-	}
-#endif
-	/* Start at vcn = lowest_vcn and lcn 0. */
-	vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn);
-	lcn = 0;
-	/* Get start of the mapping pairs array. */
-	buf = (u8*)attr + le16_to_cpu(
-			attr->data.non_resident.mapping_pairs_offset);
-	attr_end = (u8*)attr + le32_to_cpu(attr->length);
-	if (unlikely(buf < (u8*)attr || buf > attr_end)) {
-		ntfs_error(vol->sb, "Corrupt attribute.");
-		return ERR_PTR(-EIO);
-	}
-	/* If the mapping pairs array is valid but empty, nothing to do. */
-	if (!vcn && !*buf)
-		return old_rl;
-	/* Current position in runlist array. */
-	rlpos = 0;
-	/* Allocate first page and set current runlist size to one page. */
-	rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE);
-	if (unlikely(!rl))
-		return ERR_PTR(-ENOMEM);
-	/* Insert unmapped starting element if necessary. */
-	if (vcn) {
-		rl->vcn = 0;
-		rl->lcn = LCN_RL_NOT_MAPPED;
-		rl->length = vcn;
-		rlpos++;
-	}
-	while (buf < attr_end && *buf) {
-		/*
-		 * Allocate more memory if needed, including space for the
-		 * not-mapped and terminator elements. ntfs_malloc_nofs()
-		 * operates on whole pages only.
-		 */
-		if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) {
-			runlist_element *rl2;
-
-			rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
-			if (unlikely(!rl2)) {
-				ntfs_free(rl);
-				return ERR_PTR(-ENOMEM);
-			}
-			memcpy(rl2, rl, rlsize);
-			ntfs_free(rl);
-			rl = rl2;
-			rlsize += PAGE_SIZE;
-		}
-		/* Enter the current vcn into the current runlist element. */
-		rl[rlpos].vcn = vcn;
-		/*
-		 * Get the change in vcn, i.e. the run length in clusters.
-		 * Doing it this way ensures that we signextend negative values.
-		 * A negative run length doesn't make any sense, but hey, I
-		 * didn't make up the NTFS specs and Windows NT4 treats the run
-		 * length as a signed value so that's how it is...
-		 */
-		b = *buf & 0xf;
-		if (b) {
-			if (unlikely(buf + b > attr_end))
-				goto io_error;
-			for (deltaxcn = (s8)buf[b--]; b; b--)
-				deltaxcn = (deltaxcn << 8) + buf[b];
-		} else { /* The length entry is compulsory. */
-			ntfs_error(vol->sb, "Missing length entry in mapping "
-					"pairs array.");
-			deltaxcn = (s64)-1;
-		}
-		/*
-		 * Assume a negative length to indicate data corruption and
-		 * hence clean-up and return NULL.
-		 */
-		if (unlikely(deltaxcn < 0)) {
-			ntfs_error(vol->sb, "Invalid length in mapping pairs "
-					"array.");
-			goto err_out;
-		}
-		/*
-		 * Enter the current run length into the current runlist
-		 * element.
-		 */
-		rl[rlpos].length = deltaxcn;
-		/* Increment the current vcn by the current run length. */
-		vcn += deltaxcn;
-		/*
-		 * There might be no lcn change at all, as is the case for
-		 * sparse clusters on NTFS 3.0+, in which case we set the lcn
-		 * to LCN_HOLE.
-		 */
-		if (!(*buf & 0xf0))
-			rl[rlpos].lcn = LCN_HOLE;
-		else {
-			/* Get the lcn change which really can be negative. */
-			u8 b2 = *buf & 0xf;
-			b = b2 + ((*buf >> 4) & 0xf);
-			if (buf + b > attr_end)
-				goto io_error;
-			for (deltaxcn = (s8)buf[b--]; b > b2; b--)
-				deltaxcn = (deltaxcn << 8) + buf[b];
-			/* Change the current lcn to its new value. */
-			lcn += deltaxcn;
-#ifdef DEBUG
-			/*
-			 * On NTFS 1.2-, apparently can have lcn == -1 to
-			 * indicate a hole. But we haven't verified ourselves
-			 * whether it is really the lcn or the deltaxcn that is
-			 * -1. So if either is found give us a message so we
-			 * can investigate it further!
-			 */
-			if (vol->major_ver < 3) {
-				if (unlikely(deltaxcn == (LCN)-1))
-					ntfs_error(vol->sb, "lcn delta == -1");
-				if (unlikely(lcn == (LCN)-1))
-					ntfs_error(vol->sb, "lcn == -1");
-			}
-#endif
-			/* Check lcn is not below -1. */
-			if (unlikely(lcn < (LCN)-1)) {
-				ntfs_error(vol->sb, "Invalid LCN < -1 in "
-						"mapping pairs array.");
-				goto err_out;
-			}
-			/* Enter the current lcn into the runlist element. */
-			rl[rlpos].lcn = lcn;
-		}
-		/* Get to the next runlist element. */
-		rlpos++;
-		/* Increment the buffer position to the next mapping pair. */
-		buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1;
-	}
-	if (unlikely(buf >= attr_end))
-		goto io_error;
-	/*
-	 * If there is a highest_vcn specified, it must be equal to the final
-	 * vcn in the runlist - 1, or something has gone badly wrong.
-	 */
-	deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn);
-	if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) {
-mpa_err:
-		ntfs_error(vol->sb, "Corrupt mapping pairs array in "
-				"non-resident attribute.");
-		goto err_out;
-	}
-	/* Setup not mapped runlist element if this is the base extent. */
-	if (!attr->data.non_resident.lowest_vcn) {
-		VCN max_cluster;
-
-		max_cluster = ((sle64_to_cpu(
-				attr->data.non_resident.allocated_size) +
-				vol->cluster_size - 1) >>
-				vol->cluster_size_bits) - 1;
-		/*
-		 * A highest_vcn of zero means this is a single extent
-		 * attribute so simply terminate the runlist with LCN_ENOENT).
-		 */
-		if (deltaxcn) {
-			/*
-			 * If there is a difference between the highest_vcn and
-			 * the highest cluster, the runlist is either corrupt
-			 * or, more likely, there are more extents following
-			 * this one.
-			 */
-			if (deltaxcn < max_cluster) {
-				ntfs_debug("More extents to follow; deltaxcn "
-						"= 0x%llx, max_cluster = "
-						"0x%llx",
-						(unsigned long long)deltaxcn,
-						(unsigned long long)
-						max_cluster);
-				rl[rlpos].vcn = vcn;
-				vcn += rl[rlpos].length = max_cluster -
-						deltaxcn;
-				rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
-				rlpos++;
-			} else if (unlikely(deltaxcn > max_cluster)) {
-				ntfs_error(vol->sb, "Corrupt attribute.  "
-						"deltaxcn = 0x%llx, "
-						"max_cluster = 0x%llx",
-						(unsigned long long)deltaxcn,
-						(unsigned long long)
-						max_cluster);
-				goto mpa_err;
-			}
-		}
-		rl[rlpos].lcn = LCN_ENOENT;
-	} else /* Not the base extent. There may be more extents to follow. */
-		rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
-
-	/* Setup terminating runlist element. */
-	rl[rlpos].vcn = vcn;
-	rl[rlpos].length = (s64)0;
-	/* If no existing runlist was specified, we are done. */
-	if (!old_rl) {
-		ntfs_debug("Mapping pairs array successfully decompressed:");
-		ntfs_debug_dump_runlist(rl);
-		return rl;
-	}
-	/* Now combine the new and old runlists checking for overlaps. */
-	old_rl = ntfs_runlists_merge(old_rl, rl);
-	if (!IS_ERR(old_rl))
-		return old_rl;
-	ntfs_free(rl);
-	ntfs_error(vol->sb, "Failed to merge runlists.");
-	return old_rl;
-io_error:
-	ntfs_error(vol->sb, "Corrupt attribute.");
-err_out:
-	ntfs_free(rl);
-	return ERR_PTR(-EIO);
-}
-
-/**
- * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist
- * @rl:		runlist to use for conversion
- * @vcn:	vcn to convert
- *
- * Convert the virtual cluster number @vcn of an attribute into a logical
- * cluster number (lcn) of a device using the runlist @rl to map vcns to their
- * corresponding lcns.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * Since lcns must be >= 0, we use negative return codes with special meaning:
- *
- * Return code		Meaning / Description
- * ==================================================
- *  LCN_HOLE		Hole / not allocated on disk.
- *  LCN_RL_NOT_MAPPED	This is part of the runlist which has not been
- *			inserted into the runlist yet.
- *  LCN_ENOENT		There is no such vcn in the attribute.
- *
- * Locking: - The caller must have locked the runlist (for reading or writing).
- *	    - This function does not touch the lock, nor does it modify the
- *	      runlist.
- */
-LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn)
-{
-	int i;
-
-	BUG_ON(vcn < 0);
-	/*
-	 * If rl is NULL, assume that we have found an unmapped runlist. The
-	 * caller can then attempt to map it and fail appropriately if
-	 * necessary.
-	 */
-	if (unlikely(!rl))
-		return LCN_RL_NOT_MAPPED;
-
-	/* Catch out of lower bounds vcn. */
-	if (unlikely(vcn < rl[0].vcn))
-		return LCN_ENOENT;
-
-	for (i = 0; likely(rl[i].length); i++) {
-		if (unlikely(vcn < rl[i+1].vcn)) {
-			if (likely(rl[i].lcn >= (LCN)0))
-				return rl[i].lcn + (vcn - rl[i].vcn);
-			return rl[i].lcn;
-		}
-	}
-	/*
-	 * The terminator element is setup to the correct value, i.e. one of
-	 * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT.
-	 */
-	if (likely(rl[i].lcn < (LCN)0))
-		return rl[i].lcn;
-	/* Just in case... We could replace this with BUG() some day. */
-	return LCN_ENOENT;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_rl_find_vcn_nolock - find a vcn in a runlist
- * @rl:		runlist to search
- * @vcn:	vcn to find
- *
- * Find the virtual cluster number @vcn in the runlist @rl and return the
- * address of the runlist element containing the @vcn on success.
- *
- * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of
- * the runlist.
- *
- * Locking: The runlist must be locked on entry.
- */
-runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn)
-{
-	BUG_ON(vcn < 0);
-	if (unlikely(!rl || vcn < rl[0].vcn))
-		return NULL;
-	while (likely(rl->length)) {
-		if (unlikely(vcn < rl[1].vcn)) {
-			if (likely(rl->lcn >= LCN_HOLE))
-				return rl;
-			return NULL;
-		}
-		rl++;
-	}
-	if (likely(rl->lcn == LCN_ENOENT))
-		return rl;
-	return NULL;
-}
-
-/**
- * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number
- * @n:		number for which to get the number of bytes for
- *
- * Return the number of bytes required to store @n unambiguously as
- * a signed number.
- *
- * This is used in the context of the mapping pairs array to determine how
- * many bytes will be needed in the array to store a given logical cluster
- * number (lcn) or a specific run length.
- *
- * Return the number of bytes written.  This function cannot fail.
- */
-static inline int ntfs_get_nr_significant_bytes(const s64 n)
-{
-	s64 l = n;
-	int i;
-	s8 j;
-
-	i = 0;
-	do {
-		l >>= 8;
-		i++;
-	} while (l != 0 && l != -1);
-	j = (n >> 8 * (i - 1)) & 0xff;
-	/* If the sign bit is wrong, we need an extra byte. */
-	if ((n < 0 && j >= 0) || (n > 0 && j < 0))
-		i++;
-	return i;
-}
-
-/**
- * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array
- * @vol:	ntfs volume (needed for the ntfs version)
- * @rl:		locked runlist to determine the size of the mapping pairs of
- * @first_vcn:	first vcn which to include in the mapping pairs array
- * @last_vcn:	last vcn which to include in the mapping pairs array
- *
- * Walk the locked runlist @rl and calculate the size in bytes of the mapping
- * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and
- * finishing with vcn @last_vcn.
- *
- * A @last_vcn of -1 means end of runlist and in that case the size of the
- * mapping pairs array corresponding to the runlist starting at vcn @first_vcn
- * and finishing at the end of the runlist is determined.
- *
- * This for example allows us to allocate a buffer of the right size when
- * building the mapping pairs array.
- *
- * If @rl is NULL, just return 1 (for the single terminator byte).
- *
- * Return the calculated size in bytes on success.  On error, return -errno.
- * The following error codes are defined:
- *	-EINVAL	- Run list contains unmapped elements.  Make sure to only pass
- *		  fully mapped runlists to this function.
- *	-EIO	- The runlist is corrupt.
- *
- * Locking: @rl must be locked on entry (either for reading or writing), it
- *	    remains locked throughout, and is left locked upon return.
- */
-int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
-		const runlist_element *rl, const VCN first_vcn,
-		const VCN last_vcn)
-{
-	LCN prev_lcn;
-	int rls;
-	bool the_end = false;
-
-	BUG_ON(first_vcn < 0);
-	BUG_ON(last_vcn < -1);
-	BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
-	if (!rl) {
-		BUG_ON(first_vcn);
-		BUG_ON(last_vcn > 0);
-		return 1;
-	}
-	/* Skip to runlist element containing @first_vcn. */
-	while (rl->length && first_vcn >= rl[1].vcn)
-		rl++;
-	if (unlikely((!rl->length && first_vcn > rl->vcn) ||
-			first_vcn < rl->vcn))
-		return -EINVAL;
-	prev_lcn = 0;
-	/* Always need the termining zero byte. */
-	rls = 1;
-	/* Do the first partial run if present. */
-	if (first_vcn > rl->vcn) {
-		s64 delta, length = rl->length;
-
-		/* We know rl->length != 0 already. */
-		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
-			goto err_out;
-		/*
-		 * If @stop_vcn is given and finishes inside this run, cap the
-		 * run length.
-		 */
-		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
-			s64 s1 = last_vcn + 1;
-			if (unlikely(rl[1].vcn > s1))
-				length = s1 - rl->vcn;
-			the_end = true;
-		}
-		delta = first_vcn - rl->vcn;
-		/* Header byte + length. */
-		rls += 1 + ntfs_get_nr_significant_bytes(length - delta);
-		/*
-		 * If the logical cluster number (lcn) denotes a hole and we
-		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
-		 * zero space.  On earlier NTFS versions we just store the lcn.
-		 * Note: this assumes that on NTFS 1.2-, holes are stored with
-		 * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
-		 */
-		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
-			prev_lcn = rl->lcn;
-			if (likely(rl->lcn >= 0))
-				prev_lcn += delta;
-			/* Change in lcn. */
-			rls += ntfs_get_nr_significant_bytes(prev_lcn);
-		}
-		/* Go to next runlist element. */
-		rl++;
-	}
-	/* Do the full runs. */
-	for (; rl->length && !the_end; rl++) {
-		s64 length = rl->length;
-
-		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
-			goto err_out;
-		/*
-		 * If @stop_vcn is given and finishes inside this run, cap the
-		 * run length.
-		 */
-		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
-			s64 s1 = last_vcn + 1;
-			if (unlikely(rl[1].vcn > s1))
-				length = s1 - rl->vcn;
-			the_end = true;
-		}
-		/* Header byte + length. */
-		rls += 1 + ntfs_get_nr_significant_bytes(length);
-		/*
-		 * If the logical cluster number (lcn) denotes a hole and we
-		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
-		 * zero space.  On earlier NTFS versions we just store the lcn.
-		 * Note: this assumes that on NTFS 1.2-, holes are stored with
-		 * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
-		 */
-		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
-			/* Change in lcn. */
-			rls += ntfs_get_nr_significant_bytes(rl->lcn -
-					prev_lcn);
-			prev_lcn = rl->lcn;
-		}
-	}
-	return rls;
-err_out:
-	if (rl->lcn == LCN_RL_NOT_MAPPED)
-		rls = -EINVAL;
-	else
-		rls = -EIO;
-	return rls;
-}
-
-/**
- * ntfs_write_significant_bytes - write the significant bytes of a number
- * @dst:	destination buffer to write to
- * @dst_max:	pointer to last byte of destination buffer for bounds checking
- * @n:		number whose significant bytes to write
- *
- * Store in @dst, the minimum bytes of the number @n which are required to
- * identify @n unambiguously as a signed number, taking care not to exceed
- * @dest_max, the maximum position within @dst to which we are allowed to
- * write.
- *
- * This is used when building the mapping pairs array of a runlist to compress
- * a given logical cluster number (lcn) or a specific run length to the minimum
- * size possible.
- *
- * Return the number of bytes written on success.  On error, i.e. the
- * destination buffer @dst is too small, return -ENOSPC.
- */
-static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max,
-		const s64 n)
-{
-	s64 l = n;
-	int i;
-	s8 j;
-
-	i = 0;
-	do {
-		if (unlikely(dst > dst_max))
-			goto err_out;
-		*dst++ = l & 0xffll;
-		l >>= 8;
-		i++;
-	} while (l != 0 && l != -1);
-	j = (n >> 8 * (i - 1)) & 0xff;
-	/* If the sign bit is wrong, we need an extra byte. */
-	if (n < 0 && j >= 0) {
-		if (unlikely(dst > dst_max))
-			goto err_out;
-		i++;
-		*dst = (s8)-1;
-	} else if (n > 0 && j < 0) {
-		if (unlikely(dst > dst_max))
-			goto err_out;
-		i++;
-		*dst = (s8)0;
-	}
-	return i;
-err_out:
-	return -ENOSPC;
-}
-
-/**
- * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist
- * @vol:	ntfs volume (needed for the ntfs version)
- * @dst:	destination buffer to which to write the mapping pairs array
- * @dst_len:	size of destination buffer @dst in bytes
- * @rl:		locked runlist for which to build the mapping pairs array
- * @first_vcn:	first vcn which to include in the mapping pairs array
- * @last_vcn:	last vcn which to include in the mapping pairs array
- * @stop_vcn:	first vcn outside destination buffer on success or -ENOSPC
- *
- * Create the mapping pairs array from the locked runlist @rl, starting at vcn
- * @first_vcn and finishing with vcn @last_vcn and save the array in @dst.
- * @dst_len is the size of @dst in bytes and it should be at least equal to the
- * value obtained by calling ntfs_get_size_for_mapping_pairs().
- *
- * A @last_vcn of -1 means end of runlist and in that case the mapping pairs
- * array corresponding to the runlist starting at vcn @first_vcn and finishing
- * at the end of the runlist is created.
- *
- * If @rl is NULL, just write a single terminator byte to @dst.
- *
- * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to
- * the first vcn outside the destination buffer.  Note that on error, @dst has
- * been filled with all the mapping pairs that will fit, thus it can be treated
- * as partial success, in that a new attribute extent needs to be created or
- * the next extent has to be used and the mapping pairs build has to be
- * continued with @first_vcn set to *@stop_vcn.
- *
- * Return 0 on success and -errno on error.  The following error codes are
- * defined:
- *	-EINVAL	- Run list contains unmapped elements.  Make sure to only pass
- *		  fully mapped runlists to this function.
- *	-EIO	- The runlist is corrupt.
- *	-ENOSPC	- The destination buffer is too small.
- *
- * Locking: @rl must be locked on entry (either for reading or writing), it
- *	    remains locked throughout, and is left locked upon return.
- */
-int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
-		const int dst_len, const runlist_element *rl,
-		const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn)
-{
-	LCN prev_lcn;
-	s8 *dst_max, *dst_next;
-	int err = -ENOSPC;
-	bool the_end = false;
-	s8 len_len, lcn_len;
-
-	BUG_ON(first_vcn < 0);
-	BUG_ON(last_vcn < -1);
-	BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
-	BUG_ON(dst_len < 1);
-	if (!rl) {
-		BUG_ON(first_vcn);
-		BUG_ON(last_vcn > 0);
-		if (stop_vcn)
-			*stop_vcn = 0;
-		/* Terminator byte. */
-		*dst = 0;
-		return 0;
-	}
-	/* Skip to runlist element containing @first_vcn. */
-	while (rl->length && first_vcn >= rl[1].vcn)
-		rl++;
-	if (unlikely((!rl->length && first_vcn > rl->vcn) ||
-			first_vcn < rl->vcn))
-		return -EINVAL;
-	/*
-	 * @dst_max is used for bounds checking in
-	 * ntfs_write_significant_bytes().
-	 */
-	dst_max = dst + dst_len - 1;
-	prev_lcn = 0;
-	/* Do the first partial run if present. */
-	if (first_vcn > rl->vcn) {
-		s64 delta, length = rl->length;
-
-		/* We know rl->length != 0 already. */
-		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
-			goto err_out;
-		/*
-		 * If @stop_vcn is given and finishes inside this run, cap the
-		 * run length.
-		 */
-		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
-			s64 s1 = last_vcn + 1;
-			if (unlikely(rl[1].vcn > s1))
-				length = s1 - rl->vcn;
-			the_end = true;
-		}
-		delta = first_vcn - rl->vcn;
-		/* Write length. */
-		len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
-				length - delta);
-		if (unlikely(len_len < 0))
-			goto size_err;
-		/*
-		 * If the logical cluster number (lcn) denotes a hole and we
-		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
-		 * zero space.  On earlier NTFS versions we just write the lcn
-		 * change.  FIXME: Do we need to write the lcn change or just
-		 * the lcn in that case?  Not sure as I have never seen this
-		 * case on NT4. - We assume that we just need to write the lcn
-		 * change until someone tells us otherwise... (AIA)
-		 */
-		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
-			prev_lcn = rl->lcn;
-			if (likely(rl->lcn >= 0))
-				prev_lcn += delta;
-			/* Write change in lcn. */
-			lcn_len = ntfs_write_significant_bytes(dst + 1 +
-					len_len, dst_max, prev_lcn);
-			if (unlikely(lcn_len < 0))
-				goto size_err;
-		} else
-			lcn_len = 0;
-		dst_next = dst + len_len + lcn_len + 1;
-		if (unlikely(dst_next > dst_max))
-			goto size_err;
-		/* Update header byte. */
-		*dst = lcn_len << 4 | len_len;
-		/* Position at next mapping pairs array element. */
-		dst = dst_next;
-		/* Go to next runlist element. */
-		rl++;
-	}
-	/* Do the full runs. */
-	for (; rl->length && !the_end; rl++) {
-		s64 length = rl->length;
-
-		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
-			goto err_out;
-		/*
-		 * If @stop_vcn is given and finishes inside this run, cap the
-		 * run length.
-		 */
-		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
-			s64 s1 = last_vcn + 1;
-			if (unlikely(rl[1].vcn > s1))
-				length = s1 - rl->vcn;
-			the_end = true;
-		}
-		/* Write length. */
-		len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
-				length);
-		if (unlikely(len_len < 0))
-			goto size_err;
-		/*
-		 * If the logical cluster number (lcn) denotes a hole and we
-		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
-		 * zero space.  On earlier NTFS versions we just write the lcn
-		 * change.  FIXME: Do we need to write the lcn change or just
-		 * the lcn in that case?  Not sure as I have never seen this
-		 * case on NT4. - We assume that we just need to write the lcn
-		 * change until someone tells us otherwise... (AIA)
-		 */
-		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
-			/* Write change in lcn. */
-			lcn_len = ntfs_write_significant_bytes(dst + 1 +
-					len_len, dst_max, rl->lcn - prev_lcn);
-			if (unlikely(lcn_len < 0))
-				goto size_err;
-			prev_lcn = rl->lcn;
-		} else
-			lcn_len = 0;
-		dst_next = dst + len_len + lcn_len + 1;
-		if (unlikely(dst_next > dst_max))
-			goto size_err;
-		/* Update header byte. */
-		*dst = lcn_len << 4 | len_len;
-		/* Position at next mapping pairs array element. */
-		dst = dst_next;
-	}
-	/* Success. */
-	err = 0;
-size_err:
-	/* Set stop vcn. */
-	if (stop_vcn)
-		*stop_vcn = rl->vcn;
-	/* Add terminator byte. */
-	*dst = 0;
-	return err;
-err_out:
-	if (rl->lcn == LCN_RL_NOT_MAPPED)
-		err = -EINVAL;
-	else
-		err = -EIO;
-	return err;
-}
-
-/**
- * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn
- * @vol:	ntfs volume (needed for error output)
- * @runlist:	runlist to truncate
- * @new_length:	the new length of the runlist in VCNs
- *
- * Truncate the runlist described by @runlist as well as the memory buffer
- * holding the runlist elements to a length of @new_length VCNs.
- *
- * If @new_length lies within the runlist, the runlist elements with VCNs of
- * @new_length and above are discarded.  As a special case if @new_length is
- * zero, the runlist is discarded and set to NULL.
- *
- * If @new_length lies beyond the runlist, a sparse runlist element is added to
- * the end of the runlist @runlist or if the last runlist element is a sparse
- * one already, this is extended.
- *
- * Note, no checking is done for unmapped runlist elements.  It is assumed that
- * the caller has mapped any elements that need to be mapped already.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: The caller must hold @runlist->lock for writing.
- */
-int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
-		const s64 new_length)
-{
-	runlist_element *rl;
-	int old_size;
-
-	ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length);
-	BUG_ON(!runlist);
-	BUG_ON(new_length < 0);
-	rl = runlist->rl;
-	if (!new_length) {
-		ntfs_debug("Freeing runlist.");
-		runlist->rl = NULL;
-		if (rl)
-			ntfs_free(rl);
-		return 0;
-	}
-	if (unlikely(!rl)) {
-		/*
-		 * Create a runlist consisting of a sparse runlist element of
-		 * length @new_length followed by a terminator runlist element.
-		 */
-		rl = ntfs_malloc_nofs(PAGE_SIZE);
-		if (unlikely(!rl)) {
-			ntfs_error(vol->sb, "Not enough memory to allocate "
-					"runlist element buffer.");
-			return -ENOMEM;
-		}
-		runlist->rl = rl;
-		rl[1].length = rl->vcn = 0;
-		rl->lcn = LCN_HOLE;
-		rl[1].vcn = rl->length = new_length;
-		rl[1].lcn = LCN_ENOENT;
-		return 0;
-	}
-	BUG_ON(new_length < rl->vcn);
-	/* Find @new_length in the runlist. */
-	while (likely(rl->length && new_length >= rl[1].vcn))
-		rl++;
-	/*
-	 * If not at the end of the runlist we need to shrink it.
-	 * If at the end of the runlist we need to expand it.
-	 */
-	if (rl->length) {
-		runlist_element *trl;
-		bool is_end;
-
-		ntfs_debug("Shrinking runlist.");
-		/* Determine the runlist size. */
-		trl = rl + 1;
-		while (likely(trl->length))
-			trl++;
-		old_size = trl - runlist->rl + 1;
-		/* Truncate the run. */
-		rl->length = new_length - rl->vcn;
-		/*
-		 * If a run was partially truncated, make the following runlist
-		 * element a terminator.
-		 */
-		is_end = false;
-		if (rl->length) {
-			rl++;
-			if (!rl->length)
-				is_end = true;
-			rl->vcn = new_length;
-			rl->length = 0;
-		}
-		rl->lcn = LCN_ENOENT;
-		/* Reallocate memory if necessary. */
-		if (!is_end) {
-			int new_size = rl - runlist->rl + 1;
-			rl = ntfs_rl_realloc(runlist->rl, old_size, new_size);
-			if (IS_ERR(rl))
-				ntfs_warning(vol->sb, "Failed to shrink "
-						"runlist buffer.  This just "
-						"wastes a bit of memory "
-						"temporarily so we ignore it "
-						"and return success.");
-			else
-				runlist->rl = rl;
-		}
-	} else if (likely(/* !rl->length && */ new_length > rl->vcn)) {
-		ntfs_debug("Expanding runlist.");
-		/*
-		 * If there is a previous runlist element and it is a sparse
-		 * one, extend it.  Otherwise need to add a new, sparse runlist
-		 * element.
-		 */
-		if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE))
-			(rl - 1)->length = new_length - (rl - 1)->vcn;
-		else {
-			/* Determine the runlist size. */
-			old_size = rl - runlist->rl + 1;
-			/* Reallocate memory if necessary. */
-			rl = ntfs_rl_realloc(runlist->rl, old_size,
-					old_size + 1);
-			if (IS_ERR(rl)) {
-				ntfs_error(vol->sb, "Failed to expand runlist "
-						"buffer, aborting.");
-				return PTR_ERR(rl);
-			}
-			runlist->rl = rl;
-			/*
-			 * Set @rl to the same runlist element in the new
-			 * runlist as before in the old runlist.
-			 */
-			rl += old_size - 1;
-			/* Add a new, sparse runlist element. */
-			rl->lcn = LCN_HOLE;
-			rl->length = new_length - rl->vcn;
-			/* Add a new terminator runlist element. */
-			rl++;
-			rl->length = 0;
-		}
-		rl->vcn = new_length;
-		rl->lcn = LCN_ENOENT;
-	} else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ {
-		/* Runlist already has same size as requested. */
-		rl->lcn = LCN_ENOENT;
-	}
-	ntfs_debug("Done.");
-	return 0;
-}
-
-/**
- * ntfs_rl_punch_nolock - punch a hole into a runlist
- * @vol:	ntfs volume (needed for error output)
- * @runlist:	runlist to punch a hole into
- * @start:	starting VCN of the hole to be created
- * @length:	size of the hole to be created in units of clusters
- *
- * Punch a hole into the runlist @runlist starting at VCN @start and of size
- * @length clusters.
- *
- * Return 0 on success and -errno on error, in which case @runlist has not been
- * modified.
- *
- * If @start and/or @start + @length are outside the runlist return error code
- * -ENOENT.
- *
- * If the runlist contains unmapped or error elements between @start and @start
- * + @length return error code -EINVAL.
- *
- * Locking: The caller must hold @runlist->lock for writing.
- */
-int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
-		const VCN start, const s64 length)
-{
-	const VCN end = start + length;
-	s64 delta;
-	runlist_element *rl, *rl_end, *rl_real_end, *trl;
-	int old_size;
-	bool lcn_fixup = false;
-
-	ntfs_debug("Entering for start 0x%llx, length 0x%llx.",
-			(long long)start, (long long)length);
-	BUG_ON(!runlist);
-	BUG_ON(start < 0);
-	BUG_ON(length < 0);
-	BUG_ON(end < 0);
-	rl = runlist->rl;
-	if (unlikely(!rl)) {
-		if (likely(!start && !length))
-			return 0;
-		return -ENOENT;
-	}
-	/* Find @start in the runlist. */
-	while (likely(rl->length && start >= rl[1].vcn))
-		rl++;
-	rl_end = rl;
-	/* Find @end in the runlist. */
-	while (likely(rl_end->length && end >= rl_end[1].vcn)) {
-		/* Verify there are no unmapped or error elements. */
-		if (unlikely(rl_end->lcn < LCN_HOLE))
-			return -EINVAL;
-		rl_end++;
-	}
-	/* Check the last element. */
-	if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE))
-		return -EINVAL;
-	/* This covers @start being out of bounds, too. */
-	if (!rl_end->length && end > rl_end->vcn)
-		return -ENOENT;
-	if (!length)
-		return 0;
-	if (!rl->length)
-		return -ENOENT;
-	rl_real_end = rl_end;
-	/* Determine the runlist size. */
-	while (likely(rl_real_end->length))
-		rl_real_end++;
-	old_size = rl_real_end - runlist->rl + 1;
-	/* If @start is in a hole simply extend the hole. */
-	if (rl->lcn == LCN_HOLE) {
-		/*
-		 * If both @start and @end are in the same sparse run, we are
-		 * done.
-		 */
-		if (end <= rl[1].vcn) {
-			ntfs_debug("Done (requested hole is already sparse).");
-			return 0;
-		}
-extend_hole:
-		/* Extend the hole. */
-		rl->length = end - rl->vcn;
-		/* If @end is in a hole, merge it with the current one. */
-		if (rl_end->lcn == LCN_HOLE) {
-			rl_end++;
-			rl->length = rl_end->vcn - rl->vcn;
-		}
-		/* We have done the hole.  Now deal with the remaining tail. */
-		rl++;
-		/* Cut out all runlist elements up to @end. */
-		if (rl < rl_end)
-			memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
-					sizeof(*rl));
-		/* Adjust the beginning of the tail if necessary. */
-		if (end > rl->vcn) {
-			delta = end - rl->vcn;
-			rl->vcn = end;
-			rl->length -= delta;
-			/* Only adjust the lcn if it is real. */
-			if (rl->lcn >= 0)
-				rl->lcn += delta;
-		}
-shrink_allocation:
-		/* Reallocate memory if the allocation changed. */
-		if (rl < rl_end) {
-			rl = ntfs_rl_realloc(runlist->rl, old_size,
-					old_size - (rl_end - rl));
-			if (IS_ERR(rl))
-				ntfs_warning(vol->sb, "Failed to shrink "
-						"runlist buffer.  This just "
-						"wastes a bit of memory "
-						"temporarily so we ignore it "
-						"and return success.");
-			else
-				runlist->rl = rl;
-		}
-		ntfs_debug("Done (extend hole).");
-		return 0;
-	}
-	/*
-	 * If @start is at the beginning of a run things are easier as there is
-	 * no need to split the first run.
-	 */
-	if (start == rl->vcn) {
-		/*
-		 * @start is at the beginning of a run.
-		 *
-		 * If the previous run is sparse, extend its hole.
-		 *
-		 * If @end is not in the same run, switch the run to be sparse
-		 * and extend the newly created hole.
-		 *
-		 * Thus both of these cases reduce the problem to the above
-		 * case of "@start is in a hole".
-		 */
-		if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) {
-			rl--;
-			goto extend_hole;
-		}
-		if (end >= rl[1].vcn) {
-			rl->lcn = LCN_HOLE;
-			goto extend_hole;
-		}
-		/*
-		 * The final case is when @end is in the same run as @start.
-		 * For this need to split the run into two.  One run for the
-		 * sparse region between the beginning of the old run, i.e.
-		 * @start, and @end and one for the remaining non-sparse
-		 * region, i.e. between @end and the end of the old run.
-		 */
-		trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
-		if (IS_ERR(trl))
-			goto enomem_out;
-		old_size++;
-		if (runlist->rl != trl) {
-			rl = trl + (rl - runlist->rl);
-			rl_end = trl + (rl_end - runlist->rl);
-			rl_real_end = trl + (rl_real_end - runlist->rl);
-			runlist->rl = trl;
-		}
-split_end:
-		/* Shift all the runs up by one. */
-		memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl));
-		/* Finally, setup the two split runs. */
-		rl->lcn = LCN_HOLE;
-		rl->length = length;
-		rl++;
-		rl->vcn += length;
-		/* Only adjust the lcn if it is real. */
-		if (rl->lcn >= 0 || lcn_fixup)
-			rl->lcn += length;
-		rl->length -= length;
-		ntfs_debug("Done (split one).");
-		return 0;
-	}
-	/*
-	 * @start is neither in a hole nor at the beginning of a run.
-	 *
-	 * If @end is in a hole, things are easier as simply truncating the run
-	 * @start is in to end at @start - 1, deleting all runs after that up
-	 * to @end, and finally extending the beginning of the run @end is in
-	 * to be @start is all that is needed.
-	 */
-	if (rl_end->lcn == LCN_HOLE) {
-		/* Truncate the run containing @start. */
-		rl->length = start - rl->vcn;
-		rl++;
-		/* Cut out all runlist elements up to @end. */
-		if (rl < rl_end)
-			memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
-					sizeof(*rl));
-		/* Extend the beginning of the run @end is in to be @start. */
-		rl->vcn = start;
-		rl->length = rl[1].vcn - start;
-		goto shrink_allocation;
-	}
-	/* 
-	 * If @end is not in a hole there are still two cases to distinguish.
-	 * Either @end is or is not in the same run as @start.
-	 *
-	 * The second case is easier as it can be reduced to an already solved
-	 * problem by truncating the run @start is in to end at @start - 1.
-	 * Then, if @end is in the next run need to split the run into a sparse
-	 * run followed by a non-sparse run (already covered above) and if @end
-	 * is not in the next run switching it to be sparse, again reduces the
-	 * problem to the already covered case of "@start is in a hole".
-	 */
-	if (end >= rl[1].vcn) {
-		/*
-		 * If @end is not in the next run, reduce the problem to the
-		 * case of "@start is in a hole".
-		 */
-		if (rl[1].length && end >= rl[2].vcn) {
-			/* Truncate the run containing @start. */
-			rl->length = start - rl->vcn;
-			rl++;
-			rl->vcn = start;
-			rl->lcn = LCN_HOLE;
-			goto extend_hole;
-		}
-		trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
-		if (IS_ERR(trl))
-			goto enomem_out;
-		old_size++;
-		if (runlist->rl != trl) {
-			rl = trl + (rl - runlist->rl);
-			rl_end = trl + (rl_end - runlist->rl);
-			rl_real_end = trl + (rl_real_end - runlist->rl);
-			runlist->rl = trl;
-		}
-		/* Truncate the run containing @start. */
-		rl->length = start - rl->vcn;
-		rl++;
-		/*
-		 * @end is in the next run, reduce the problem to the case
-		 * where "@start is at the beginning of a run and @end is in
-		 * the same run as @start".
-		 */
-		delta = rl->vcn - start;
-		rl->vcn = start;
-		if (rl->lcn >= 0) {
-			rl->lcn -= delta;
-			/* Need this in case the lcn just became negative. */
-			lcn_fixup = true;
-		}
-		rl->length += delta;
-		goto split_end;
-	}
-	/*
-	 * The first case from above, i.e. @end is in the same run as @start.
-	 * We need to split the run into three.  One run for the non-sparse
-	 * region between the beginning of the old run and @start, one for the
-	 * sparse region between @start and @end, and one for the remaining
-	 * non-sparse region, i.e. between @end and the end of the old run.
-	 */
-	trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2);
-	if (IS_ERR(trl))
-		goto enomem_out;
-	old_size += 2;
-	if (runlist->rl != trl) {
-		rl = trl + (rl - runlist->rl);
-		rl_end = trl + (rl_end - runlist->rl);
-		rl_real_end = trl + (rl_real_end - runlist->rl);
-		runlist->rl = trl;
-	}
-	/* Shift all the runs up by two. */
-	memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl));
-	/* Finally, setup the three split runs. */
-	rl->length = start - rl->vcn;
-	rl++;
-	rl->vcn = start;
-	rl->lcn = LCN_HOLE;
-	rl->length = length;
-	rl++;
-	delta = end - rl->vcn;
-	rl->vcn = end;
-	rl->lcn += delta;
-	rl->length -= delta;
-	ntfs_debug("Done (split both).");
-	return 0;
-enomem_out:
-	ntfs_error(vol->sb, "Not enough memory to extend runlist buffer.");
-	return -ENOMEM;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
deleted file mode 100644
index 38de0a375f59..000000000000
--- a/fs/ntfs/runlist.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * runlist.h - Defines for runlist handling in NTFS Linux kernel driver.
- *	       Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_RUNLIST_H
-#define _LINUX_NTFS_RUNLIST_H
-
-#include "types.h"
-#include "layout.h"
-#include "volume.h"
-
-/**
- * runlist_element - in memory vcn to lcn mapping array element
- * @vcn:	starting vcn of the current array element
- * @lcn:	starting lcn of the current array element
- * @length:	length in clusters of the current array element
- *
- * The last vcn (in fact the last vcn + 1) is reached when length == 0.
- *
- * When lcn == -1 this means that the count vcns starting at vcn are not
- * physically allocated (i.e. this is a hole / data is sparse).
- */
-typedef struct {	/* In memory vcn to lcn mapping structure element. */
-	VCN vcn;	/* vcn = Starting virtual cluster number. */
-	LCN lcn;	/* lcn = Starting logical cluster number. */
-	s64 length;	/* Run length in clusters. */
-} runlist_element;
-
-/**
- * runlist - in memory vcn to lcn mapping array including a read/write lock
- * @rl:		pointer to an array of runlist elements
- * @lock:	read/write spinlock for serializing access to @rl
- *
- */
-typedef struct {
-	runlist_element *rl;
-	struct rw_semaphore lock;
-} runlist;
-
-static inline void ntfs_init_runlist(runlist *rl)
-{
-	rl->rl = NULL;
-	init_rwsem(&rl->lock);
-}
-
-typedef enum {
-	LCN_HOLE		= -1,	/* Keep this as highest value or die! */
-	LCN_RL_NOT_MAPPED	= -2,
-	LCN_ENOENT		= -3,
-	LCN_ENOMEM		= -4,
-	LCN_EIO			= -5,
-} LCN_SPECIAL_VALUES;
-
-extern runlist_element *ntfs_runlists_merge(runlist_element *drl,
-		runlist_element *srl);
-
-extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
-		const ATTR_RECORD *attr, runlist_element *old_rl);
-
-extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn);
-
-#ifdef NTFS_RW
-
-extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl,
-		const VCN vcn);
-
-extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
-		const runlist_element *rl, const VCN first_vcn,
-		const VCN last_vcn);
-
-extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
-		const int dst_len, const runlist_element *rl,
-		const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn);
-
-extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
-		runlist *const runlist, const s64 new_length);
-
-int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
-		const VCN start, const s64 length);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_RUNLIST_H */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
deleted file mode 100644
index 56a7d5bd33e4..000000000000
--- a/fs/ntfs/super.c
+++ /dev/null
@@ -1,3202 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2001,2002 Richard Russon
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/stddef.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/spinlock.h>
-#include <linux/blkdev.h>	/* For bdev_logical_block_size(). */
-#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
-#include <linux/vfs.h>
-#include <linux/moduleparam.h>
-#include <linux/bitmap.h>
-
-#include "sysctl.h"
-#include "logfile.h"
-#include "quota.h"
-#include "usnjrnl.h"
-#include "dir.h"
-#include "debug.h"
-#include "index.h"
-#include "inode.h"
-#include "aops.h"
-#include "layout.h"
-#include "malloc.h"
-#include "ntfs.h"
-
-/* Number of mounted filesystems which have compression enabled. */
-static unsigned long ntfs_nr_compression_users;
-
-/* A global default upcase table and a corresponding reference count. */
-static ntfschar *default_upcase;
-static unsigned long ntfs_nr_upcase_users;
-
-/* Error constants/strings used in inode.c::ntfs_show_options(). */
-typedef enum {
-	/* One of these must be present, default is ON_ERRORS_CONTINUE. */
-	ON_ERRORS_PANIC			= 0x01,
-	ON_ERRORS_REMOUNT_RO		= 0x02,
-	ON_ERRORS_CONTINUE		= 0x04,
-	/* Optional, can be combined with any of the above. */
-	ON_ERRORS_RECOVER		= 0x10,
-} ON_ERRORS_ACTIONS;
-
-const option_t on_errors_arr[] = {
-	{ ON_ERRORS_PANIC,	"panic" },
-	{ ON_ERRORS_REMOUNT_RO,	"remount-ro", },
-	{ ON_ERRORS_CONTINUE,	"continue", },
-	{ ON_ERRORS_RECOVER,	"recover" },
-	{ 0,			NULL }
-};
-
-/**
- * simple_getbool - convert input string to a boolean value
- * @s: input string to convert
- * @setval: where to store the output boolean value
- *
- * Copied from old ntfs driver (which copied from vfat driver).
- *
- * "1", "yes", "true", or an empty string are converted to %true.
- * "0", "no", and "false" are converted to %false.
- *
- * Return: %1 if the string is converted or was empty and *setval contains it;
- *	   %0 if the string was not valid.
- */
-static int simple_getbool(char *s, bool *setval)
-{
-	if (s) {
-		if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true"))
-			*setval = true;
-		else if (!strcmp(s, "0") || !strcmp(s, "no") ||
-							!strcmp(s, "false"))
-			*setval = false;
-		else
-			return 0;
-	} else
-		*setval = true;
-	return 1;
-}
-
-/**
- * parse_options - parse the (re)mount options
- * @vol:	ntfs volume
- * @opt:	string containing the (re)mount options
- *
- * Parse the recognized options in @opt for the ntfs volume described by @vol.
- */
-static bool parse_options(ntfs_volume *vol, char *opt)
-{
-	char *p, *v, *ov;
-	static char *utf8 = "utf8";
-	int errors = 0, sloppy = 0;
-	kuid_t uid = INVALID_UID;
-	kgid_t gid = INVALID_GID;
-	umode_t fmask = (umode_t)-1, dmask = (umode_t)-1;
-	int mft_zone_multiplier = -1, on_errors = -1;
-	int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1;
-	struct nls_table *nls_map = NULL, *old_nls;
-
-	/* I am lazy... (-8 */
-#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value)	\
-	if (!strcmp(p, option)) {					\
-		if (!v || !*v)						\
-			variable = default_value;			\
-		else {							\
-			variable = simple_strtoul(ov = v, &v, 0);	\
-			if (*v)						\
-				goto needs_val;				\
-		}							\
-	}
-#define NTFS_GETOPT(option, variable)					\
-	if (!strcmp(p, option)) {					\
-		if (!v || !*v)						\
-			goto needs_arg;					\
-		variable = simple_strtoul(ov = v, &v, 0);		\
-		if (*v)							\
-			goto needs_val;					\
-	}
-#define NTFS_GETOPT_UID(option, variable)				\
-	if (!strcmp(p, option)) {					\
-		uid_t uid_value;					\
-		if (!v || !*v)						\
-			goto needs_arg;					\
-		uid_value = simple_strtoul(ov = v, &v, 0);		\
-		if (*v)							\
-			goto needs_val;					\
-		variable = make_kuid(current_user_ns(), uid_value);	\
-		if (!uid_valid(variable))				\
-			goto needs_val;					\
-	}
-#define NTFS_GETOPT_GID(option, variable)				\
-	if (!strcmp(p, option)) {					\
-		gid_t gid_value;					\
-		if (!v || !*v)						\
-			goto needs_arg;					\
-		gid_value = simple_strtoul(ov = v, &v, 0);		\
-		if (*v)							\
-			goto needs_val;					\
-		variable = make_kgid(current_user_ns(), gid_value);	\
-		if (!gid_valid(variable))				\
-			goto needs_val;					\
-	}
-#define NTFS_GETOPT_OCTAL(option, variable)				\
-	if (!strcmp(p, option)) {					\
-		if (!v || !*v)						\
-			goto needs_arg;					\
-		variable = simple_strtoul(ov = v, &v, 8);		\
-		if (*v)							\
-			goto needs_val;					\
-	}
-#define NTFS_GETOPT_BOOL(option, variable)				\
-	if (!strcmp(p, option)) {					\
-		bool val;						\
-		if (!simple_getbool(v, &val))				\
-			goto needs_bool;				\
-		variable = val;						\
-	}
-#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array)		\
-	if (!strcmp(p, option)) {					\
-		int _i;							\
-		if (!v || !*v)						\
-			goto needs_arg;					\
-		ov = v;							\
-		if (variable == -1)					\
-			variable = 0;					\
-		for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \
-			if (!strcmp(opt_array[_i].str, v)) {		\
-				variable |= opt_array[_i].val;		\
-				break;					\
-			}						\
-		if (!opt_array[_i].str || !*opt_array[_i].str)		\
-			goto needs_val;					\
-	}
-	if (!opt || !*opt)
-		goto no_mount_options;
-	ntfs_debug("Entering with mount options string: %s", opt);
-	while ((p = strsep(&opt, ","))) {
-		if ((v = strchr(p, '=')))
-			*v++ = 0;
-		NTFS_GETOPT_UID("uid", uid)
-		else NTFS_GETOPT_GID("gid", gid)
-		else NTFS_GETOPT_OCTAL("umask", fmask = dmask)
-		else NTFS_GETOPT_OCTAL("fmask", fmask)
-		else NTFS_GETOPT_OCTAL("dmask", dmask)
-		else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier)
-		else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true)
-		else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files)
-		else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive)
-		else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse)
-		else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors,
-				on_errors_arr)
-		else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes"))
-			ntfs_warning(vol->sb, "Ignoring obsolete option %s.",
-					p);
-		else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) {
-			if (!strcmp(p, "iocharset"))
-				ntfs_warning(vol->sb, "Option iocharset is "
-						"deprecated. Please use "
-						"option nls=<charsetname> in "
-						"the future.");
-			if (!v || !*v)
-				goto needs_arg;
-use_utf8:
-			old_nls = nls_map;
-			nls_map = load_nls(v);
-			if (!nls_map) {
-				if (!old_nls) {
-					ntfs_error(vol->sb, "NLS character set "
-							"%s not found.", v);
-					return false;
-				}
-				ntfs_error(vol->sb, "NLS character set %s not "
-						"found. Using previous one %s.",
-						v, old_nls->charset);
-				nls_map = old_nls;
-			} else /* nls_map */ {
-				unload_nls(old_nls);
-			}
-		} else if (!strcmp(p, "utf8")) {
-			bool val = false;
-			ntfs_warning(vol->sb, "Option utf8 is no longer "
-				   "supported, using option nls=utf8. Please "
-				   "use option nls=utf8 in the future and "
-				   "make sure utf8 is compiled either as a "
-				   "module or into the kernel.");
-			if (!v || !*v)
-				val = true;
-			else if (!simple_getbool(v, &val))
-				goto needs_bool;
-			if (val) {
-				v = utf8;
-				goto use_utf8;
-			}
-		} else {
-			ntfs_error(vol->sb, "Unrecognized mount option %s.", p);
-			if (errors < INT_MAX)
-				errors++;
-		}
-#undef NTFS_GETOPT_OPTIONS_ARRAY
-#undef NTFS_GETOPT_BOOL
-#undef NTFS_GETOPT
-#undef NTFS_GETOPT_WITH_DEFAULT
-	}
-no_mount_options:
-	if (errors && !sloppy)
-		return false;
-	if (sloppy)
-		ntfs_warning(vol->sb, "Sloppy option given. Ignoring "
-				"unrecognized mount option(s) and continuing.");
-	/* Keep this first! */
-	if (on_errors != -1) {
-		if (!on_errors) {
-			ntfs_error(vol->sb, "Invalid errors option argument "
-					"or bug in options parser.");
-			return false;
-		}
-	}
-	if (nls_map) {
-		if (vol->nls_map && vol->nls_map != nls_map) {
-			ntfs_error(vol->sb, "Cannot change NLS character set "
-					"on remount.");
-			return false;
-		} /* else (!vol->nls_map) */
-		ntfs_debug("Using NLS character set %s.", nls_map->charset);
-		vol->nls_map = nls_map;
-	} else /* (!nls_map) */ {
-		if (!vol->nls_map) {
-			vol->nls_map = load_nls_default();
-			if (!vol->nls_map) {
-				ntfs_error(vol->sb, "Failed to load default "
-						"NLS character set.");
-				return false;
-			}
-			ntfs_debug("Using default NLS character set (%s).",
-					vol->nls_map->charset);
-		}
-	}
-	if (mft_zone_multiplier != -1) {
-		if (vol->mft_zone_multiplier && vol->mft_zone_multiplier !=
-				mft_zone_multiplier) {
-			ntfs_error(vol->sb, "Cannot change mft_zone_multiplier "
-					"on remount.");
-			return false;
-		}
-		if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) {
-			ntfs_error(vol->sb, "Invalid mft_zone_multiplier. "
-					"Using default value, i.e. 1.");
-			mft_zone_multiplier = 1;
-		}
-		vol->mft_zone_multiplier = mft_zone_multiplier;
-	}
-	if (!vol->mft_zone_multiplier)
-		vol->mft_zone_multiplier = 1;
-	if (on_errors != -1)
-		vol->on_errors = on_errors;
-	if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER)
-		vol->on_errors |= ON_ERRORS_CONTINUE;
-	if (uid_valid(uid))
-		vol->uid = uid;
-	if (gid_valid(gid))
-		vol->gid = gid;
-	if (fmask != (umode_t)-1)
-		vol->fmask = fmask;
-	if (dmask != (umode_t)-1)
-		vol->dmask = dmask;
-	if (show_sys_files != -1) {
-		if (show_sys_files)
-			NVolSetShowSystemFiles(vol);
-		else
-			NVolClearShowSystemFiles(vol);
-	}
-	if (case_sensitive != -1) {
-		if (case_sensitive)
-			NVolSetCaseSensitive(vol);
-		else
-			NVolClearCaseSensitive(vol);
-	}
-	if (disable_sparse != -1) {
-		if (disable_sparse)
-			NVolClearSparseEnabled(vol);
-		else {
-			if (!NVolSparseEnabled(vol) &&
-					vol->major_ver && vol->major_ver < 3)
-				ntfs_warning(vol->sb, "Not enabling sparse "
-						"support due to NTFS volume "
-						"version %i.%i (need at least "
-						"version 3.0).", vol->major_ver,
-						vol->minor_ver);
-			else
-				NVolSetSparseEnabled(vol);
-		}
-	}
-	return true;
-needs_arg:
-	ntfs_error(vol->sb, "The %s option requires an argument.", p);
-	return false;
-needs_bool:
-	ntfs_error(vol->sb, "The %s option requires a boolean argument.", p);
-	return false;
-needs_val:
-	ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov);
-	return false;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_write_volume_flags - write new flags to the volume information flags
- * @vol:	ntfs volume on which to modify the flags
- * @flags:	new flags value for the volume information flags
- *
- * Internal function.  You probably want to use ntfs_{set,clear}_volume_flags()
- * instead (see below).
- *
- * Replace the volume information flags on the volume @vol with the value
- * supplied in @flags.  Note, this overwrites the volume information flags, so
- * make sure to combine the flags you want to modify with the old flags and use
- * the result when calling ntfs_write_volume_flags().
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags)
-{
-	ntfs_inode *ni = NTFS_I(vol->vol_ino);
-	MFT_RECORD *m;
-	VOLUME_INFORMATION *vi;
-	ntfs_attr_search_ctx *ctx;
-	int err;
-
-	ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.",
-			le16_to_cpu(vol->vol_flags), le16_to_cpu(flags));
-	if (vol->vol_flags == flags)
-		goto done;
-	BUG_ON(!ni);
-	m = map_mft_record(ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(ni, m);
-	if (!ctx) {
-		err = -ENOMEM;
-		goto put_unm_err_out;
-	}
-	err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
-			ctx);
-	if (err)
-		goto put_unm_err_out;
-	vi = (VOLUME_INFORMATION*)((u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset));
-	vol->vol_flags = vi->flags = flags;
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(ni);
-done:
-	ntfs_debug("Done.");
-	return 0;
-put_unm_err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(ni);
-err_out:
-	ntfs_error(vol->sb, "Failed with error code %i.", -err);
-	return err;
-}
-
-/**
- * ntfs_set_volume_flags - set bits in the volume information flags
- * @vol:	ntfs volume on which to modify the flags
- * @flags:	flags to set on the volume
- *
- * Set the bits in @flags in the volume information flags on the volume @vol.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
-{
-	flags &= VOLUME_FLAGS_MASK;
-	return ntfs_write_volume_flags(vol, vol->vol_flags | flags);
-}
-
-/**
- * ntfs_clear_volume_flags - clear bits in the volume information flags
- * @vol:	ntfs volume on which to modify the flags
- * @flags:	flags to clear on the volume
- *
- * Clear the bits in @flags in the volume information flags on the volume @vol.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
-{
-	flags &= VOLUME_FLAGS_MASK;
-	flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags));
-	return ntfs_write_volume_flags(vol, flags);
-}
-
-#endif /* NTFS_RW */
-
-/**
- * ntfs_remount - change the mount options of a mounted ntfs filesystem
- * @sb:		superblock of mounted ntfs filesystem
- * @flags:	remount flags
- * @opt:	remount options string
- *
- * Change the mount options of an already mounted ntfs filesystem.
- *
- * NOTE:  The VFS sets the @sb->s_flags remount flags to @flags after
- * ntfs_remount() returns successfully (i.e. returns 0).  Otherwise,
- * @sb->s_flags are not changed.
- */
-static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
-{
-	ntfs_volume *vol = NTFS_SB(sb);
-
-	ntfs_debug("Entering with remount options string: %s", opt);
-
-	sync_filesystem(sb);
-
-#ifndef NTFS_RW
-	/* For read-only compiled driver, enforce read-only flag. */
-	*flags |= SB_RDONLY;
-#else /* NTFS_RW */
-	/*
-	 * For the read-write compiled driver, if we are remounting read-write,
-	 * make sure there are no volume errors and that no unsupported volume
-	 * flags are set.  Also, empty the logfile journal as it would become
-	 * stale as soon as something is written to the volume and mark the
-	 * volume dirty so that chkdsk is run if the volume is not umounted
-	 * cleanly.  Finally, mark the quotas out of date so Windows rescans
-	 * the volume on boot and updates them.
-	 *
-	 * When remounting read-only, mark the volume clean if no volume errors
-	 * have occurred.
-	 */
-	if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) {
-		static const char *es = ".  Cannot remount read-write.";
-
-		/* Remounting read-write. */
-		if (NVolErrors(vol)) {
-			ntfs_error(sb, "Volume has errors and is read-only%s",
-					es);
-			return -EROFS;
-		}
-		if (vol->vol_flags & VOLUME_IS_DIRTY) {
-			ntfs_error(sb, "Volume is dirty and read-only%s", es);
-			return -EROFS;
-		}
-		if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
-			ntfs_error(sb, "Volume has been modified by chkdsk "
-					"and is read-only%s", es);
-			return -EROFS;
-		}
-		if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
-			ntfs_error(sb, "Volume has unsupported flags set "
-					"(0x%x) and is read-only%s",
-					(unsigned)le16_to_cpu(vol->vol_flags),
-					es);
-			return -EROFS;
-		}
-		if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
-			ntfs_error(sb, "Failed to set dirty bit in volume "
-					"information flags%s", es);
-			return -EROFS;
-		}
-#if 0
-		// TODO: Enable this code once we start modifying anything that
-		//	 is different between NTFS 1.2 and 3.x...
-		/* Set NT4 compatibility flag on newer NTFS version volumes. */
-		if ((vol->major_ver > 1)) {
-			if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
-				ntfs_error(sb, "Failed to set NT4 "
-						"compatibility flag%s", es);
-				NVolSetErrors(vol);
-				return -EROFS;
-			}
-		}
-#endif
-		if (!ntfs_empty_logfile(vol->logfile_ino)) {
-			ntfs_error(sb, "Failed to empty journal $LogFile%s",
-					es);
-			NVolSetErrors(vol);
-			return -EROFS;
-		}
-		if (!ntfs_mark_quotas_out_of_date(vol)) {
-			ntfs_error(sb, "Failed to mark quotas out of date%s",
-					es);
-			NVolSetErrors(vol);
-			return -EROFS;
-		}
-		if (!ntfs_stamp_usnjrnl(vol)) {
-			ntfs_error(sb, "Failed to stamp transaction log "
-					"($UsnJrnl)%s", es);
-			NVolSetErrors(vol);
-			return -EROFS;
-		}
-	} else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) {
-		/* Remounting read-only. */
-		if (!NVolErrors(vol)) {
-			if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
-				ntfs_warning(sb, "Failed to clear dirty bit "
-						"in volume information "
-						"flags.  Run chkdsk.");
-		}
-	}
-#endif /* NTFS_RW */
-
-	// TODO: Deal with *flags.
-
-	if (!parse_options(vol, opt))
-		return -EINVAL;
-
-	ntfs_debug("Done.");
-	return 0;
-}
-
-/**
- * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector
- * @sb:		Super block of the device to which @b belongs.
- * @b:		Boot sector of device @sb to check.
- * @silent:	If 'true', all output will be silenced.
- *
- * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot
- * sector. Returns 'true' if it is valid and 'false' if not.
- *
- * @sb is only needed for warning/error output, i.e. it can be NULL when silent
- * is 'true'.
- */
-static bool is_boot_sector_ntfs(const struct super_block *sb,
-		const NTFS_BOOT_SECTOR *b, const bool silent)
-{
-	/*
-	 * Check that checksum == sum of u32 values from b to the checksum
-	 * field.  If checksum is zero, no checking is done.  We will work when
-	 * the checksum test fails, since some utilities update the boot sector
-	 * ignoring the checksum which leaves the checksum out-of-date.  We
-	 * report a warning if this is the case.
-	 */
-	if ((void*)b < (void*)&b->checksum && b->checksum && !silent) {
-		le32 *u;
-		u32 i;
-
-		for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u)
-			i += le32_to_cpup(u);
-		if (le32_to_cpu(b->checksum) != i)
-			ntfs_warning(sb, "Invalid boot sector checksum.");
-	}
-	/* Check OEMidentifier is "NTFS    " */
-	if (b->oem_id != magicNTFS)
-		goto not_ntfs;
-	/* Check bytes per sector value is between 256 and 4096. */
-	if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 ||
-			le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000)
-		goto not_ntfs;
-	/* Check sectors per cluster value is valid. */
-	switch (b->bpb.sectors_per_cluster) {
-	case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128:
-		break;
-	default:
-		goto not_ntfs;
-	}
-	/* Check the cluster size is not above the maximum (64kiB). */
-	if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) *
-			b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE)
-		goto not_ntfs;
-	/* Check reserved/unused fields are really zero. */
-	if (le16_to_cpu(b->bpb.reserved_sectors) ||
-			le16_to_cpu(b->bpb.root_entries) ||
-			le16_to_cpu(b->bpb.sectors) ||
-			le16_to_cpu(b->bpb.sectors_per_fat) ||
-			le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats)
-		goto not_ntfs;
-	/* Check clusters per file mft record value is valid. */
-	if ((u8)b->clusters_per_mft_record < 0xe1 ||
-			(u8)b->clusters_per_mft_record > 0xf7)
-		switch (b->clusters_per_mft_record) {
-		case 1: case 2: case 4: case 8: case 16: case 32: case 64:
-			break;
-		default:
-			goto not_ntfs;
-		}
-	/* Check clusters per index block value is valid. */
-	if ((u8)b->clusters_per_index_record < 0xe1 ||
-			(u8)b->clusters_per_index_record > 0xf7)
-		switch (b->clusters_per_index_record) {
-		case 1: case 2: case 4: case 8: case 16: case 32: case 64:
-			break;
-		default:
-			goto not_ntfs;
-		}
-	/*
-	 * Check for valid end of sector marker. We will work without it, but
-	 * many BIOSes will refuse to boot from a bootsector if the magic is
-	 * incorrect, so we emit a warning.
-	 */
-	if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
-		ntfs_warning(sb, "Invalid end of sector marker.");
-	return true;
-not_ntfs:
-	return false;
-}
-
-/**
- * read_ntfs_boot_sector - read the NTFS boot sector of a device
- * @sb:		super block of device to read the boot sector from
- * @silent:	if true, suppress all output
- *
- * Reads the boot sector from the device and validates it. If that fails, tries
- * to read the backup boot sector, first from the end of the device a-la NT4 and
- * later and then from the middle of the device a-la NT3.51 and before.
- *
- * If a valid boot sector is found but it is not the primary boot sector, we
- * repair the primary boot sector silently (unless the device is read-only or
- * the primary boot sector is not accessible).
- *
- * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super
- * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized
- * to their respective values.
- *
- * Return the unlocked buffer head containing the boot sector or NULL on error.
- */
-static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb,
-		const int silent)
-{
-	const char *read_err_str = "Unable to read %s boot sector.";
-	struct buffer_head *bh_primary, *bh_backup;
-	sector_t nr_blocks = NTFS_SB(sb)->nr_blocks;
-
-	/* Try to read primary boot sector. */
-	if ((bh_primary = sb_bread(sb, 0))) {
-		if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
-				bh_primary->b_data, silent))
-			return bh_primary;
-		if (!silent)
-			ntfs_error(sb, "Primary boot sector is invalid.");
-	} else if (!silent)
-		ntfs_error(sb, read_err_str, "primary");
-	if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) {
-		if (bh_primary)
-			brelse(bh_primary);
-		if (!silent)
-			ntfs_error(sb, "Mount option errors=recover not used. "
-					"Aborting without trying to recover.");
-		return NULL;
-	}
-	/* Try to read NT4+ backup boot sector. */
-	if ((bh_backup = sb_bread(sb, nr_blocks - 1))) {
-		if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
-				bh_backup->b_data, silent))
-			goto hotfix_primary_boot_sector;
-		brelse(bh_backup);
-	} else if (!silent)
-		ntfs_error(sb, read_err_str, "backup");
-	/* Try to read NT3.51- backup boot sector. */
-	if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) {
-		if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
-				bh_backup->b_data, silent))
-			goto hotfix_primary_boot_sector;
-		if (!silent)
-			ntfs_error(sb, "Could not find a valid backup boot "
-					"sector.");
-		brelse(bh_backup);
-	} else if (!silent)
-		ntfs_error(sb, read_err_str, "backup");
-	/* We failed. Cleanup and return. */
-	if (bh_primary)
-		brelse(bh_primary);
-	return NULL;
-hotfix_primary_boot_sector:
-	if (bh_primary) {
-		/*
-		 * If we managed to read sector zero and the volume is not
-		 * read-only, copy the found, valid backup boot sector to the
-		 * primary boot sector.  Note we only copy the actual boot
-		 * sector structure, not the actual whole device sector as that
-		 * may be bigger and would potentially damage the $Boot system
-		 * file (FIXME: Would be nice to know if the backup boot sector
-		 * on a large sector device contains the whole boot loader or
-		 * just the first 512 bytes).
-		 */
-		if (!sb_rdonly(sb)) {
-			ntfs_warning(sb, "Hot-fix: Recovering invalid primary "
-					"boot sector from backup copy.");
-			memcpy(bh_primary->b_data, bh_backup->b_data,
-					NTFS_BLOCK_SIZE);
-			mark_buffer_dirty(bh_primary);
-			sync_dirty_buffer(bh_primary);
-			if (buffer_uptodate(bh_primary)) {
-				brelse(bh_backup);
-				return bh_primary;
-			}
-			ntfs_error(sb, "Hot-fix: Device write error while "
-					"recovering primary boot sector.");
-		} else {
-			ntfs_warning(sb, "Hot-fix: Recovery of primary boot "
-					"sector failed: Read-only mount.");
-		}
-		brelse(bh_primary);
-	}
-	ntfs_warning(sb, "Using backup boot sector.");
-	return bh_backup;
-}
-
-/**
- * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol
- * @vol:	volume structure to initialise with data from boot sector
- * @b:		boot sector to parse
- *
- * Parse the ntfs boot sector @b and store all imporant information therein in
- * the ntfs super block @vol.  Return 'true' on success and 'false' on error.
- */
-static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
-{
-	unsigned int sectors_per_cluster_bits, nr_hidden_sects;
-	int clusters_per_mft_record, clusters_per_index_record;
-	s64 ll;
-
-	vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector);
-	vol->sector_size_bits = ffs(vol->sector_size) - 1;
-	ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size,
-			vol->sector_size);
-	ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits,
-			vol->sector_size_bits);
-	if (vol->sector_size < vol->sb->s_blocksize) {
-		ntfs_error(vol->sb, "Sector size (%i) is smaller than the "
-				"device block size (%lu).  This is not "
-				"supported.  Sorry.", vol->sector_size,
-				vol->sb->s_blocksize);
-		return false;
-	}
-	ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster);
-	sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1;
-	ntfs_debug("sectors_per_cluster_bits = 0x%x",
-			sectors_per_cluster_bits);
-	nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors);
-	ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects);
-	vol->cluster_size = vol->sector_size << sectors_per_cluster_bits;
-	vol->cluster_size_mask = vol->cluster_size - 1;
-	vol->cluster_size_bits = ffs(vol->cluster_size) - 1;
-	ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size,
-			vol->cluster_size);
-	ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask);
-	ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits);
-	if (vol->cluster_size < vol->sector_size) {
-		ntfs_error(vol->sb, "Cluster size (%i) is smaller than the "
-				"sector size (%i).  This is not supported.  "
-				"Sorry.", vol->cluster_size, vol->sector_size);
-		return false;
-	}
-	clusters_per_mft_record = b->clusters_per_mft_record;
-	ntfs_debug("clusters_per_mft_record = %i (0x%x)",
-			clusters_per_mft_record, clusters_per_mft_record);
-	if (clusters_per_mft_record > 0)
-		vol->mft_record_size = vol->cluster_size <<
-				(ffs(clusters_per_mft_record) - 1);
-	else
-		/*
-		 * When mft_record_size < cluster_size, clusters_per_mft_record
-		 * = -log2(mft_record_size) bytes. mft_record_size normaly is
-		 * 1024 bytes, which is encoded as 0xF6 (-10 in decimal).
-		 */
-		vol->mft_record_size = 1 << -clusters_per_mft_record;
-	vol->mft_record_size_mask = vol->mft_record_size - 1;
-	vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1;
-	ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size,
-			vol->mft_record_size);
-	ntfs_debug("vol->mft_record_size_mask = 0x%x",
-			vol->mft_record_size_mask);
-	ntfs_debug("vol->mft_record_size_bits = %i (0x%x)",
-			vol->mft_record_size_bits, vol->mft_record_size_bits);
-	/*
-	 * We cannot support mft record sizes above the PAGE_SIZE since
-	 * we store $MFT/$DATA, the table of mft records in the page cache.
-	 */
-	if (vol->mft_record_size > PAGE_SIZE) {
-		ntfs_error(vol->sb, "Mft record size (%i) exceeds the "
-				"PAGE_SIZE on your system (%lu).  "
-				"This is not supported.  Sorry.",
-				vol->mft_record_size, PAGE_SIZE);
-		return false;
-	}
-	/* We cannot support mft record sizes below the sector size. */
-	if (vol->mft_record_size < vol->sector_size) {
-		ntfs_error(vol->sb, "Mft record size (%i) is smaller than the "
-				"sector size (%i).  This is not supported.  "
-				"Sorry.", vol->mft_record_size,
-				vol->sector_size);
-		return false;
-	}
-	clusters_per_index_record = b->clusters_per_index_record;
-	ntfs_debug("clusters_per_index_record = %i (0x%x)",
-			clusters_per_index_record, clusters_per_index_record);
-	if (clusters_per_index_record > 0)
-		vol->index_record_size = vol->cluster_size <<
-				(ffs(clusters_per_index_record) - 1);
-	else
-		/*
-		 * When index_record_size < cluster_size,
-		 * clusters_per_index_record = -log2(index_record_size) bytes.
-		 * index_record_size normaly equals 4096 bytes, which is
-		 * encoded as 0xF4 (-12 in decimal).
-		 */
-		vol->index_record_size = 1 << -clusters_per_index_record;
-	vol->index_record_size_mask = vol->index_record_size - 1;
-	vol->index_record_size_bits = ffs(vol->index_record_size) - 1;
-	ntfs_debug("vol->index_record_size = %i (0x%x)",
-			vol->index_record_size, vol->index_record_size);
-	ntfs_debug("vol->index_record_size_mask = 0x%x",
-			vol->index_record_size_mask);
-	ntfs_debug("vol->index_record_size_bits = %i (0x%x)",
-			vol->index_record_size_bits,
-			vol->index_record_size_bits);
-	/* We cannot support index record sizes below the sector size. */
-	if (vol->index_record_size < vol->sector_size) {
-		ntfs_error(vol->sb, "Index record size (%i) is smaller than "
-				"the sector size (%i).  This is not "
-				"supported.  Sorry.", vol->index_record_size,
-				vol->sector_size);
-		return false;
-	}
-	/*
-	 * Get the size of the volume in clusters and check for 64-bit-ness.
-	 * Windows currently only uses 32 bits to save the clusters so we do
-	 * the same as it is much faster on 32-bit CPUs.
-	 */
-	ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits;
-	if ((u64)ll >= 1ULL << 32) {
-		ntfs_error(vol->sb, "Cannot handle 64-bit clusters.  Sorry.");
-		return false;
-	}
-	vol->nr_clusters = ll;
-	ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters);
-	/*
-	 * On an architecture where unsigned long is 32-bits, we restrict the
-	 * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler
-	 * will hopefully optimize the whole check away.
-	 */
-	if (sizeof(unsigned long) < 8) {
-		if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) {
-			ntfs_error(vol->sb, "Volume size (%lluTiB) is too "
-					"large for this architecture.  "
-					"Maximum supported is 2TiB.  Sorry.",
-					(unsigned long long)ll >> (40 -
-					vol->cluster_size_bits));
-			return false;
-		}
-	}
-	ll = sle64_to_cpu(b->mft_lcn);
-	if (ll >= vol->nr_clusters) {
-		ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of "
-				"volume.  Weird.", (unsigned long long)ll,
-				(unsigned long long)ll);
-		return false;
-	}
-	vol->mft_lcn = ll;
-	ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn);
-	ll = sle64_to_cpu(b->mftmirr_lcn);
-	if (ll >= vol->nr_clusters) {
-		ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end "
-				"of volume.  Weird.", (unsigned long long)ll,
-				(unsigned long long)ll);
-		return false;
-	}
-	vol->mftmirr_lcn = ll;
-	ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn);
-#ifdef NTFS_RW
-	/*
-	 * Work out the size of the mft mirror in number of mft records. If the
-	 * cluster size is less than or equal to the size taken by four mft
-	 * records, the mft mirror stores the first four mft records. If the
-	 * cluster size is bigger than the size taken by four mft records, the
-	 * mft mirror contains as many mft records as will fit into one
-	 * cluster.
-	 */
-	if (vol->cluster_size <= (4 << vol->mft_record_size_bits))
-		vol->mftmirr_size = 4;
-	else
-		vol->mftmirr_size = vol->cluster_size >>
-				vol->mft_record_size_bits;
-	ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size);
-#endif /* NTFS_RW */
-	vol->serial_no = le64_to_cpu(b->volume_serial_number);
-	ntfs_debug("vol->serial_no = 0x%llx",
-			(unsigned long long)vol->serial_no);
-	return true;
-}
-
-/**
- * ntfs_setup_allocators - initialize the cluster and mft allocators
- * @vol:	volume structure for which to setup the allocators
- *
- * Setup the cluster (lcn) and mft allocators to the starting values.
- */
-static void ntfs_setup_allocators(ntfs_volume *vol)
-{
-#ifdef NTFS_RW
-	LCN mft_zone_size, mft_lcn;
-#endif /* NTFS_RW */
-
-	ntfs_debug("vol->mft_zone_multiplier = 0x%x",
-			vol->mft_zone_multiplier);
-#ifdef NTFS_RW
-	/* Determine the size of the MFT zone. */
-	mft_zone_size = vol->nr_clusters;
-	switch (vol->mft_zone_multiplier) {  /* % of volume size in clusters */
-	case 4:
-		mft_zone_size >>= 1;			/* 50%   */
-		break;
-	case 3:
-		mft_zone_size = (mft_zone_size +
-				(mft_zone_size >> 1)) >> 2;	/* 37.5% */
-		break;
-	case 2:
-		mft_zone_size >>= 2;			/* 25%   */
-		break;
-	/* case 1: */
-	default:
-		mft_zone_size >>= 3;			/* 12.5% */
-		break;
-	}
-	/* Setup the mft zone. */
-	vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn;
-	ntfs_debug("vol->mft_zone_pos = 0x%llx",
-			(unsigned long long)vol->mft_zone_pos);
-	/*
-	 * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs
-	 * source) and if the actual mft_lcn is in the expected place or even
-	 * further to the front of the volume, extend the mft_zone to cover the
-	 * beginning of the volume as well.  This is in order to protect the
-	 * area reserved for the mft bitmap as well within the mft_zone itself.
-	 * On non-standard volumes we do not protect it as the overhead would
-	 * be higher than the speed increase we would get by doing it.
-	 */
-	mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size;
-	if (mft_lcn * vol->cluster_size < 16 * 1024)
-		mft_lcn = (16 * 1024 + vol->cluster_size - 1) /
-				vol->cluster_size;
-	if (vol->mft_zone_start <= mft_lcn)
-		vol->mft_zone_start = 0;
-	ntfs_debug("vol->mft_zone_start = 0x%llx",
-			(unsigned long long)vol->mft_zone_start);
-	/*
-	 * Need to cap the mft zone on non-standard volumes so that it does
-	 * not point outside the boundaries of the volume.  We do this by
-	 * halving the zone size until we are inside the volume.
-	 */
-	vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
-	while (vol->mft_zone_end >= vol->nr_clusters) {
-		mft_zone_size >>= 1;
-		vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
-	}
-	ntfs_debug("vol->mft_zone_end = 0x%llx",
-			(unsigned long long)vol->mft_zone_end);
-	/*
-	 * Set the current position within each data zone to the start of the
-	 * respective zone.
-	 */
-	vol->data1_zone_pos = vol->mft_zone_end;
-	ntfs_debug("vol->data1_zone_pos = 0x%llx",
-			(unsigned long long)vol->data1_zone_pos);
-	vol->data2_zone_pos = 0;
-	ntfs_debug("vol->data2_zone_pos = 0x%llx",
-			(unsigned long long)vol->data2_zone_pos);
-
-	/* Set the mft data allocation position to mft record 24. */
-	vol->mft_data_pos = 24;
-	ntfs_debug("vol->mft_data_pos = 0x%llx",
-			(unsigned long long)vol->mft_data_pos);
-#endif /* NTFS_RW */
-}
-
-#ifdef NTFS_RW
-
-/**
- * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume
- * @vol:	ntfs super block describing device whose mft mirror to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_init_mft_mirror(ntfs_volume *vol)
-{
-	struct inode *tmp_ino;
-	ntfs_inode *tmp_ni;
-
-	ntfs_debug("Entering.");
-	/* Get mft mirror inode. */
-	tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr);
-	if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
-		if (!IS_ERR(tmp_ino))
-			iput(tmp_ino);
-		/* Caller will display error message. */
-		return false;
-	}
-	/*
-	 * Re-initialize some specifics about $MFTMirr's inode as
-	 * ntfs_read_inode() will have set up the default ones.
-	 */
-	/* Set uid and gid to root. */
-	tmp_ino->i_uid = GLOBAL_ROOT_UID;
-	tmp_ino->i_gid = GLOBAL_ROOT_GID;
-	/* Regular file.  No access for anyone. */
-	tmp_ino->i_mode = S_IFREG;
-	/* No VFS initiated operations allowed for $MFTMirr. */
-	tmp_ino->i_op = &ntfs_empty_inode_ops;
-	tmp_ino->i_fop = &ntfs_empty_file_ops;
-	/* Put in our special address space operations. */
-	tmp_ino->i_mapping->a_ops = &ntfs_mst_aops;
-	tmp_ni = NTFS_I(tmp_ino);
-	/* The $MFTMirr, like the $MFT is multi sector transfer protected. */
-	NInoSetMstProtected(tmp_ni);
-	NInoSetSparseDisabled(tmp_ni);
-	/*
-	 * Set up our little cheat allowing us to reuse the async read io
-	 * completion handler for directories.
-	 */
-	tmp_ni->itype.index.block_size = vol->mft_record_size;
-	tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits;
-	vol->mftmirr_ino = tmp_ino;
-	ntfs_debug("Done.");
-	return true;
-}
-
-/**
- * check_mft_mirror - compare contents of the mft mirror with the mft
- * @vol:	ntfs super block describing device whose mft mirror to check
- *
- * Return 'true' on success or 'false' on error.
- *
- * Note, this function also results in the mft mirror runlist being completely
- * mapped into memory.  The mft mirror write code requires this and will BUG()
- * should it find an unmapped runlist element.
- */
-static bool check_mft_mirror(ntfs_volume *vol)
-{
-	struct super_block *sb = vol->sb;
-	ntfs_inode *mirr_ni;
-	struct page *mft_page, *mirr_page;
-	u8 *kmft, *kmirr;
-	runlist_element *rl, rl2[2];
-	pgoff_t index;
-	int mrecs_per_page, i;
-
-	ntfs_debug("Entering.");
-	/* Compare contents of $MFT and $MFTMirr. */
-	mrecs_per_page = PAGE_SIZE / vol->mft_record_size;
-	BUG_ON(!mrecs_per_page);
-	BUG_ON(!vol->mftmirr_size);
-	mft_page = mirr_page = NULL;
-	kmft = kmirr = NULL;
-	index = i = 0;
-	do {
-		u32 bytes;
-
-		/* Switch pages if necessary. */
-		if (!(i % mrecs_per_page)) {
-			if (index) {
-				ntfs_unmap_page(mft_page);
-				ntfs_unmap_page(mirr_page);
-			}
-			/* Get the $MFT page. */
-			mft_page = ntfs_map_page(vol->mft_ino->i_mapping,
-					index);
-			if (IS_ERR(mft_page)) {
-				ntfs_error(sb, "Failed to read $MFT.");
-				return false;
-			}
-			kmft = page_address(mft_page);
-			/* Get the $MFTMirr page. */
-			mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping,
-					index);
-			if (IS_ERR(mirr_page)) {
-				ntfs_error(sb, "Failed to read $MFTMirr.");
-				goto mft_unmap_out;
-			}
-			kmirr = page_address(mirr_page);
-			++index;
-		}
-		/* Do not check the record if it is not in use. */
-		if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) {
-			/* Make sure the record is ok. */
-			if (ntfs_is_baad_recordp((le32*)kmft)) {
-				ntfs_error(sb, "Incomplete multi sector "
-						"transfer detected in mft "
-						"record %i.", i);
-mm_unmap_out:
-				ntfs_unmap_page(mirr_page);
-mft_unmap_out:
-				ntfs_unmap_page(mft_page);
-				return false;
-			}
-		}
-		/* Do not check the mirror record if it is not in use. */
-		if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) {
-			if (ntfs_is_baad_recordp((le32*)kmirr)) {
-				ntfs_error(sb, "Incomplete multi sector "
-						"transfer detected in mft "
-						"mirror record %i.", i);
-				goto mm_unmap_out;
-			}
-		}
-		/* Get the amount of data in the current record. */
-		bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use);
-		if (bytes < sizeof(MFT_RECORD_OLD) ||
-				bytes > vol->mft_record_size ||
-				ntfs_is_baad_recordp((le32*)kmft)) {
-			bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use);
-			if (bytes < sizeof(MFT_RECORD_OLD) ||
-					bytes > vol->mft_record_size ||
-					ntfs_is_baad_recordp((le32*)kmirr))
-				bytes = vol->mft_record_size;
-		}
-		/* Compare the two records. */
-		if (memcmp(kmft, kmirr, bytes)) {
-			ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not "
-					"match.  Run ntfsfix or chkdsk.", i);
-			goto mm_unmap_out;
-		}
-		kmft += vol->mft_record_size;
-		kmirr += vol->mft_record_size;
-	} while (++i < vol->mftmirr_size);
-	/* Release the last pages. */
-	ntfs_unmap_page(mft_page);
-	ntfs_unmap_page(mirr_page);
-
-	/* Construct the mft mirror runlist by hand. */
-	rl2[0].vcn = 0;
-	rl2[0].lcn = vol->mftmirr_lcn;
-	rl2[0].length = (vol->mftmirr_size * vol->mft_record_size +
-			vol->cluster_size - 1) / vol->cluster_size;
-	rl2[1].vcn = rl2[0].length;
-	rl2[1].lcn = LCN_ENOENT;
-	rl2[1].length = 0;
-	/*
-	 * Because we have just read all of the mft mirror, we know we have
-	 * mapped the full runlist for it.
-	 */
-	mirr_ni = NTFS_I(vol->mftmirr_ino);
-	down_read(&mirr_ni->runlist.lock);
-	rl = mirr_ni->runlist.rl;
-	/* Compare the two runlists.  They must be identical. */
-	i = 0;
-	do {
-		if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn ||
-				rl2[i].length != rl[i].length) {
-			ntfs_error(sb, "$MFTMirr location mismatch.  "
-					"Run chkdsk.");
-			up_read(&mirr_ni->runlist.lock);
-			return false;
-		}
-	} while (rl2[i++].length);
-	up_read(&mirr_ni->runlist.lock);
-	ntfs_debug("Done.");
-	return true;
-}
-
-/**
- * load_and_check_logfile - load and check the logfile inode for a volume
- * @vol:	ntfs super block describing device whose logfile to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_check_logfile(ntfs_volume *vol,
-		RESTART_PAGE_HEADER **rp)
-{
-	struct inode *tmp_ino;
-
-	ntfs_debug("Entering.");
-	tmp_ino = ntfs_iget(vol->sb, FILE_LogFile);
-	if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
-		if (!IS_ERR(tmp_ino))
-			iput(tmp_ino);
-		/* Caller will display error message. */
-		return false;
-	}
-	if (!ntfs_check_logfile(tmp_ino, rp)) {
-		iput(tmp_ino);
-		/* ntfs_check_logfile() will have displayed error output. */
-		return false;
-	}
-	NInoSetSparseDisabled(NTFS_I(tmp_ino));
-	vol->logfile_ino = tmp_ino;
-	ntfs_debug("Done.");
-	return true;
-}
-
-#define NTFS_HIBERFIL_HEADER_SIZE	4096
-
-/**
- * check_windows_hibernation_status - check if Windows is suspended on a volume
- * @vol:	ntfs super block of device to check
- *
- * Check if Windows is hibernated on the ntfs volume @vol.  This is done by
- * looking for the file hiberfil.sys in the root directory of the volume.  If
- * the file is not present Windows is definitely not suspended.
- *
- * If hiberfil.sys exists and is less than 4kiB in size it means Windows is
- * definitely suspended (this volume is not the system volume).  Caveat:  on a
- * system with many volumes it is possible that the < 4kiB check is bogus but
- * for now this should do fine.
- *
- * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the
- * hiberfil header (which is the first 4kiB).  If this begins with "hibr",
- * Windows is definitely suspended.  If it is completely full of zeroes,
- * Windows is definitely not hibernated.  Any other case is treated as if
- * Windows is suspended.  This caters for the above mentioned caveat of a
- * system with many volumes where no "hibr" magic would be present and there is
- * no zero header.
- *
- * Return 0 if Windows is not hibernated on the volume, >0 if Windows is
- * hibernated on the volume, and -errno on error.
- */
-static int check_windows_hibernation_status(ntfs_volume *vol)
-{
-	MFT_REF mref;
-	struct inode *vi;
-	struct page *page;
-	u32 *kaddr, *kend;
-	ntfs_name *name = NULL;
-	int ret = 1;
-	static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
-			cpu_to_le16('i'), cpu_to_le16('b'),
-			cpu_to_le16('e'), cpu_to_le16('r'),
-			cpu_to_le16('f'), cpu_to_le16('i'),
-			cpu_to_le16('l'), cpu_to_le16('.'),
-			cpu_to_le16('s'), cpu_to_le16('y'),
-			cpu_to_le16('s'), 0 };
-
-	ntfs_debug("Entering.");
-	/*
-	 * Find the inode number for the hibernation file by looking up the
-	 * filename hiberfil.sys in the root directory.
-	 */
-	inode_lock(vol->root_ino);
-	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12,
-			&name);
-	inode_unlock(vol->root_ino);
-	if (IS_ERR_MREF(mref)) {
-		ret = MREF_ERR(mref);
-		/* If the file does not exist, Windows is not hibernated. */
-		if (ret == -ENOENT) {
-			ntfs_debug("hiberfil.sys not present.  Windows is not "
-					"hibernated on the volume.");
-			return 0;
-		}
-		/* A real error occurred. */
-		ntfs_error(vol->sb, "Failed to find inode number for "
-				"hiberfil.sys.");
-		return ret;
-	}
-	/* We do not care for the type of match that was found. */
-	kfree(name);
-	/* Get the inode. */
-	vi = ntfs_iget(vol->sb, MREF(mref));
-	if (IS_ERR(vi) || is_bad_inode(vi)) {
-		if (!IS_ERR(vi))
-			iput(vi);
-		ntfs_error(vol->sb, "Failed to load hiberfil.sys.");
-		return IS_ERR(vi) ? PTR_ERR(vi) : -EIO;
-	}
-	if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) {
-		ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx).  "
-				"Windows is hibernated on the volume.  This "
-				"is not the system volume.", i_size_read(vi));
-		goto iput_out;
-	}
-	page = ntfs_map_page(vi->i_mapping, 0);
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Failed to read from hiberfil.sys.");
-		ret = PTR_ERR(page);
-		goto iput_out;
-	}
-	kaddr = (u32*)page_address(page);
-	if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
-		ntfs_debug("Magic \"hibr\" found in hiberfil.sys.  Windows is "
-				"hibernated on the volume.  This is the "
-				"system volume.");
-		goto unm_iput_out;
-	}
-	kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr);
-	do {
-		if (unlikely(*kaddr)) {
-			ntfs_debug("hiberfil.sys is larger than 4kiB "
-					"(0x%llx), does not contain the "
-					"\"hibr\" magic, and does not have a "
-					"zero header.  Windows is hibernated "
-					"on the volume.  This is not the "
-					"system volume.", i_size_read(vi));
-			goto unm_iput_out;
-		}
-	} while (++kaddr < kend);
-	ntfs_debug("hiberfil.sys contains a zero header.  Windows is not "
-			"hibernated on the volume.  This is the system "
-			"volume.");
-	ret = 0;
-unm_iput_out:
-	ntfs_unmap_page(page);
-iput_out:
-	iput(vi);
-	return ret;
-}
-
-/**
- * load_and_init_quota - load and setup the quota file for a volume if present
- * @vol:	ntfs super block describing device whose quota file to load
- *
- * Return 'true' on success or 'false' on error.  If $Quota is not present, we
- * leave vol->quota_ino as NULL and return success.
- */
-static bool load_and_init_quota(ntfs_volume *vol)
-{
-	MFT_REF mref;
-	struct inode *tmp_ino;
-	ntfs_name *name = NULL;
-	static const ntfschar Quota[7] = { cpu_to_le16('$'),
-			cpu_to_le16('Q'), cpu_to_le16('u'),
-			cpu_to_le16('o'), cpu_to_le16('t'),
-			cpu_to_le16('a'), 0 };
-	static ntfschar Q[3] = { cpu_to_le16('$'),
-			cpu_to_le16('Q'), 0 };
-
-	ntfs_debug("Entering.");
-	/*
-	 * Find the inode number for the quota file by looking up the filename
-	 * $Quota in the extended system files directory $Extend.
-	 */
-	inode_lock(vol->extend_ino);
-	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
-			&name);
-	inode_unlock(vol->extend_ino);
-	if (IS_ERR_MREF(mref)) {
-		/*
-		 * If the file does not exist, quotas are disabled and have
-		 * never been enabled on this volume, just return success.
-		 */
-		if (MREF_ERR(mref) == -ENOENT) {
-			ntfs_debug("$Quota not present.  Volume does not have "
-					"quotas enabled.");
-			/*
-			 * No need to try to set quotas out of date if they are
-			 * not enabled.
-			 */
-			NVolSetQuotaOutOfDate(vol);
-			return true;
-		}
-		/* A real error occurred. */
-		ntfs_error(vol->sb, "Failed to find inode number for $Quota.");
-		return false;
-	}
-	/* We do not care for the type of match that was found. */
-	kfree(name);
-	/* Get the inode. */
-	tmp_ino = ntfs_iget(vol->sb, MREF(mref));
-	if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
-		if (!IS_ERR(tmp_ino))
-			iput(tmp_ino);
-		ntfs_error(vol->sb, "Failed to load $Quota.");
-		return false;
-	}
-	vol->quota_ino = tmp_ino;
-	/* Get the $Q index allocation attribute. */
-	tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2);
-	if (IS_ERR(tmp_ino)) {
-		ntfs_error(vol->sb, "Failed to load $Quota/$Q index.");
-		return false;
-	}
-	vol->quota_q_ino = tmp_ino;
-	ntfs_debug("Done.");
-	return true;
-}
-
-/**
- * load_and_init_usnjrnl - load and setup the transaction log if present
- * @vol:	ntfs super block describing device whose usnjrnl file to load
- *
- * Return 'true' on success or 'false' on error.
- *
- * If $UsnJrnl is not present or in the process of being disabled, we set
- * NVolUsnJrnlStamped() and return success.
- *
- * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn,
- * i.e. transaction logging has only just been enabled or the journal has been
- * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped()
- * and return success.
- */
-static bool load_and_init_usnjrnl(ntfs_volume *vol)
-{
-	MFT_REF mref;
-	struct inode *tmp_ino;
-	ntfs_inode *tmp_ni;
-	struct page *page;
-	ntfs_name *name = NULL;
-	USN_HEADER *uh;
-	static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
-			cpu_to_le16('U'), cpu_to_le16('s'),
-			cpu_to_le16('n'), cpu_to_le16('J'),
-			cpu_to_le16('r'), cpu_to_le16('n'),
-			cpu_to_le16('l'), 0 };
-	static ntfschar Max[5] = { cpu_to_le16('$'),
-			cpu_to_le16('M'), cpu_to_le16('a'),
-			cpu_to_le16('x'), 0 };
-	static ntfschar J[3] = { cpu_to_le16('$'),
-			cpu_to_le16('J'), 0 };
-
-	ntfs_debug("Entering.");
-	/*
-	 * Find the inode number for the transaction log file by looking up the
-	 * filename $UsnJrnl in the extended system files directory $Extend.
-	 */
-	inode_lock(vol->extend_ino);
-	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8,
-			&name);
-	inode_unlock(vol->extend_ino);
-	if (IS_ERR_MREF(mref)) {
-		/*
-		 * If the file does not exist, transaction logging is disabled,
-		 * just return success.
-		 */
-		if (MREF_ERR(mref) == -ENOENT) {
-			ntfs_debug("$UsnJrnl not present.  Volume does not "
-					"have transaction logging enabled.");
-not_enabled:
-			/*
-			 * No need to try to stamp the transaction log if
-			 * transaction logging is not enabled.
-			 */
-			NVolSetUsnJrnlStamped(vol);
-			return true;
-		}
-		/* A real error occurred. */
-		ntfs_error(vol->sb, "Failed to find inode number for "
-				"$UsnJrnl.");
-		return false;
-	}
-	/* We do not care for the type of match that was found. */
-	kfree(name);
-	/* Get the inode. */
-	tmp_ino = ntfs_iget(vol->sb, MREF(mref));
-	if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) {
-		if (!IS_ERR(tmp_ino))
-			iput(tmp_ino);
-		ntfs_error(vol->sb, "Failed to load $UsnJrnl.");
-		return false;
-	}
-	vol->usnjrnl_ino = tmp_ino;
-	/*
-	 * If the transaction log is in the process of being deleted, we can
-	 * ignore it.
-	 */
-	if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) {
-		ntfs_debug("$UsnJrnl in the process of being disabled.  "
-				"Volume does not have transaction logging "
-				"enabled.");
-		goto not_enabled;
-	}
-	/* Get the $DATA/$Max attribute. */
-	tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4);
-	if (IS_ERR(tmp_ino)) {
-		ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max "
-				"attribute.");
-		return false;
-	}
-	vol->usnjrnl_max_ino = tmp_ino;
-	if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) {
-		ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max "
-				"attribute (size is 0x%llx but should be at "
-				"least 0x%zx bytes).", i_size_read(tmp_ino),
-				sizeof(USN_HEADER));
-		return false;
-	}
-	/* Get the $DATA/$J attribute. */
-	tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2);
-	if (IS_ERR(tmp_ino)) {
-		ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J "
-				"attribute.");
-		return false;
-	}
-	vol->usnjrnl_j_ino = tmp_ino;
-	/* Verify $J is non-resident and sparse. */
-	tmp_ni = NTFS_I(vol->usnjrnl_j_ino);
-	if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) {
-		ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident "
-				"and/or not sparse.");
-		return false;
-	}
-	/* Read the USN_HEADER from $DATA/$Max. */
-	page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max "
-				"attribute.");
-		return false;
-	}
-	uh = (USN_HEADER*)page_address(page);
-	/* Sanity check the $Max. */
-	if (unlikely(sle64_to_cpu(uh->allocation_delta) >
-			sle64_to_cpu(uh->maximum_size))) {
-		ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds "
-				"maximum size (0x%llx).  $UsnJrnl is corrupt.",
-				(long long)sle64_to_cpu(uh->allocation_delta),
-				(long long)sle64_to_cpu(uh->maximum_size));
-		ntfs_unmap_page(page);
-		return false;
-	}
-	/*
-	 * If the transaction log has been stamped and nothing has been written
-	 * to it since, we do not need to stamp it.
-	 */
-	if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >=
-			i_size_read(vol->usnjrnl_j_ino))) {
-		if (likely(sle64_to_cpu(uh->lowest_valid_usn) ==
-				i_size_read(vol->usnjrnl_j_ino))) {
-			ntfs_unmap_page(page);
-			ntfs_debug("$UsnJrnl is enabled but nothing has been "
-					"logged since it was last stamped.  "
-					"Treating this as if the volume does "
-					"not have transaction logging "
-					"enabled.");
-			goto not_enabled;
-		}
-		ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) "
-				"which is out of bounds (0x%llx).  $UsnJrnl "
-				"is corrupt.",
-				(long long)sle64_to_cpu(uh->lowest_valid_usn),
-				i_size_read(vol->usnjrnl_j_ino));
-		ntfs_unmap_page(page);
-		return false;
-	}
-	ntfs_unmap_page(page);
-	ntfs_debug("Done.");
-	return true;
-}
-
-/**
- * load_and_init_attrdef - load the attribute definitions table for a volume
- * @vol:	ntfs super block describing device whose attrdef to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_init_attrdef(ntfs_volume *vol)
-{
-	loff_t i_size;
-	struct super_block *sb = vol->sb;
-	struct inode *ino;
-	struct page *page;
-	pgoff_t index, max_index;
-	unsigned int size;
-
-	ntfs_debug("Entering.");
-	/* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */
-	ino = ntfs_iget(sb, FILE_AttrDef);
-	if (IS_ERR(ino) || is_bad_inode(ino)) {
-		if (!IS_ERR(ino))
-			iput(ino);
-		goto failed;
-	}
-	NInoSetSparseDisabled(NTFS_I(ino));
-	/* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */
-	i_size = i_size_read(ino);
-	if (i_size <= 0 || i_size > 0x7fffffff)
-		goto iput_failed;
-	vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size);
-	if (!vol->attrdef)
-		goto iput_failed;
-	index = 0;
-	max_index = i_size >> PAGE_SHIFT;
-	size = PAGE_SIZE;
-	while (index < max_index) {
-		/* Read the attrdef table and copy it into the linear buffer. */
-read_partial_attrdef_page:
-		page = ntfs_map_page(ino->i_mapping, index);
-		if (IS_ERR(page))
-			goto free_iput_failed;
-		memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT),
-				page_address(page), size);
-		ntfs_unmap_page(page);
-	}
-	if (size == PAGE_SIZE) {
-		size = i_size & ~PAGE_MASK;
-		if (size)
-			goto read_partial_attrdef_page;
-	}
-	vol->attrdef_size = i_size;
-	ntfs_debug("Read %llu bytes from $AttrDef.", i_size);
-	iput(ino);
-	return true;
-free_iput_failed:
-	ntfs_free(vol->attrdef);
-	vol->attrdef = NULL;
-iput_failed:
-	iput(ino);
-failed:
-	ntfs_error(sb, "Failed to initialize attribute definition table.");
-	return false;
-}
-
-#endif /* NTFS_RW */
-
-/**
- * load_and_init_upcase - load the upcase table for an ntfs volume
- * @vol:	ntfs super block describing device whose upcase to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_init_upcase(ntfs_volume *vol)
-{
-	loff_t i_size;
-	struct super_block *sb = vol->sb;
-	struct inode *ino;
-	struct page *page;
-	pgoff_t index, max_index;
-	unsigned int size;
-	int i, max;
-
-	ntfs_debug("Entering.");
-	/* Read upcase table and setup vol->upcase and vol->upcase_len. */
-	ino = ntfs_iget(sb, FILE_UpCase);
-	if (IS_ERR(ino) || is_bad_inode(ino)) {
-		if (!IS_ERR(ino))
-			iput(ino);
-		goto upcase_failed;
-	}
-	/*
-	 * The upcase size must not be above 64k Unicode characters, must not
-	 * be zero and must be a multiple of sizeof(ntfschar).
-	 */
-	i_size = i_size_read(ino);
-	if (!i_size || i_size & (sizeof(ntfschar) - 1) ||
-			i_size > 64ULL * 1024 * sizeof(ntfschar))
-		goto iput_upcase_failed;
-	vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size);
-	if (!vol->upcase)
-		goto iput_upcase_failed;
-	index = 0;
-	max_index = i_size >> PAGE_SHIFT;
-	size = PAGE_SIZE;
-	while (index < max_index) {
-		/* Read the upcase table and copy it into the linear buffer. */
-read_partial_upcase_page:
-		page = ntfs_map_page(ino->i_mapping, index);
-		if (IS_ERR(page))
-			goto iput_upcase_failed;
-		memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT),
-				page_address(page), size);
-		ntfs_unmap_page(page);
-	}
-	if (size == PAGE_SIZE) {
-		size = i_size & ~PAGE_MASK;
-		if (size)
-			goto read_partial_upcase_page;
-	}
-	vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS;
-	ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).",
-			i_size, 64 * 1024 * sizeof(ntfschar));
-	iput(ino);
-	mutex_lock(&ntfs_lock);
-	if (!default_upcase) {
-		ntfs_debug("Using volume specified $UpCase since default is "
-				"not present.");
-		mutex_unlock(&ntfs_lock);
-		return true;
-	}
-	max = default_upcase_len;
-	if (max > vol->upcase_len)
-		max = vol->upcase_len;
-	for (i = 0; i < max; i++)
-		if (vol->upcase[i] != default_upcase[i])
-			break;
-	if (i == max) {
-		ntfs_free(vol->upcase);
-		vol->upcase = default_upcase;
-		vol->upcase_len = max;
-		ntfs_nr_upcase_users++;
-		mutex_unlock(&ntfs_lock);
-		ntfs_debug("Volume specified $UpCase matches default. Using "
-				"default.");
-		return true;
-	}
-	mutex_unlock(&ntfs_lock);
-	ntfs_debug("Using volume specified $UpCase since it does not match "
-			"the default.");
-	return true;
-iput_upcase_failed:
-	iput(ino);
-	ntfs_free(vol->upcase);
-	vol->upcase = NULL;
-upcase_failed:
-	mutex_lock(&ntfs_lock);
-	if (default_upcase) {
-		vol->upcase = default_upcase;
-		vol->upcase_len = default_upcase_len;
-		ntfs_nr_upcase_users++;
-		mutex_unlock(&ntfs_lock);
-		ntfs_error(sb, "Failed to load $UpCase from the volume. Using "
-				"default.");
-		return true;
-	}
-	mutex_unlock(&ntfs_lock);
-	ntfs_error(sb, "Failed to initialize upcase table.");
-	return false;
-}
-
-/*
- * The lcn and mft bitmap inodes are NTFS-internal inodes with
- * their own special locking rules:
- */
-static struct lock_class_key
-	lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key,
-	mftbmp_runlist_lock_key, mftbmp_mrec_lock_key;
-
-/**
- * load_system_files - open the system files using normal functions
- * @vol:	ntfs super block describing device whose system files to load
- *
- * Open the system files with normal access functions and complete setting up
- * the ntfs super block @vol.
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_system_files(ntfs_volume *vol)
-{
-	struct super_block *sb = vol->sb;
-	MFT_RECORD *m;
-	VOLUME_INFORMATION *vi;
-	ntfs_attr_search_ctx *ctx;
-#ifdef NTFS_RW
-	RESTART_PAGE_HEADER *rp;
-	int err;
-#endif /* NTFS_RW */
-
-	ntfs_debug("Entering.");
-#ifdef NTFS_RW
-	/* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */
-	if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) {
-		static const char *es1 = "Failed to load $MFTMirr";
-		static const char *es2 = "$MFTMirr does not match $MFT";
-		static const char *es3 = ".  Run ntfsfix and/or chkdsk.";
-
-		/* If a read-write mount, convert it to a read-only mount. */
-		if (!sb_rdonly(sb)) {
-			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-					ON_ERRORS_CONTINUE))) {
-				ntfs_error(sb, "%s and neither on_errors="
-						"continue nor on_errors="
-						"remount-ro was specified%s",
-						!vol->mftmirr_ino ? es1 : es2,
-						es3);
-				goto iput_mirr_err_out;
-			}
-			sb->s_flags |= SB_RDONLY;
-			ntfs_error(sb, "%s.  Mounting read-only%s",
-					!vol->mftmirr_ino ? es1 : es2, es3);
-		} else
-			ntfs_warning(sb, "%s.  Will not be able to remount "
-					"read-write%s",
-					!vol->mftmirr_ino ? es1 : es2, es3);
-		/* This will prevent a read-write remount. */
-		NVolSetErrors(vol);
-	}
-#endif /* NTFS_RW */
-	/* Get mft bitmap attribute inode. */
-	vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0);
-	if (IS_ERR(vol->mftbmp_ino)) {
-		ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute.");
-		goto iput_mirr_err_out;
-	}
-	lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock,
-			   &mftbmp_runlist_lock_key);
-	lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock,
-			   &mftbmp_mrec_lock_key);
-	/* Read upcase table and setup @vol->upcase and @vol->upcase_len. */
-	if (!load_and_init_upcase(vol))
-		goto iput_mftbmp_err_out;
-#ifdef NTFS_RW
-	/*
-	 * Read attribute definitions table and setup @vol->attrdef and
-	 * @vol->attrdef_size.
-	 */
-	if (!load_and_init_attrdef(vol))
-		goto iput_upcase_err_out;
-#endif /* NTFS_RW */
-	/*
-	 * Get the cluster allocation bitmap inode and verify the size, no
-	 * need for any locking at this stage as we are already running
-	 * exclusively as we are mount in progress task.
-	 */
-	vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap);
-	if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) {
-		if (!IS_ERR(vol->lcnbmp_ino))
-			iput(vol->lcnbmp_ino);
-		goto bitmap_failed;
-	}
-	lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock,
-			   &lcnbmp_runlist_lock_key);
-	lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock,
-			   &lcnbmp_mrec_lock_key);
-
-	NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino));
-	if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) {
-		iput(vol->lcnbmp_ino);
-bitmap_failed:
-		ntfs_error(sb, "Failed to load $Bitmap.");
-		goto iput_attrdef_err_out;
-	}
-	/*
-	 * Get the volume inode and setup our cache of the volume flags and
-	 * version.
-	 */
-	vol->vol_ino = ntfs_iget(sb, FILE_Volume);
-	if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) {
-		if (!IS_ERR(vol->vol_ino))
-			iput(vol->vol_ino);
-volume_failed:
-		ntfs_error(sb, "Failed to load $Volume.");
-		goto iput_lcnbmp_err_out;
-	}
-	m = map_mft_record(NTFS_I(vol->vol_ino));
-	if (IS_ERR(m)) {
-iput_volume_failed:
-		iput(vol->vol_ino);
-		goto volume_failed;
-	}
-	if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) {
-		ntfs_error(sb, "Failed to get attribute search context.");
-		goto get_ctx_vol_failed;
-	}
-	if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
-			ctx) || ctx->attr->non_resident || ctx->attr->flags) {
-err_put_vol:
-		ntfs_attr_put_search_ctx(ctx);
-get_ctx_vol_failed:
-		unmap_mft_record(NTFS_I(vol->vol_ino));
-		goto iput_volume_failed;
-	}
-	vi = (VOLUME_INFORMATION*)((char*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset));
-	/* Some bounds checks. */
-	if ((u8*)vi < (u8*)ctx->attr || (u8*)vi +
-			le32_to_cpu(ctx->attr->data.resident.value_length) >
-			(u8*)ctx->attr + le32_to_cpu(ctx->attr->length))
-		goto err_put_vol;
-	/* Copy the volume flags and version to the ntfs_volume structure. */
-	vol->vol_flags = vi->flags;
-	vol->major_ver = vi->major_ver;
-	vol->minor_ver = vi->minor_ver;
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(NTFS_I(vol->vol_ino));
-	pr_info("volume version %i.%i.\n", vol->major_ver,
-			vol->minor_ver);
-	if (vol->major_ver < 3 && NVolSparseEnabled(vol)) {
-		ntfs_warning(vol->sb, "Disabling sparse support due to NTFS "
-				"volume version %i.%i (need at least version "
-				"3.0).", vol->major_ver, vol->minor_ver);
-		NVolClearSparseEnabled(vol);
-	}
-#ifdef NTFS_RW
-	/* Make sure that no unsupported volume flags are set. */
-	if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
-		static const char *es1a = "Volume is dirty";
-		static const char *es1b = "Volume has been modified by chkdsk";
-		static const char *es1c = "Volume has unsupported flags set";
-		static const char *es2a = ".  Run chkdsk and mount in Windows.";
-		static const char *es2b = ".  Mount in Windows.";
-		const char *es1, *es2;
-
-		es2 = es2a;
-		if (vol->vol_flags & VOLUME_IS_DIRTY)
-			es1 = es1a;
-		else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
-			es1 = es1b;
-			es2 = es2b;
-		} else {
-			es1 = es1c;
-			ntfs_warning(sb, "Unsupported volume flags 0x%x "
-					"encountered.",
-					(unsigned)le16_to_cpu(vol->vol_flags));
-		}
-		/* If a read-write mount, convert it to a read-only mount. */
-		if (!sb_rdonly(sb)) {
-			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-					ON_ERRORS_CONTINUE))) {
-				ntfs_error(sb, "%s and neither on_errors="
-						"continue nor on_errors="
-						"remount-ro was specified%s",
-						es1, es2);
-				goto iput_vol_err_out;
-			}
-			sb->s_flags |= SB_RDONLY;
-			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		} else
-			ntfs_warning(sb, "%s.  Will not be able to remount "
-					"read-write%s", es1, es2);
-		/*
-		 * Do not set NVolErrors() because ntfs_remount() re-checks the
-		 * flags which we need to do in case any flags have changed.
-		 */
-	}
-	/*
-	 * Get the inode for the logfile, check it and determine if the volume
-	 * was shutdown cleanly.
-	 */
-	rp = NULL;
-	if (!load_and_check_logfile(vol, &rp) ||
-			!ntfs_is_logfile_clean(vol->logfile_ino, rp)) {
-		static const char *es1a = "Failed to load $LogFile";
-		static const char *es1b = "$LogFile is not clean";
-		static const char *es2 = ".  Mount in Windows.";
-		const char *es1;
-
-		es1 = !vol->logfile_ino ? es1a : es1b;
-		/* If a read-write mount, convert it to a read-only mount. */
-		if (!sb_rdonly(sb)) {
-			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-					ON_ERRORS_CONTINUE))) {
-				ntfs_error(sb, "%s and neither on_errors="
-						"continue nor on_errors="
-						"remount-ro was specified%s",
-						es1, es2);
-				if (vol->logfile_ino) {
-					BUG_ON(!rp);
-					ntfs_free(rp);
-				}
-				goto iput_logfile_err_out;
-			}
-			sb->s_flags |= SB_RDONLY;
-			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		} else
-			ntfs_warning(sb, "%s.  Will not be able to remount "
-					"read-write%s", es1, es2);
-		/* This will prevent a read-write remount. */
-		NVolSetErrors(vol);
-	}
-	ntfs_free(rp);
-#endif /* NTFS_RW */
-	/* Get the root directory inode so we can do path lookups. */
-	vol->root_ino = ntfs_iget(sb, FILE_root);
-	if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) {
-		if (!IS_ERR(vol->root_ino))
-			iput(vol->root_ino);
-		ntfs_error(sb, "Failed to load root directory.");
-		goto iput_logfile_err_out;
-	}
-#ifdef NTFS_RW
-	/*
-	 * Check if Windows is suspended to disk on the target volume.  If it
-	 * is hibernated, we must not write *anything* to the disk so set
-	 * NVolErrors() without setting the dirty volume flag and mount
-	 * read-only.  This will prevent read-write remounting and it will also
-	 * prevent all writes.
-	 */
-	err = check_windows_hibernation_status(vol);
-	if (unlikely(err)) {
-		static const char *es1a = "Failed to determine if Windows is "
-				"hibernated";
-		static const char *es1b = "Windows is hibernated";
-		static const char *es2 = ".  Run chkdsk.";
-		const char *es1;
-
-		es1 = err < 0 ? es1a : es1b;
-		/* If a read-write mount, convert it to a read-only mount. */
-		if (!sb_rdonly(sb)) {
-			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-					ON_ERRORS_CONTINUE))) {
-				ntfs_error(sb, "%s and neither on_errors="
-						"continue nor on_errors="
-						"remount-ro was specified%s",
-						es1, es2);
-				goto iput_root_err_out;
-			}
-			sb->s_flags |= SB_RDONLY;
-			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		} else
-			ntfs_warning(sb, "%s.  Will not be able to remount "
-					"read-write%s", es1, es2);
-		/* This will prevent a read-write remount. */
-		NVolSetErrors(vol);
-	}
-	/* If (still) a read-write mount, mark the volume dirty. */
-	if (!sb_rdonly(sb) && ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
-		static const char *es1 = "Failed to set dirty bit in volume "
-				"information flags";
-		static const char *es2 = ".  Run chkdsk.";
-
-		/* Convert to a read-only mount. */
-		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-				ON_ERRORS_CONTINUE))) {
-			ntfs_error(sb, "%s and neither on_errors=continue nor "
-					"on_errors=remount-ro was specified%s",
-					es1, es2);
-			goto iput_root_err_out;
-		}
-		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= SB_RDONLY;
-		/*
-		 * Do not set NVolErrors() because ntfs_remount() might manage
-		 * to set the dirty flag in which case all would be well.
-		 */
-	}
-#if 0
-	// TODO: Enable this code once we start modifying anything that is
-	//	 different between NTFS 1.2 and 3.x...
-	/*
-	 * If (still) a read-write mount, set the NT4 compatibility flag on
-	 * newer NTFS version volumes.
-	 */
-	if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) &&
-			ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
-		static const char *es1 = "Failed to set NT4 compatibility flag";
-		static const char *es2 = ".  Run chkdsk.";
-
-		/* Convert to a read-only mount. */
-		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-				ON_ERRORS_CONTINUE))) {
-			ntfs_error(sb, "%s and neither on_errors=continue nor "
-					"on_errors=remount-ro was specified%s",
-					es1, es2);
-			goto iput_root_err_out;
-		}
-		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= SB_RDONLY;
-		NVolSetErrors(vol);
-	}
-#endif
-	/* If (still) a read-write mount, empty the logfile. */
-	if (!sb_rdonly(sb) && !ntfs_empty_logfile(vol->logfile_ino)) {
-		static const char *es1 = "Failed to empty $LogFile";
-		static const char *es2 = ".  Mount in Windows.";
-
-		/* Convert to a read-only mount. */
-		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-				ON_ERRORS_CONTINUE))) {
-			ntfs_error(sb, "%s and neither on_errors=continue nor "
-					"on_errors=remount-ro was specified%s",
-					es1, es2);
-			goto iput_root_err_out;
-		}
-		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= SB_RDONLY;
-		NVolSetErrors(vol);
-	}
-#endif /* NTFS_RW */
-	/* If on NTFS versions before 3.0, we are done. */
-	if (unlikely(vol->major_ver < 3))
-		return true;
-	/* NTFS 3.0+ specific initialization. */
-	/* Get the security descriptors inode. */
-	vol->secure_ino = ntfs_iget(sb, FILE_Secure);
-	if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) {
-		if (!IS_ERR(vol->secure_ino))
-			iput(vol->secure_ino);
-		ntfs_error(sb, "Failed to load $Secure.");
-		goto iput_root_err_out;
-	}
-	// TODO: Initialize security.
-	/* Get the extended system files' directory inode. */
-	vol->extend_ino = ntfs_iget(sb, FILE_Extend);
-	if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) ||
-	    !S_ISDIR(vol->extend_ino->i_mode)) {
-		if (!IS_ERR(vol->extend_ino))
-			iput(vol->extend_ino);
-		ntfs_error(sb, "Failed to load $Extend.");
-		goto iput_sec_err_out;
-	}
-#ifdef NTFS_RW
-	/* Find the quota file, load it if present, and set it up. */
-	if (!load_and_init_quota(vol)) {
-		static const char *es1 = "Failed to load $Quota";
-		static const char *es2 = ".  Run chkdsk.";
-
-		/* If a read-write mount, convert it to a read-only mount. */
-		if (!sb_rdonly(sb)) {
-			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-					ON_ERRORS_CONTINUE))) {
-				ntfs_error(sb, "%s and neither on_errors="
-						"continue nor on_errors="
-						"remount-ro was specified%s",
-						es1, es2);
-				goto iput_quota_err_out;
-			}
-			sb->s_flags |= SB_RDONLY;
-			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		} else
-			ntfs_warning(sb, "%s.  Will not be able to remount "
-					"read-write%s", es1, es2);
-		/* This will prevent a read-write remount. */
-		NVolSetErrors(vol);
-	}
-	/* If (still) a read-write mount, mark the quotas out of date. */
-	if (!sb_rdonly(sb) && !ntfs_mark_quotas_out_of_date(vol)) {
-		static const char *es1 = "Failed to mark quotas out of date";
-		static const char *es2 = ".  Run chkdsk.";
-
-		/* Convert to a read-only mount. */
-		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-				ON_ERRORS_CONTINUE))) {
-			ntfs_error(sb, "%s and neither on_errors=continue nor "
-					"on_errors=remount-ro was specified%s",
-					es1, es2);
-			goto iput_quota_err_out;
-		}
-		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= SB_RDONLY;
-		NVolSetErrors(vol);
-	}
-	/*
-	 * Find the transaction log file ($UsnJrnl), load it if present, check
-	 * it, and set it up.
-	 */
-	if (!load_and_init_usnjrnl(vol)) {
-		static const char *es1 = "Failed to load $UsnJrnl";
-		static const char *es2 = ".  Run chkdsk.";
-
-		/* If a read-write mount, convert it to a read-only mount. */
-		if (!sb_rdonly(sb)) {
-			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-					ON_ERRORS_CONTINUE))) {
-				ntfs_error(sb, "%s and neither on_errors="
-						"continue nor on_errors="
-						"remount-ro was specified%s",
-						es1, es2);
-				goto iput_usnjrnl_err_out;
-			}
-			sb->s_flags |= SB_RDONLY;
-			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		} else
-			ntfs_warning(sb, "%s.  Will not be able to remount "
-					"read-write%s", es1, es2);
-		/* This will prevent a read-write remount. */
-		NVolSetErrors(vol);
-	}
-	/* If (still) a read-write mount, stamp the transaction log. */
-	if (!sb_rdonly(sb) && !ntfs_stamp_usnjrnl(vol)) {
-		static const char *es1 = "Failed to stamp transaction log "
-				"($UsnJrnl)";
-		static const char *es2 = ".  Run chkdsk.";
-
-		/* Convert to a read-only mount. */
-		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-				ON_ERRORS_CONTINUE))) {
-			ntfs_error(sb, "%s and neither on_errors=continue nor "
-					"on_errors=remount-ro was specified%s",
-					es1, es2);
-			goto iput_usnjrnl_err_out;
-		}
-		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= SB_RDONLY;
-		NVolSetErrors(vol);
-	}
-#endif /* NTFS_RW */
-	return true;
-#ifdef NTFS_RW
-iput_usnjrnl_err_out:
-	iput(vol->usnjrnl_j_ino);
-	iput(vol->usnjrnl_max_ino);
-	iput(vol->usnjrnl_ino);
-iput_quota_err_out:
-	iput(vol->quota_q_ino);
-	iput(vol->quota_ino);
-	iput(vol->extend_ino);
-#endif /* NTFS_RW */
-iput_sec_err_out:
-	iput(vol->secure_ino);
-iput_root_err_out:
-	iput(vol->root_ino);
-iput_logfile_err_out:
-#ifdef NTFS_RW
-	iput(vol->logfile_ino);
-iput_vol_err_out:
-#endif /* NTFS_RW */
-	iput(vol->vol_ino);
-iput_lcnbmp_err_out:
-	iput(vol->lcnbmp_ino);
-iput_attrdef_err_out:
-	vol->attrdef_size = 0;
-	if (vol->attrdef) {
-		ntfs_free(vol->attrdef);
-		vol->attrdef = NULL;
-	}
-#ifdef NTFS_RW
-iput_upcase_err_out:
-#endif /* NTFS_RW */
-	vol->upcase_len = 0;
-	mutex_lock(&ntfs_lock);
-	if (vol->upcase == default_upcase) {
-		ntfs_nr_upcase_users--;
-		vol->upcase = NULL;
-	}
-	mutex_unlock(&ntfs_lock);
-	if (vol->upcase) {
-		ntfs_free(vol->upcase);
-		vol->upcase = NULL;
-	}
-iput_mftbmp_err_out:
-	iput(vol->mftbmp_ino);
-iput_mirr_err_out:
-#ifdef NTFS_RW
-	iput(vol->mftmirr_ino);
-#endif /* NTFS_RW */
-	return false;
-}
-
-/**
- * ntfs_put_super - called by the vfs to unmount a volume
- * @sb:		vfs superblock of volume to unmount
- *
- * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when
- * the volume is being unmounted (umount system call has been invoked) and it
- * releases all inodes and memory belonging to the NTFS specific part of the
- * super block.
- */
-static void ntfs_put_super(struct super_block *sb)
-{
-	ntfs_volume *vol = NTFS_SB(sb);
-
-	ntfs_debug("Entering.");
-
-#ifdef NTFS_RW
-	/*
-	 * Commit all inodes while they are still open in case some of them
-	 * cause others to be dirtied.
-	 */
-	ntfs_commit_inode(vol->vol_ino);
-
-	/* NTFS 3.0+ specific. */
-	if (vol->major_ver >= 3) {
-		if (vol->usnjrnl_j_ino)
-			ntfs_commit_inode(vol->usnjrnl_j_ino);
-		if (vol->usnjrnl_max_ino)
-			ntfs_commit_inode(vol->usnjrnl_max_ino);
-		if (vol->usnjrnl_ino)
-			ntfs_commit_inode(vol->usnjrnl_ino);
-		if (vol->quota_q_ino)
-			ntfs_commit_inode(vol->quota_q_ino);
-		if (vol->quota_ino)
-			ntfs_commit_inode(vol->quota_ino);
-		if (vol->extend_ino)
-			ntfs_commit_inode(vol->extend_ino);
-		if (vol->secure_ino)
-			ntfs_commit_inode(vol->secure_ino);
-	}
-
-	ntfs_commit_inode(vol->root_ino);
-
-	down_write(&vol->lcnbmp_lock);
-	ntfs_commit_inode(vol->lcnbmp_ino);
-	up_write(&vol->lcnbmp_lock);
-
-	down_write(&vol->mftbmp_lock);
-	ntfs_commit_inode(vol->mftbmp_ino);
-	up_write(&vol->mftbmp_lock);
-
-	if (vol->logfile_ino)
-		ntfs_commit_inode(vol->logfile_ino);
-
-	if (vol->mftmirr_ino)
-		ntfs_commit_inode(vol->mftmirr_ino);
-	ntfs_commit_inode(vol->mft_ino);
-
-	/*
-	 * If a read-write mount and no volume errors have occurred, mark the
-	 * volume clean.  Also, re-commit all affected inodes.
-	 */
-	if (!sb_rdonly(sb)) {
-		if (!NVolErrors(vol)) {
-			if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
-				ntfs_warning(sb, "Failed to clear dirty bit "
-						"in volume information "
-						"flags.  Run chkdsk.");
-			ntfs_commit_inode(vol->vol_ino);
-			ntfs_commit_inode(vol->root_ino);
-			if (vol->mftmirr_ino)
-				ntfs_commit_inode(vol->mftmirr_ino);
-			ntfs_commit_inode(vol->mft_ino);
-		} else {
-			ntfs_warning(sb, "Volume has errors.  Leaving volume "
-					"marked dirty.  Run chkdsk.");
-		}
-	}
-#endif /* NTFS_RW */
-
-	iput(vol->vol_ino);
-	vol->vol_ino = NULL;
-
-	/* NTFS 3.0+ specific clean up. */
-	if (vol->major_ver >= 3) {
-#ifdef NTFS_RW
-		if (vol->usnjrnl_j_ino) {
-			iput(vol->usnjrnl_j_ino);
-			vol->usnjrnl_j_ino = NULL;
-		}
-		if (vol->usnjrnl_max_ino) {
-			iput(vol->usnjrnl_max_ino);
-			vol->usnjrnl_max_ino = NULL;
-		}
-		if (vol->usnjrnl_ino) {
-			iput(vol->usnjrnl_ino);
-			vol->usnjrnl_ino = NULL;
-		}
-		if (vol->quota_q_ino) {
-			iput(vol->quota_q_ino);
-			vol->quota_q_ino = NULL;
-		}
-		if (vol->quota_ino) {
-			iput(vol->quota_ino);
-			vol->quota_ino = NULL;
-		}
-#endif /* NTFS_RW */
-		if (vol->extend_ino) {
-			iput(vol->extend_ino);
-			vol->extend_ino = NULL;
-		}
-		if (vol->secure_ino) {
-			iput(vol->secure_ino);
-			vol->secure_ino = NULL;
-		}
-	}
-
-	iput(vol->root_ino);
-	vol->root_ino = NULL;
-
-	down_write(&vol->lcnbmp_lock);
-	iput(vol->lcnbmp_ino);
-	vol->lcnbmp_ino = NULL;
-	up_write(&vol->lcnbmp_lock);
-
-	down_write(&vol->mftbmp_lock);
-	iput(vol->mftbmp_ino);
-	vol->mftbmp_ino = NULL;
-	up_write(&vol->mftbmp_lock);
-
-#ifdef NTFS_RW
-	if (vol->logfile_ino) {
-		iput(vol->logfile_ino);
-		vol->logfile_ino = NULL;
-	}
-	if (vol->mftmirr_ino) {
-		/* Re-commit the mft mirror and mft just in case. */
-		ntfs_commit_inode(vol->mftmirr_ino);
-		ntfs_commit_inode(vol->mft_ino);
-		iput(vol->mftmirr_ino);
-		vol->mftmirr_ino = NULL;
-	}
-	/*
-	 * We should have no dirty inodes left, due to
-	 * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
-	 * the underlying mft records are written out and cleaned.
-	 */
-	ntfs_commit_inode(vol->mft_ino);
-	write_inode_now(vol->mft_ino, 1);
-#endif /* NTFS_RW */
-
-	iput(vol->mft_ino);
-	vol->mft_ino = NULL;
-
-	/* Throw away the table of attribute definitions. */
-	vol->attrdef_size = 0;
-	if (vol->attrdef) {
-		ntfs_free(vol->attrdef);
-		vol->attrdef = NULL;
-	}
-	vol->upcase_len = 0;
-	/*
-	 * Destroy the global default upcase table if necessary.  Also decrease
-	 * the number of upcase users if we are a user.
-	 */
-	mutex_lock(&ntfs_lock);
-	if (vol->upcase == default_upcase) {
-		ntfs_nr_upcase_users--;
-		vol->upcase = NULL;
-	}
-	if (!ntfs_nr_upcase_users && default_upcase) {
-		ntfs_free(default_upcase);
-		default_upcase = NULL;
-	}
-	if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
-		free_compression_buffers();
-	mutex_unlock(&ntfs_lock);
-	if (vol->upcase) {
-		ntfs_free(vol->upcase);
-		vol->upcase = NULL;
-	}
-
-	unload_nls(vol->nls_map);
-
-	sb->s_fs_info = NULL;
-	kfree(vol);
-}
-
-/**
- * get_nr_free_clusters - return the number of free clusters on a volume
- * @vol:	ntfs volume for which to obtain free cluster count
- *
- * Calculate the number of free clusters on the mounted NTFS volume @vol. We
- * actually calculate the number of clusters in use instead because this
- * allows us to not care about partial pages as these will be just zero filled
- * and hence not be counted as allocated clusters.
- *
- * The only particularity is that clusters beyond the end of the logical ntfs
- * volume will be marked as allocated to prevent errors which means we have to
- * discount those at the end. This is important as the cluster bitmap always
- * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside
- * the logical volume and marked in use when they are not as they do not exist.
- *
- * If any pages cannot be read we assume all clusters in the erroring pages are
- * in use. This means we return an underestimate on errors which is better than
- * an overestimate.
- */
-static s64 get_nr_free_clusters(ntfs_volume *vol)
-{
-	s64 nr_free = vol->nr_clusters;
-	struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
-	struct page *page;
-	pgoff_t index, max_index;
-
-	ntfs_debug("Entering.");
-	/* Serialize accesses to the cluster bitmap. */
-	down_read(&vol->lcnbmp_lock);
-	/*
-	 * Convert the number of bits into bytes rounded up, then convert into
-	 * multiples of PAGE_SIZE, rounding up so that if we have one
-	 * full and one partial page max_index = 2.
-	 */
-	max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >>
-			PAGE_SHIFT;
-	/* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
-	ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
-			max_index, PAGE_SIZE / 4);
-	for (index = 0; index < max_index; index++) {
-		unsigned long *kaddr;
-
-		/*
-		 * Read the page from page cache, getting it from backing store
-		 * if necessary, and increment the use count.
-		 */
-		page = read_mapping_page(mapping, index, NULL);
-		/* Ignore pages which errored synchronously. */
-		if (IS_ERR(page)) {
-			ntfs_debug("read_mapping_page() error. Skipping "
-					"page (index 0x%lx).", index);
-			nr_free -= PAGE_SIZE * 8;
-			continue;
-		}
-		kaddr = kmap_atomic(page);
-		/*
-		 * Subtract the number of set bits. If this
-		 * is the last page and it is partial we don't really care as
-		 * it just means we do a little extra work but it won't affect
-		 * the result as all out of range bytes are set to zero by
-		 * ntfs_readpage().
-		 */
-		nr_free -= bitmap_weight(kaddr,
-					PAGE_SIZE * BITS_PER_BYTE);
-		kunmap_atomic(kaddr);
-		put_page(page);
-	}
-	ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1);
-	/*
-	 * Fixup for eventual bits outside logical ntfs volume (see function
-	 * description above).
-	 */
-	if (vol->nr_clusters & 63)
-		nr_free += 64 - (vol->nr_clusters & 63);
-	up_read(&vol->lcnbmp_lock);
-	/* If errors occurred we may well have gone below zero, fix this. */
-	if (nr_free < 0)
-		nr_free = 0;
-	ntfs_debug("Exiting.");
-	return nr_free;
-}
-
-/**
- * __get_nr_free_mft_records - return the number of free inodes on a volume
- * @vol:	ntfs volume for which to obtain free inode count
- * @nr_free:	number of mft records in filesystem
- * @max_index:	maximum number of pages containing set bits
- *
- * Calculate the number of free mft records (inodes) on the mounted NTFS
- * volume @vol. We actually calculate the number of mft records in use instead
- * because this allows us to not care about partial pages as these will be just
- * zero filled and hence not be counted as allocated mft record.
- *
- * If any pages cannot be read we assume all mft records in the erroring pages
- * are in use. This means we return an underestimate on errors which is better
- * than an overestimate.
- *
- * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing.
- */
-static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
-		s64 nr_free, const pgoff_t max_index)
-{
-	struct address_space *mapping = vol->mftbmp_ino->i_mapping;
-	struct page *page;
-	pgoff_t index;
-
-	ntfs_debug("Entering.");
-	/* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
-	ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
-			"0x%lx.", max_index, PAGE_SIZE / 4);
-	for (index = 0; index < max_index; index++) {
-		unsigned long *kaddr;
-
-		/*
-		 * Read the page from page cache, getting it from backing store
-		 * if necessary, and increment the use count.
-		 */
-		page = read_mapping_page(mapping, index, NULL);
-		/* Ignore pages which errored synchronously. */
-		if (IS_ERR(page)) {
-			ntfs_debug("read_mapping_page() error. Skipping "
-					"page (index 0x%lx).", index);
-			nr_free -= PAGE_SIZE * 8;
-			continue;
-		}
-		kaddr = kmap_atomic(page);
-		/*
-		 * Subtract the number of set bits. If this
-		 * is the last page and it is partial we don't really care as
-		 * it just means we do a little extra work but it won't affect
-		 * the result as all out of range bytes are set to zero by
-		 * ntfs_readpage().
-		 */
-		nr_free -= bitmap_weight(kaddr,
-					PAGE_SIZE * BITS_PER_BYTE);
-		kunmap_atomic(kaddr);
-		put_page(page);
-	}
-	ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
-			index - 1);
-	/* If errors occurred we may well have gone below zero, fix this. */
-	if (nr_free < 0)
-		nr_free = 0;
-	ntfs_debug("Exiting.");
-	return nr_free;
-}
-
-/**
- * ntfs_statfs - return information about mounted NTFS volume
- * @dentry:	dentry from mounted volume
- * @sfs:	statfs structure in which to return the information
- *
- * Return information about the mounted NTFS volume @dentry in the statfs structure
- * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is
- * called). We interpret the values to be correct of the moment in time at
- * which we are called. Most values are variable otherwise and this isn't just
- * the free values but the totals as well. For example we can increase the
- * total number of file nodes if we run out and we can keep doing this until
- * there is no more space on the volume left at all.
- *
- * Called from vfs_statfs which is used to handle the statfs, fstatfs, and
- * ustat system calls.
- *
- * Return 0 on success or -errno on error.
- */
-static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
-{
-	struct super_block *sb = dentry->d_sb;
-	s64 size;
-	ntfs_volume *vol = NTFS_SB(sb);
-	ntfs_inode *mft_ni = NTFS_I(vol->mft_ino);
-	pgoff_t max_index;
-	unsigned long flags;
-
-	ntfs_debug("Entering.");
-	/* Type of filesystem. */
-	sfs->f_type   = NTFS_SB_MAGIC;
-	/* Optimal transfer block size. */
-	sfs->f_bsize  = PAGE_SIZE;
-	/*
-	 * Total data blocks in filesystem in units of f_bsize and since
-	 * inodes are also stored in data blocs ($MFT is a file) this is just
-	 * the total clusters.
-	 */
-	sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >>
-				PAGE_SHIFT;
-	/* Free data blocks in filesystem in units of f_bsize. */
-	size	      = get_nr_free_clusters(vol) << vol->cluster_size_bits >>
-				PAGE_SHIFT;
-	if (size < 0LL)
-		size = 0LL;
-	/* Free blocks avail to non-superuser, same as above on NTFS. */
-	sfs->f_bavail = sfs->f_bfree = size;
-	/* Serialize accesses to the inode bitmap. */
-	down_read(&vol->mftbmp_lock);
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits;
-	/*
-	 * Convert the maximum number of set bits into bytes rounded up, then
-	 * convert into multiples of PAGE_SIZE, rounding up so that if we
-	 * have one full and one partial page max_index = 2.
-	 */
-	max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits)
-			+ 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	/* Number of inodes in filesystem (at this point in time). */
-	sfs->f_files = size;
-	/* Free inodes in fs (based on current total count). */
-	sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index);
-	up_read(&vol->mftbmp_lock);
-	/*
-	 * File system id. This is extremely *nix flavour dependent and even
-	 * within Linux itself all fs do their own thing. I interpret this to
-	 * mean a unique id associated with the mounted fs and not the id
-	 * associated with the filesystem driver, the latter is already given
-	 * by the filesystem type in sfs->f_type. Thus we use the 64-bit
-	 * volume serial number splitting it into two 32-bit parts. We enter
-	 * the least significant 32-bits in f_fsid[0] and the most significant
-	 * 32-bits in f_fsid[1].
-	 */
-	sfs->f_fsid = u64_to_fsid(vol->serial_no);
-	/* Maximum length of filenames. */
-	sfs->f_namelen	   = NTFS_MAX_NAME_LEN;
-	return 0;
-}
-
-#ifdef NTFS_RW
-static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
-{
-	return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
-}
-#endif
-
-/*
- * The complete super operations.
- */
-static const struct super_operations ntfs_sops = {
-	.alloc_inode	= ntfs_alloc_big_inode,	  /* VFS: Allocate new inode. */
-	.free_inode	= ntfs_free_big_inode, /* VFS: Deallocate inode. */
-#ifdef NTFS_RW
-	.write_inode	= ntfs_write_inode,	/* VFS: Write dirty inode to
-						   disk. */
-#endif /* NTFS_RW */
-	.put_super	= ntfs_put_super,	/* Syscall: umount. */
-	.statfs		= ntfs_statfs,		/* Syscall: statfs */
-	.remount_fs	= ntfs_remount,		/* Syscall: mount -o remount. */
-	.evict_inode	= ntfs_evict_big_inode,	/* VFS: Called when an inode is
-						   removed from memory. */
-	.show_options	= ntfs_show_options,	/* Show mount options in
-						   proc. */
-};
-
-/**
- * ntfs_fill_super - mount an ntfs filesystem
- * @sb:		super block of ntfs filesystem to mount
- * @opt:	string containing the mount options
- * @silent:	silence error output
- *
- * ntfs_fill_super() is called by the VFS to mount the device described by @sb
- * with the mount otions in @data with the NTFS filesystem.
- *
- * If @silent is true, remain silent even if errors are detected. This is used
- * during bootup, when the kernel tries to mount the root filesystem with all
- * registered filesystems one after the other until one succeeds. This implies
- * that all filesystems except the correct one will quite correctly and
- * expectedly return an error, but nobody wants to see error messages when in
- * fact this is what is supposed to happen.
- *
- * NOTE: @sb->s_flags contains the mount options flags.
- */
-static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
-{
-	ntfs_volume *vol;
-	struct buffer_head *bh;
-	struct inode *tmp_ino;
-	int blocksize, result;
-
-	/*
-	 * We do a pretty difficult piece of bootstrap by reading the
-	 * MFT (and other metadata) from disk into memory. We'll only
-	 * release this metadata during umount, so the locking patterns
-	 * observed during bootstrap do not count. So turn off the
-	 * observation of locking patterns (strictly for this context
-	 * only) while mounting NTFS. [The validator is still active
-	 * otherwise, even for this context: it will for example record
-	 * lock class registrations.]
-	 */
-	lockdep_off();
-	ntfs_debug("Entering.");
-#ifndef NTFS_RW
-	sb->s_flags |= SB_RDONLY;
-#endif /* ! NTFS_RW */
-	/* Allocate a new ntfs_volume and place it in sb->s_fs_info. */
-	sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS);
-	vol = NTFS_SB(sb);
-	if (!vol) {
-		if (!silent)
-			ntfs_error(sb, "Allocation of NTFS volume structure "
-					"failed. Aborting mount...");
-		lockdep_on();
-		return -ENOMEM;
-	}
-	/* Initialize ntfs_volume structure. */
-	*vol = (ntfs_volume) {
-		.sb = sb,
-		/*
-		 * Default is group and other don't have any access to files or
-		 * directories while owner has full access. Further, files by
-		 * default are not executable but directories are of course
-		 * browseable.
-		 */
-		.fmask = 0177,
-		.dmask = 0077,
-	};
-	init_rwsem(&vol->mftbmp_lock);
-	init_rwsem(&vol->lcnbmp_lock);
-
-	/* By default, enable sparse support. */
-	NVolSetSparseEnabled(vol);
-
-	/* Important to get the mount options dealt with now. */
-	if (!parse_options(vol, (char*)opt))
-		goto err_out_now;
-
-	/* We support sector sizes up to the PAGE_SIZE. */
-	if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) {
-		if (!silent)
-			ntfs_error(sb, "Device has unsupported sector size "
-					"(%i).  The maximum supported sector "
-					"size on this architecture is %lu "
-					"bytes.",
-					bdev_logical_block_size(sb->s_bdev),
-					PAGE_SIZE);
-		goto err_out_now;
-	}
-	/*
-	 * Setup the device access block size to NTFS_BLOCK_SIZE or the hard
-	 * sector size, whichever is bigger.
-	 */
-	blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE);
-	if (blocksize < NTFS_BLOCK_SIZE) {
-		if (!silent)
-			ntfs_error(sb, "Unable to set device block size.");
-		goto err_out_now;
-	}
-	BUG_ON(blocksize != sb->s_blocksize);
-	ntfs_debug("Set device block size to %i bytes (block size bits %i).",
-			blocksize, sb->s_blocksize_bits);
-	/* Determine the size of the device in units of block_size bytes. */
-	vol->nr_blocks = sb_bdev_nr_blocks(sb);
-	if (!vol->nr_blocks) {
-		if (!silent)
-			ntfs_error(sb, "Unable to determine device size.");
-		goto err_out_now;
-	}
-	/* Read the boot sector and return unlocked buffer head to it. */
-	if (!(bh = read_ntfs_boot_sector(sb, silent))) {
-		if (!silent)
-			ntfs_error(sb, "Not an NTFS volume.");
-		goto err_out_now;
-	}
-	/*
-	 * Extract the data from the boot sector and setup the ntfs volume
-	 * using it.
-	 */
-	result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data);
-	brelse(bh);
-	if (!result) {
-		if (!silent)
-			ntfs_error(sb, "Unsupported NTFS filesystem.");
-		goto err_out_now;
-	}
-	/*
-	 * If the boot sector indicates a sector size bigger than the current
-	 * device block size, switch the device block size to the sector size.
-	 * TODO: It may be possible to support this case even when the set
-	 * below fails, we would just be breaking up the i/o for each sector
-	 * into multiple blocks for i/o purposes but otherwise it should just
-	 * work.  However it is safer to leave disabled until someone hits this
-	 * error message and then we can get them to try it without the setting
-	 * so we know for sure that it works.
-	 */
-	if (vol->sector_size > blocksize) {
-		blocksize = sb_set_blocksize(sb, vol->sector_size);
-		if (blocksize != vol->sector_size) {
-			if (!silent)
-				ntfs_error(sb, "Unable to set device block "
-						"size to sector size (%i).",
-						vol->sector_size);
-			goto err_out_now;
-		}
-		BUG_ON(blocksize != sb->s_blocksize);
-		vol->nr_blocks = sb_bdev_nr_blocks(sb);
-		ntfs_debug("Changed device block size to %i bytes (block size "
-				"bits %i) to match volume sector size.",
-				blocksize, sb->s_blocksize_bits);
-	}
-	/* Initialize the cluster and mft allocators. */
-	ntfs_setup_allocators(vol);
-	/* Setup remaining fields in the super block. */
-	sb->s_magic = NTFS_SB_MAGIC;
-	/*
-	 * Ntfs allows 63 bits for the file size, i.e. correct would be:
-	 *	sb->s_maxbytes = ~0ULL >> 1;
-	 * But the kernel uses a long as the page cache page index which on
-	 * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel
-	 * defined to the maximum the page cache page index can cope with
-	 * without overflowing the index or to 2^63 - 1, whichever is smaller.
-	 */
-	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	/* Ntfs measures time in 100ns intervals. */
-	sb->s_time_gran = 100;
-	/*
-	 * Now load the metadata required for the page cache and our address
-	 * space operations to function. We do this by setting up a specialised
-	 * read_inode method and then just calling the normal iget() to obtain
-	 * the inode for $MFT which is sufficient to allow our normal inode
-	 * operations and associated address space operations to function.
-	 */
-	sb->s_op = &ntfs_sops;
-	tmp_ino = new_inode(sb);
-	if (!tmp_ino) {
-		if (!silent)
-			ntfs_error(sb, "Failed to load essential metadata.");
-		goto err_out_now;
-	}
-	tmp_ino->i_ino = FILE_MFT;
-	insert_inode_hash(tmp_ino);
-	if (ntfs_read_inode_mount(tmp_ino) < 0) {
-		if (!silent)
-			ntfs_error(sb, "Failed to load essential metadata.");
-		goto iput_tmp_ino_err_out_now;
-	}
-	mutex_lock(&ntfs_lock);
-	/*
-	 * The current mount is a compression user if the cluster size is
-	 * less than or equal 4kiB.
-	 */
-	if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) {
-		result = allocate_compression_buffers();
-		if (result) {
-			ntfs_error(NULL, "Failed to allocate buffers "
-					"for compression engine.");
-			ntfs_nr_compression_users--;
-			mutex_unlock(&ntfs_lock);
-			goto iput_tmp_ino_err_out_now;
-		}
-	}
-	/*
-	 * Generate the global default upcase table if necessary.  Also
-	 * temporarily increment the number of upcase users to avoid race
-	 * conditions with concurrent (u)mounts.
-	 */
-	if (!default_upcase)
-		default_upcase = generate_default_upcase();
-	ntfs_nr_upcase_users++;
-	mutex_unlock(&ntfs_lock);
-	/*
-	 * From now on, ignore @silent parameter. If we fail below this line,
-	 * it will be due to a corrupt fs or a system error, so we report it.
-	 */
-	/*
-	 * Open the system files with normal access functions and complete
-	 * setting up the ntfs super block.
-	 */
-	if (!load_system_files(vol)) {
-		ntfs_error(sb, "Failed to load system files.");
-		goto unl_upcase_iput_tmp_ino_err_out_now;
-	}
-
-	/* We grab a reference, simulating an ntfs_iget(). */
-	ihold(vol->root_ino);
-	if ((sb->s_root = d_make_root(vol->root_ino))) {
-		ntfs_debug("Exiting, status successful.");
-		/* Release the default upcase if it has no users. */
-		mutex_lock(&ntfs_lock);
-		if (!--ntfs_nr_upcase_users && default_upcase) {
-			ntfs_free(default_upcase);
-			default_upcase = NULL;
-		}
-		mutex_unlock(&ntfs_lock);
-		sb->s_export_op = &ntfs_export_ops;
-		lockdep_on();
-		return 0;
-	}
-	ntfs_error(sb, "Failed to allocate root directory.");
-	/* Clean up after the successful load_system_files() call from above. */
-	// TODO: Use ntfs_put_super() instead of repeating all this code...
-	// FIXME: Should mark the volume clean as the error is most likely
-	// 	  -ENOMEM.
-	iput(vol->vol_ino);
-	vol->vol_ino = NULL;
-	/* NTFS 3.0+ specific clean up. */
-	if (vol->major_ver >= 3) {
-#ifdef NTFS_RW
-		if (vol->usnjrnl_j_ino) {
-			iput(vol->usnjrnl_j_ino);
-			vol->usnjrnl_j_ino = NULL;
-		}
-		if (vol->usnjrnl_max_ino) {
-			iput(vol->usnjrnl_max_ino);
-			vol->usnjrnl_max_ino = NULL;
-		}
-		if (vol->usnjrnl_ino) {
-			iput(vol->usnjrnl_ino);
-			vol->usnjrnl_ino = NULL;
-		}
-		if (vol->quota_q_ino) {
-			iput(vol->quota_q_ino);
-			vol->quota_q_ino = NULL;
-		}
-		if (vol->quota_ino) {
-			iput(vol->quota_ino);
-			vol->quota_ino = NULL;
-		}
-#endif /* NTFS_RW */
-		if (vol->extend_ino) {
-			iput(vol->extend_ino);
-			vol->extend_ino = NULL;
-		}
-		if (vol->secure_ino) {
-			iput(vol->secure_ino);
-			vol->secure_ino = NULL;
-		}
-	}
-	iput(vol->root_ino);
-	vol->root_ino = NULL;
-	iput(vol->lcnbmp_ino);
-	vol->lcnbmp_ino = NULL;
-	iput(vol->mftbmp_ino);
-	vol->mftbmp_ino = NULL;
-#ifdef NTFS_RW
-	if (vol->logfile_ino) {
-		iput(vol->logfile_ino);
-		vol->logfile_ino = NULL;
-	}
-	if (vol->mftmirr_ino) {
-		iput(vol->mftmirr_ino);
-		vol->mftmirr_ino = NULL;
-	}
-#endif /* NTFS_RW */
-	/* Throw away the table of attribute definitions. */
-	vol->attrdef_size = 0;
-	if (vol->attrdef) {
-		ntfs_free(vol->attrdef);
-		vol->attrdef = NULL;
-	}
-	vol->upcase_len = 0;
-	mutex_lock(&ntfs_lock);
-	if (vol->upcase == default_upcase) {
-		ntfs_nr_upcase_users--;
-		vol->upcase = NULL;
-	}
-	mutex_unlock(&ntfs_lock);
-	if (vol->upcase) {
-		ntfs_free(vol->upcase);
-		vol->upcase = NULL;
-	}
-	if (vol->nls_map) {
-		unload_nls(vol->nls_map);
-		vol->nls_map = NULL;
-	}
-	/* Error exit code path. */
-unl_upcase_iput_tmp_ino_err_out_now:
-	/*
-	 * Decrease the number of upcase users and destroy the global default
-	 * upcase table if necessary.
-	 */
-	mutex_lock(&ntfs_lock);
-	if (!--ntfs_nr_upcase_users && default_upcase) {
-		ntfs_free(default_upcase);
-		default_upcase = NULL;
-	}
-	if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
-		free_compression_buffers();
-	mutex_unlock(&ntfs_lock);
-iput_tmp_ino_err_out_now:
-	iput(tmp_ino);
-	if (vol->mft_ino && vol->mft_ino != tmp_ino)
-		iput(vol->mft_ino);
-	vol->mft_ino = NULL;
-	/* Errors at this stage are irrelevant. */
-err_out_now:
-	sb->s_fs_info = NULL;
-	kfree(vol);
-	ntfs_debug("Failed, returning -EINVAL.");
-	lockdep_on();
-	return -EINVAL;
-}
-
-/*
- * This is a slab cache to optimize allocations and deallocations of Unicode
- * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN
- * (255) Unicode characters + a terminating NULL Unicode character.
- */
-struct kmem_cache *ntfs_name_cache;
-
-/* Slab caches for efficient allocation/deallocation of inodes. */
-struct kmem_cache *ntfs_inode_cache;
-struct kmem_cache *ntfs_big_inode_cache;
-
-/* Init once constructor for the inode slab cache. */
-static void ntfs_big_inode_init_once(void *foo)
-{
-	ntfs_inode *ni = (ntfs_inode *)foo;
-
-	inode_init_once(VFS_I(ni));
-}
-
-/*
- * Slab caches to optimize allocations and deallocations of attribute search
- * contexts and index contexts, respectively.
- */
-struct kmem_cache *ntfs_attr_ctx_cache;
-struct kmem_cache *ntfs_index_ctx_cache;
-
-/* Driver wide mutex. */
-DEFINE_MUTEX(ntfs_lock);
-
-static struct dentry *ntfs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
-{
-	return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
-}
-
-static struct file_system_type ntfs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "ntfs",
-	.mount		= ntfs_mount,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("ntfs");
-
-/* Stable names for the slab caches. */
-static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache";
-static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache";
-static const char ntfs_name_cache_name[] = "ntfs_name_cache";
-static const char ntfs_inode_cache_name[] = "ntfs_inode_cache";
-static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache";
-
-static int __init init_ntfs_fs(void)
-{
-	int err = 0;
-
-	/* This may be ugly but it results in pretty output so who cares. (-8 */
-	pr_info("driver " NTFS_VERSION " [Flags: R/"
-#ifdef NTFS_RW
-			"W"
-#else
-			"O"
-#endif
-#ifdef DEBUG
-			" DEBUG"
-#endif
-#ifdef MODULE
-			" MODULE"
-#endif
-			"].\n");
-
-	ntfs_debug("Debug messages are enabled.");
-
-	ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name,
-			sizeof(ntfs_index_context), 0 /* offset */,
-			SLAB_HWCACHE_ALIGN, NULL /* ctor */);
-	if (!ntfs_index_ctx_cache) {
-		pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name);
-		goto ictx_err_out;
-	}
-	ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
-			sizeof(ntfs_attr_search_ctx), 0 /* offset */,
-			SLAB_HWCACHE_ALIGN, NULL /* ctor */);
-	if (!ntfs_attr_ctx_cache) {
-		pr_crit("NTFS: Failed to create %s!\n",
-			ntfs_attr_ctx_cache_name);
-		goto actx_err_out;
-	}
-
-	ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name,
-			(NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
-			SLAB_HWCACHE_ALIGN, NULL);
-	if (!ntfs_name_cache) {
-		pr_crit("Failed to create %s!\n", ntfs_name_cache_name);
-		goto name_err_out;
-	}
-
-	ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name,
-			sizeof(ntfs_inode), 0,
-			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
-	if (!ntfs_inode_cache) {
-		pr_crit("Failed to create %s!\n", ntfs_inode_cache_name);
-		goto inode_err_out;
-	}
-
-	ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
-			sizeof(big_ntfs_inode), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
-			SLAB_ACCOUNT, ntfs_big_inode_init_once);
-	if (!ntfs_big_inode_cache) {
-		pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
-		goto big_inode_err_out;
-	}
-
-	/* Register the ntfs sysctls. */
-	err = ntfs_sysctl(1);
-	if (err) {
-		pr_crit("Failed to register NTFS sysctls!\n");
-		goto sysctl_err_out;
-	}
-
-	err = register_filesystem(&ntfs_fs_type);
-	if (!err) {
-		ntfs_debug("NTFS driver registered successfully.");
-		return 0; /* Success! */
-	}
-	pr_crit("Failed to register NTFS filesystem driver!\n");
-
-	/* Unregister the ntfs sysctls. */
-	ntfs_sysctl(0);
-sysctl_err_out:
-	kmem_cache_destroy(ntfs_big_inode_cache);
-big_inode_err_out:
-	kmem_cache_destroy(ntfs_inode_cache);
-inode_err_out:
-	kmem_cache_destroy(ntfs_name_cache);
-name_err_out:
-	kmem_cache_destroy(ntfs_attr_ctx_cache);
-actx_err_out:
-	kmem_cache_destroy(ntfs_index_ctx_cache);
-ictx_err_out:
-	if (!err) {
-		pr_crit("Aborting NTFS filesystem driver registration...\n");
-		err = -ENOMEM;
-	}
-	return err;
-}
-
-static void __exit exit_ntfs_fs(void)
-{
-	ntfs_debug("Unregistering NTFS driver.");
-
-	unregister_filesystem(&ntfs_fs_type);
-
-	/*
-	 * Make sure all delayed rcu free inodes are flushed before we
-	 * destroy cache.
-	 */
-	rcu_barrier();
-	kmem_cache_destroy(ntfs_big_inode_cache);
-	kmem_cache_destroy(ntfs_inode_cache);
-	kmem_cache_destroy(ntfs_name_cache);
-	kmem_cache_destroy(ntfs_attr_ctx_cache);
-	kmem_cache_destroy(ntfs_index_ctx_cache);
-	/* Unregister the ntfs sysctls. */
-	ntfs_sysctl(0);
-}
-
-MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
-MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.");
-MODULE_VERSION(NTFS_VERSION);
-MODULE_LICENSE("GPL");
-#ifdef DEBUG
-module_param(debug_msgs, bint, 0);
-MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
-#endif
-
-module_init(init_ntfs_fs)
-module_exit(exit_ntfs_fs)
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
deleted file mode 100644
index 4e980170d86a..000000000000
--- a/fs/ntfs/sysctl.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of
- *	      the Linux-NTFS project. Adapted from the old NTFS driver,
- *	      Copyright (C) 1997 Martin von Löwis, Régis Duchesne
- *
- * Copyright (c) 2002-2005 Anton Altaparmakov
- */
-
-#ifdef DEBUG
-
-#include <linux/module.h>
-
-#ifdef CONFIG_SYSCTL
-
-#include <linux/proc_fs.h>
-#include <linux/sysctl.h>
-
-#include "sysctl.h"
-#include "debug.h"
-
-/* Definition of the ntfs sysctl. */
-static struct ctl_table ntfs_sysctls[] = {
-	{
-		.procname	= "ntfs-debug",
-		.data		= &debug_msgs,		/* Data pointer and size. */
-		.maxlen		= sizeof(debug_msgs),
-		.mode		= 0644,			/* Mode, proc handler. */
-		.proc_handler	= proc_dointvec
-	},
-};
-
-/* Storage for the sysctls header. */
-static struct ctl_table_header *sysctls_root_table;
-
-/**
- * ntfs_sysctl - add or remove the debug sysctl
- * @add:	add (1) or remove (0) the sysctl
- *
- * Add or remove the debug sysctl. Return 0 on success or -errno on error.
- */
-int ntfs_sysctl(int add)
-{
-	if (add) {
-		BUG_ON(sysctls_root_table);
-		sysctls_root_table = register_sysctl("fs", ntfs_sysctls);
-		if (!sysctls_root_table)
-			return -ENOMEM;
-	} else {
-		BUG_ON(!sysctls_root_table);
-		unregister_sysctl_table(sysctls_root_table);
-		sysctls_root_table = NULL;
-	}
-	return 0;
-}
-
-#endif /* CONFIG_SYSCTL */
-#endif /* DEBUG */
diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h
deleted file mode 100644
index 96bb2299d2d5..000000000000
--- a/fs/ntfs/sysctl.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of
- *	      the Linux-NTFS project. Adapted from the old NTFS driver,
- *	      Copyright (C) 1997 Martin von Löwis, Régis Duchesne
- *
- * Copyright (c) 2002-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_SYSCTL_H
-#define _LINUX_NTFS_SYSCTL_H
-
-
-#if defined(DEBUG) && defined(CONFIG_SYSCTL)
-
-extern int ntfs_sysctl(int add);
-
-#else
-
-/* Just return success. */
-static inline int ntfs_sysctl(int add)
-{
-	return 0;
-}
-
-#endif /* DEBUG && CONFIG_SYSCTL */
-#endif /* _LINUX_NTFS_SYSCTL_H */
diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h
deleted file mode 100644
index 6b63261300cc..000000000000
--- a/fs/ntfs/time.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * time.h - NTFS time conversion functions.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_TIME_H
-#define _LINUX_NTFS_TIME_H
-
-#include <linux/time.h>		/* For current_kernel_time(). */
-#include <asm/div64.h>		/* For do_div(). */
-
-#include "endian.h"
-
-#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000)
-
-/**
- * utc2ntfs - convert Linux UTC time to NTFS time
- * @ts:		Linux UTC time to convert to NTFS time
- *
- * Convert the Linux UTC time @ts to its corresponding NTFS time and return
- * that in little endian format.
- *
- * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
- * and a long tv_nsec where tv_sec is the number of 1-second intervals since
- * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
- * intervals since the value of tv_sec.
- *
- * NTFS uses Microsoft's standard time format which is stored in a s64 and is
- * measured as the number of 100-nano-second intervals since 1st January 1601,
- * 00:00:00 UTC.
- */
-static inline sle64 utc2ntfs(const struct timespec64 ts)
-{
-	/*
-	 * Convert the seconds to 100ns intervals, add the nano-seconds
-	 * converted to 100ns intervals, and then add the NTFS time offset.
-	 */
-	return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 +
-			NTFS_TIME_OFFSET);
-}
-
-/**
- * get_current_ntfs_time - get the current time in little endian NTFS format
- *
- * Get the current time from the Linux kernel, convert it to its corresponding
- * NTFS time and return that in little endian format.
- */
-static inline sle64 get_current_ntfs_time(void)
-{
-	struct timespec64 ts;
-
-	ktime_get_coarse_real_ts64(&ts);
-	return utc2ntfs(ts);
-}
-
-/**
- * ntfs2utc - convert NTFS time to Linux time
- * @time:	NTFS time (little endian) to convert to Linux UTC
- *
- * Convert the little endian NTFS time @time to its corresponding Linux UTC
- * time and return that in cpu format.
- *
- * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
- * and a long tv_nsec where tv_sec is the number of 1-second intervals since
- * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
- * intervals since the value of tv_sec.
- *
- * NTFS uses Microsoft's standard time format which is stored in a s64 and is
- * measured as the number of 100 nano-second intervals since 1st January 1601,
- * 00:00:00 UTC.
- */
-static inline struct timespec64 ntfs2utc(const sle64 time)
-{
-	struct timespec64 ts;
-
-	/* Subtract the NTFS time offset. */
-	u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);
-	/*
-	 * Convert the time to 1-second intervals and the remainder to
-	 * 1-nano-second intervals.
-	 */
-	ts.tv_nsec = do_div(t, 10000000) * 100;
-	ts.tv_sec = t;
-	return ts;
-}
-
-#endif /* _LINUX_NTFS_TIME_H */
diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h
deleted file mode 100644
index 9a47859e7a06..000000000000
--- a/fs/ntfs/types.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * types.h - Defines for NTFS Linux kernel driver specific types.
- *	     Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_TYPES_H
-#define _LINUX_NTFS_TYPES_H
-
-#include <linux/types.h>
-
-typedef __le16 le16;
-typedef __le32 le32;
-typedef __le64 le64;
-typedef __u16 __bitwise sle16;
-typedef __u32 __bitwise sle32;
-typedef __u64 __bitwise sle64;
-
-/* 2-byte Unicode character type. */
-typedef le16 ntfschar;
-#define UCHAR_T_SIZE_BITS 1
-
-/*
- * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN
- * and VCN, to allow for type checking and better code readability.
- */
-typedef s64 VCN;
-typedef sle64 leVCN;
-typedef s64 LCN;
-typedef sle64 leLCN;
-
-/*
- * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit
- * values.  We define our own type LSN, to allow for type checking and better
- * code readability.
- */
-typedef s64 LSN;
-typedef sle64 leLSN;
-
-/*
- * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values.
- * We define our own type USN, to allow for type checking and better code
- * readability.
- */
-typedef s64 USN;
-typedef sle64 leUSN;
-
-typedef enum {
-	CASE_SENSITIVE = 0,
-	IGNORE_CASE = 1,
-} IGNORE_CASE_BOOL;
-
-#endif /* _LINUX_NTFS_TYPES_H */
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
deleted file mode 100644
index a6b6c64f14a9..000000000000
--- a/fs/ntfs/unistr.c
+++ /dev/null
@@ -1,384 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2006 Anton Altaparmakov
- */
-
-#include <linux/slab.h>
-
-#include "types.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/*
- * IMPORTANT
- * =========
- *
- * All these routines assume that the Unicode characters are in little endian
- * encoding inside the strings!!!
- */
-
-/*
- * This is used by the name collation functions to quickly determine what
- * characters are (in)valid.
- */
-static const u8 legal_ansi_char_array[0x40] = {
-	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
-	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
-
-	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
-	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
-
-	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
-	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
-
-	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
-	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
-};
-
-/**
- * ntfs_are_names_equal - compare two Unicode names for equality
- * @s1:			name to compare to @s2
- * @s1_len:		length in Unicode characters of @s1
- * @s2:			name to compare to @s1
- * @s2_len:		length in Unicode characters of @s2
- * @ic:			ignore case bool
- * @upcase:		upcase table (only if @ic == IGNORE_CASE)
- * @upcase_size:	length in Unicode characters of @upcase (if present)
- *
- * Compare the names @s1 and @s2 and return 'true' (1) if the names are
- * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE,
- * the @upcase table is used to performa a case insensitive comparison.
- */
-bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
-		const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
-		const ntfschar *upcase, const u32 upcase_size)
-{
-	if (s1_len != s2_len)
-		return false;
-	if (ic == CASE_SENSITIVE)
-		return !ntfs_ucsncmp(s1, s2, s1_len);
-	return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size);
-}
-
-/**
- * ntfs_collate_names - collate two Unicode names
- * @name1:	first Unicode name to compare
- * @name2:	second Unicode name to compare
- * @err_val:	if @name1 contains an invalid character return this value
- * @ic:		either CASE_SENSITIVE or IGNORE_CASE
- * @upcase:	upcase table (ignored if @ic is CASE_SENSITIVE)
- * @upcase_len:	upcase table size (ignored if @ic is CASE_SENSITIVE)
- *
- * ntfs_collate_names collates two Unicode names and returns:
- *
- *  -1 if the first name collates before the second one,
- *   0 if the names match,
- *   1 if the second name collates before the first one, or
- * @err_val if an invalid character is found in @name1 during the comparison.
- *
- * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
- */
-int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
-		const ntfschar *name2, const u32 name2_len,
-		const int err_val, const IGNORE_CASE_BOOL ic,
-		const ntfschar *upcase, const u32 upcase_len)
-{
-	u32 cnt, min_len;
-	u16 c1, c2;
-
-	min_len = name1_len;
-	if (name1_len > name2_len)
-		min_len = name2_len;
-	for (cnt = 0; cnt < min_len; ++cnt) {
-		c1 = le16_to_cpu(*name1++);
-		c2 = le16_to_cpu(*name2++);
-		if (ic) {
-			if (c1 < upcase_len)
-				c1 = le16_to_cpu(upcase[c1]);
-			if (c2 < upcase_len)
-				c2 = le16_to_cpu(upcase[c2]);
-		}
-		if (c1 < 64 && legal_ansi_char_array[c1] & 8)
-			return err_val;
-		if (c1 < c2)
-			return -1;
-		if (c1 > c2)
-			return 1;
-	}
-	if (name1_len < name2_len)
-		return -1;
-	if (name1_len == name2_len)
-		return 0;
-	/* name1_len > name2_len */
-	c1 = le16_to_cpu(*name1);
-	if (c1 < 64 && legal_ansi_char_array[c1] & 8)
-		return err_val;
-	return 1;
-}
-
-/**
- * ntfs_ucsncmp - compare two little endian Unicode strings
- * @s1:		first string
- * @s2:		second string
- * @n:		maximum unicode characters to compare
- *
- * Compare the first @n characters of the Unicode strings @s1 and @s2,
- * The strings in little endian format and appropriate le16_to_cpu()
- * conversion is performed on non-little endian machines.
- *
- * The function returns an integer less than, equal to, or greater than zero
- * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
- * to be less than, to match, or be greater than @s2.
- */
-int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
-{
-	u16 c1, c2;
-	size_t i;
-
-	for (i = 0; i < n; ++i) {
-		c1 = le16_to_cpu(s1[i]);
-		c2 = le16_to_cpu(s2[i]);
-		if (c1 < c2)
-			return -1;
-		if (c1 > c2)
-			return 1;
-		if (!c1)
-			break;
-	}
-	return 0;
-}
-
-/**
- * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
- * @s1:			first string
- * @s2:			second string
- * @n:			maximum unicode characters to compare
- * @upcase:		upcase table
- * @upcase_size:	upcase table size in Unicode characters
- *
- * Compare the first @n characters of the Unicode strings @s1 and @s2,
- * ignoring case. The strings in little endian format and appropriate
- * le16_to_cpu() conversion is performed on non-little endian machines.
- *
- * Each character is uppercased using the @upcase table before the comparison.
- *
- * The function returns an integer less than, equal to, or greater than zero
- * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
- * to be less than, to match, or be greater than @s2.
- */
-int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
-		const ntfschar *upcase, const u32 upcase_size)
-{
-	size_t i;
-	u16 c1, c2;
-
-	for (i = 0; i < n; ++i) {
-		if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
-			c1 = le16_to_cpu(upcase[c1]);
-		if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
-			c2 = le16_to_cpu(upcase[c2]);
-		if (c1 < c2)
-			return -1;
-		if (c1 > c2)
-			return 1;
-		if (!c1)
-			break;
-	}
-	return 0;
-}
-
-void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase,
-		const u32 upcase_len)
-{
-	u32 i;
-	u16 u;
-
-	for (i = 0; i < name_len; i++)
-		if ((u = le16_to_cpu(name[i])) < upcase_len)
-			name[i] = upcase[u];
-}
-
-void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
-		const ntfschar *upcase, const u32 upcase_len)
-{
-	ntfs_upcase_name((ntfschar*)&file_name_attr->file_name,
-			file_name_attr->file_name_length, upcase, upcase_len);
-}
-
-int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
-		FILE_NAME_ATTR *file_name_attr2,
-		const int err_val, const IGNORE_CASE_BOOL ic,
-		const ntfschar *upcase, const u32 upcase_len)
-{
-	return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name,
-			file_name_attr1->file_name_length,
-			(ntfschar*)&file_name_attr2->file_name,
-			file_name_attr2->file_name_length,
-			err_val, ic, upcase, upcase_len);
-}
-
-/**
- * ntfs_nlstoucs - convert NLS string to little endian Unicode string
- * @vol:	ntfs volume which we are working with
- * @ins:	input NLS string buffer
- * @ins_len:	length of input string in bytes
- * @outs:	on return contains the allocated output Unicode string buffer
- *
- * Convert the input string @ins, which is in whatever format the loaded NLS
- * map dictates, into a little endian, 2-byte Unicode string.
- *
- * This function allocates the string and the caller is responsible for
- * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it.
- *
- * On success the function returns the number of Unicode characters written to
- * the output string *@outs (>= 0), not counting the terminating Unicode NULL
- * character. *@outs is set to the allocated output string buffer.
- *
- * On error, a negative number corresponding to the error code is returned. In
- * that case the output string is not allocated. Both *@outs and *@outs_len
- * are then undefined.
- *
- * This might look a bit odd due to fast path optimization...
- */
-int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
-		const int ins_len, ntfschar **outs)
-{
-	struct nls_table *nls = vol->nls_map;
-	ntfschar *ucs;
-	wchar_t wc;
-	int i, o, wc_len;
-
-	/* We do not trust outside sources. */
-	if (likely(ins)) {
-		ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS);
-		if (likely(ucs)) {
-			for (i = o = 0; i < ins_len; i += wc_len) {
-				wc_len = nls->char2uni(ins + i, ins_len - i,
-						&wc);
-				if (likely(wc_len >= 0 &&
-						o < NTFS_MAX_NAME_LEN)) {
-					if (likely(wc)) {
-						ucs[o++] = cpu_to_le16(wc);
-						continue;
-					} /* else if (!wc) */
-					break;
-				} /* else if (wc_len < 0 ||
-						o >= NTFS_MAX_NAME_LEN) */
-				goto name_err;
-			}
-			ucs[o] = 0;
-			*outs = ucs;
-			return o;
-		} /* else if (!ucs) */
-		ntfs_error(vol->sb, "Failed to allocate buffer for converted "
-				"name from ntfs_name_cache.");
-		return -ENOMEM;
-	} /* else if (!ins) */
-	ntfs_error(vol->sb, "Received NULL pointer.");
-	return -EINVAL;
-name_err:
-	kmem_cache_free(ntfs_name_cache, ucs);
-	if (wc_len < 0) {
-		ntfs_error(vol->sb, "Name using character set %s contains "
-				"characters that cannot be converted to "
-				"Unicode.", nls->charset);
-		i = -EILSEQ;
-	} else /* if (o >= NTFS_MAX_NAME_LEN) */ {
-		ntfs_error(vol->sb, "Name is too long (maximum length for a "
-				"name on NTFS is %d Unicode characters.",
-				NTFS_MAX_NAME_LEN);
-		i = -ENAMETOOLONG;
-	}
-	return i;
-}
-
-/**
- * ntfs_ucstonls - convert little endian Unicode string to NLS string
- * @vol:	ntfs volume which we are working with
- * @ins:	input Unicode string buffer
- * @ins_len:	length of input string in Unicode characters
- * @outs:	on return contains the (allocated) output NLS string buffer
- * @outs_len:	length of output string buffer in bytes
- *
- * Convert the input little endian, 2-byte Unicode string @ins, of length
- * @ins_len into the string format dictated by the loaded NLS.
- *
- * If *@outs is NULL, this function allocates the string and the caller is
- * responsible for calling kfree(*@outs); when finished with it. In this case
- * @outs_len is ignored and can be 0.
- *
- * On success the function returns the number of bytes written to the output
- * string *@outs (>= 0), not counting the terminating NULL byte. If the output
- * string buffer was allocated, *@outs is set to it.
- *
- * On error, a negative number corresponding to the error code is returned. In
- * that case the output string is not allocated. The contents of *@outs are
- * then undefined.
- *
- * This might look a bit odd due to fast path optimization...
- */
-int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
-		const int ins_len, unsigned char **outs, int outs_len)
-{
-	struct nls_table *nls = vol->nls_map;
-	unsigned char *ns;
-	int i, o, ns_len, wc;
-
-	/* We don't trust outside sources. */
-	if (ins) {
-		ns = *outs;
-		ns_len = outs_len;
-		if (ns && !ns_len) {
-			wc = -ENAMETOOLONG;
-			goto conversion_err;
-		}
-		if (!ns) {
-			ns_len = ins_len * NLS_MAX_CHARSET_SIZE;
-			ns = kmalloc(ns_len + 1, GFP_NOFS);
-			if (!ns)
-				goto mem_err_out;
-		}
-		for (i = o = 0; i < ins_len; i++) {
-retry:			wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
-					ns_len - o);
-			if (wc > 0) {
-				o += wc;
-				continue;
-			} else if (!wc)
-				break;
-			else if (wc == -ENAMETOOLONG && ns != *outs) {
-				unsigned char *tc;
-				/* Grow in multiples of 64 bytes. */
-				tc = kmalloc((ns_len + 64) &
-						~63, GFP_NOFS);
-				if (tc) {
-					memcpy(tc, ns, ns_len);
-					ns_len = ((ns_len + 64) & ~63) - 1;
-					kfree(ns);
-					ns = tc;
-					goto retry;
-				} /* No memory so goto conversion_error; */
-			} /* wc < 0, real error. */
-			goto conversion_err;
-		}
-		ns[o] = 0;
-		*outs = ns;
-		return o;
-	} /* else (!ins) */
-	ntfs_error(vol->sb, "Received NULL pointer.");
-	return -EINVAL;
-conversion_err:
-	ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
-			"converted to character set %s.  You might want to "
-			"try to use the mount option nls=utf8.", nls->charset);
-	if (ns != *outs)
-		kfree(ns);
-	if (wc != -ENAMETOOLONG)
-		wc = -EILSEQ;
-	return wc;
-mem_err_out:
-	ntfs_error(vol->sb, "Failed to allocate name!");
-	return -ENOMEM;
-}
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
deleted file mode 100644
index 4ebe84a78dea..000000000000
--- a/fs/ntfs/upcase.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * upcase.c - Generate the full NTFS Unicode upcase table in little endian.
- *	      Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org>
- * Copyright (c) 2001-2006 Anton Altaparmakov
- */
-
-#include "malloc.h"
-#include "ntfs.h"
-
-ntfschar *generate_default_upcase(void)
-{
-	static const int uc_run_table[][3] = { /* Start, End, Add */
-	{0x0061, 0x007B,  -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72,  74},
-	{0x00E0, 0x00F7,  -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76,  86},
-	{0x00F8, 0x00FF,  -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
-	{0x0256, 0x0258, -205}, {0x1F00, 0x1F08,   8}, {0x1F78, 0x1F7A, 128},
-	{0x028A, 0x028C, -217}, {0x1F10, 0x1F16,   8}, {0x1F7A, 0x1F7C, 112},
-	{0x03AC, 0x03AD,  -38}, {0x1F20, 0x1F28,   8}, {0x1F7C, 0x1F7E, 126},
-	{0x03AD, 0x03B0,  -37}, {0x1F30, 0x1F38,   8}, {0x1FB0, 0x1FB2,   8},
-	{0x03B1, 0x03C2,  -32}, {0x1F40, 0x1F46,   8}, {0x1FD0, 0x1FD2,   8},
-	{0x03C2, 0x03C3,  -31}, {0x1F51, 0x1F52,   8}, {0x1FE0, 0x1FE2,   8},
-	{0x03C3, 0x03CC,  -32}, {0x1F53, 0x1F54,   8}, {0x1FE5, 0x1FE6,   7},
-	{0x03CC, 0x03CD,  -64}, {0x1F55, 0x1F56,   8}, {0x2170, 0x2180, -16},
-	{0x03CD, 0x03CF,  -63}, {0x1F57, 0x1F58,   8}, {0x24D0, 0x24EA, -26},
-	{0x0430, 0x0450,  -32}, {0x1F60, 0x1F68,   8}, {0xFF41, 0xFF5B, -32},
-	{0}
-	};
-
-	static const int uc_dup_table[][2] = { /* Start, End */
-	{0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
-	{0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
-	{0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
-	{0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
-	{0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
-	{0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
-	{0}
-	};
-
-	static const int uc_word_table[][2] = { /* Offset, Value */
-	{0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
-	{0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
-	{0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
-	{0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
-	{0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
-	{0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
-	{0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
-	{0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
-	{0}
-	};
-
-	int i, r;
-	ntfschar *uc;
-
-	uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar));
-	if (!uc)
-		return uc;
-	memset(uc, 0, default_upcase_len * sizeof(ntfschar));
-	/* Generate the little endian Unicode upcase table used by ntfs. */
-	for (i = 0; i < default_upcase_len; i++)
-		uc[i] = cpu_to_le16(i);
-	for (r = 0; uc_run_table[r][0]; r++)
-		for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
-			le16_add_cpu(&uc[i], uc_run_table[r][2]);
-	for (r = 0; uc_dup_table[r][0]; r++)
-		for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
-			le16_add_cpu(&uc[i + 1], -1);
-	for (r = 0; uc_word_table[r][0]; r++)
-		uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]);
-	return uc;
-}
diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c
deleted file mode 100644
index 9097a0b4ef25..000000000000
--- a/fs/ntfs/usnjrnl.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling.  Part of the
- *	       Linux-NTFS project.
- *
- * Copyright (c) 2005 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
-
-#include "aops.h"
-#include "debug.h"
-#include "endian.h"
-#include "time.h"
-#include "types.h"
-#include "usnjrnl.h"
-#include "volume.h"
-
-/**
- * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume
- * @vol:	ntfs volume on which to stamp the transaction log
- *
- * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return
- * 'true' on success and 'false' on error.
- *
- * This function assumes that the transaction log has already been loaded and
- * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl().
- */
-bool ntfs_stamp_usnjrnl(ntfs_volume *vol)
-{
-	ntfs_debug("Entering.");
-	if (likely(!NVolUsnJrnlStamped(vol))) {
-		sle64 stamp;
-		struct page *page;
-		USN_HEADER *uh;
-
-		page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
-		if (IS_ERR(page)) {
-			ntfs_error(vol->sb, "Failed to read from "
-					"$UsnJrnl/$DATA/$Max attribute.");
-			return false;
-		}
-		uh = (USN_HEADER*)page_address(page);
-		stamp = get_current_ntfs_time();
-		ntfs_debug("Stamping transaction log ($UsnJrnl): old "
-				"journal_id 0x%llx, old lowest_valid_usn "
-				"0x%llx, new journal_id 0x%llx, new "
-				"lowest_valid_usn 0x%llx.",
-				(long long)sle64_to_cpu(uh->journal_id),
-				(long long)sle64_to_cpu(uh->lowest_valid_usn),
-				(long long)sle64_to_cpu(stamp),
-				i_size_read(vol->usnjrnl_j_ino));
-		uh->lowest_valid_usn =
-				cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino));
-		uh->journal_id = stamp;
-		flush_dcache_page(page);
-		set_page_dirty(page);
-		ntfs_unmap_page(page);
-		/* Set the flag so we do not have to do it again on remount. */
-		NVolSetUsnJrnlStamped(vol);
-	}
-	ntfs_debug("Done.");
-	return true;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
deleted file mode 100644
index 85f531b59395..000000000000
--- a/fs/ntfs/usnjrnl.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling.
- *	       Part of the Linux-NTFS project.
- *
- * Copyright (c) 2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_USNJRNL_H
-#define _LINUX_NTFS_USNJRNL_H
-
-#ifdef NTFS_RW
-
-#include "types.h"
-#include "endian.h"
-#include "layout.h"
-#include "volume.h"
-
-/*
- * Transaction log ($UsnJrnl) organization:
- *
- * The transaction log records whenever a file is modified in any way.  So for
- * example it will record that file "blah" was written to at a particular time
- * but not what was written.  If will record that a file was deleted or
- * created, that a file was truncated, etc.  See below for all the reason
- * codes used.
- *
- * The transaction log is in the $Extend directory which is in the root
- * directory of each volume.  If it is not present it means transaction
- * logging is disabled.  If it is present it means transaction logging is
- * either enabled or in the process of being disabled in which case we can
- * ignore it as it will go away as soon as Windows gets its hands on it.
- *
- * To determine whether the transaction logging is enabled or in the process
- * of being disabled, need to check the volume flags in the
- * $VOLUME_INFORMATION attribute in the $Volume system file (which is present
- * in the root directory and has a fixed mft record number, see layout.h).
- * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log
- * is in the process of being disabled and if this flag is clear it means the
- * transaction log is enabled.
- *
- * The transaction log consists of two parts; the $DATA/$Max attribute as well
- * as the $DATA/$J attribute.  $Max is a header describing the transaction
- * log whilst $J is the transaction log data itself as a sequence of variable
- * sized USN_RECORDs (see below for all the structures).
- *
- * We do not care about transaction logging at this point in time but we still
- * need to let windows know that the transaction log is out of date.  To do
- * this we need to stamp the transaction log.  This involves setting the
- * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used
- * for the next added USN_RECORD to the $DATA/$J attribute as well as
- * generating a new journal_id in $DATA/$Max.
- *
- * The journal_id is as of the current version (2.0) of the transaction log
- * simply the 64-bit timestamp of when the journal was either created or last
- * stamped.
- *
- * To determine the next usn there are two ways.  The first is to parse
- * $DATA/$J and to find the last USN_RECORD in it and to add its record_length
- * to its usn (which is the byte offset in the $DATA/$J attribute).  The
- * second is simply to take the data size of the attribute.  Since the usns
- * are simply byte offsets into $DATA/$J, this is exactly the next usn.  For
- * obvious reasons we use the second method as it is much simpler and faster.
- *
- * As an aside, note that to actually disable the transaction log, one would
- * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go
- * through all the mft records on the volume and set the usn field in their
- * $STANDARD_INFORMATION attribute to zero.  Once that is done, one would need
- * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally,
- * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag.
- *
- * Note that if a volume is unmounted whilst the transaction log is being
- * disabled, the process will continue the next time the volume is mounted.
- * This is why we can safely mount read-write when we see a transaction log
- * in the process of being deleted.
- */
-
-/* Some $UsnJrnl related constants. */
-#define UsnJrnlMajorVer		2
-#define UsnJrnlMinorVer		0
-
-/*
- * $DATA/$Max attribute.  This is (always?) resident and has a fixed size of
- * 32 bytes.  It contains the header describing the transaction log.
- */
-typedef struct {
-/*Ofs*/
-/*   0*/sle64 maximum_size;	/* The maximum on-disk size of the $DATA/$J
-				   attribute. */
-/*   8*/sle64 allocation_delta;	/* Number of bytes by which to increase the
-				   size of the $DATA/$J attribute. */
-/*0x10*/sle64 journal_id;	/* Current id of the transaction log. */
-/*0x18*/leUSN lowest_valid_usn;	/* Lowest valid usn in $DATA/$J for the
-				   current journal_id. */
-/* sizeof() = 32 (0x20) bytes */
-} __attribute__ ((__packed__)) USN_HEADER;
-
-/*
- * Reason flags (32-bit).  Cumulative flags describing the change(s) to the
- * file since it was last opened.  I think the names speak for themselves but
- * if you disagree check out the descriptions in the Linux NTFS project NTFS
- * documentation: http://www.linux-ntfs.org/
- */
-enum {
-	USN_REASON_DATA_OVERWRITE	= cpu_to_le32(0x00000001),
-	USN_REASON_DATA_EXTEND		= cpu_to_le32(0x00000002),
-	USN_REASON_DATA_TRUNCATION	= cpu_to_le32(0x00000004),
-	USN_REASON_NAMED_DATA_OVERWRITE	= cpu_to_le32(0x00000010),
-	USN_REASON_NAMED_DATA_EXTEND	= cpu_to_le32(0x00000020),
-	USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
-	USN_REASON_FILE_CREATE		= cpu_to_le32(0x00000100),
-	USN_REASON_FILE_DELETE		= cpu_to_le32(0x00000200),
-	USN_REASON_EA_CHANGE		= cpu_to_le32(0x00000400),
-	USN_REASON_SECURITY_CHANGE	= cpu_to_le32(0x00000800),
-	USN_REASON_RENAME_OLD_NAME	= cpu_to_le32(0x00001000),
-	USN_REASON_RENAME_NEW_NAME	= cpu_to_le32(0x00002000),
-	USN_REASON_INDEXABLE_CHANGE	= cpu_to_le32(0x00004000),
-	USN_REASON_BASIC_INFO_CHANGE	= cpu_to_le32(0x00008000),
-	USN_REASON_HARD_LINK_CHANGE	= cpu_to_le32(0x00010000),
-	USN_REASON_COMPRESSION_CHANGE	= cpu_to_le32(0x00020000),
-	USN_REASON_ENCRYPTION_CHANGE	= cpu_to_le32(0x00040000),
-	USN_REASON_OBJECT_ID_CHANGE	= cpu_to_le32(0x00080000),
-	USN_REASON_REPARSE_POINT_CHANGE	= cpu_to_le32(0x00100000),
-	USN_REASON_STREAM_CHANGE	= cpu_to_le32(0x00200000),
-	USN_REASON_CLOSE		= cpu_to_le32(0x80000000),
-};
-
-typedef le32 USN_REASON_FLAGS;
-
-/*
- * Source info flags (32-bit).  Information about the source of the change(s)
- * to the file.  For detailed descriptions of what these mean, see the Linux
- * NTFS project NTFS documentation:
- *	http://www.linux-ntfs.org/
- */
-enum {
-	USN_SOURCE_DATA_MANAGEMENT	  = cpu_to_le32(0x00000001),
-	USN_SOURCE_AUXILIARY_DATA	  = cpu_to_le32(0x00000002),
-	USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
-};
-
-typedef le32 USN_SOURCE_INFO_FLAGS;
-
-/*
- * $DATA/$J attribute.  This is always non-resident, is marked as sparse, and
- * is of variabled size.  It consists of a sequence of variable size
- * USN_RECORDS.  The minimum allocated_size is allocation_delta as
- * specified in $DATA/$Max.  When the maximum_size specified in $DATA/$Max is
- * exceeded by more than allocation_delta bytes, allocation_delta bytes are
- * allocated and appended to the $DATA/$J attribute and an equal number of
- * bytes at the beginning of the attribute are freed and made sparse.  Note the
- * making sparse only happens at volume checkpoints and hence the actual
- * $DATA/$J size can exceed maximum_size + allocation_delta temporarily.
- */
-typedef struct {
-/*Ofs*/
-/*   0*/le32 length;		/* Byte size of this record (8-byte
-				   aligned). */
-/*   4*/le16 major_ver;		/* Major version of the transaction log used
-				   for this record. */
-/*   6*/le16 minor_ver;		/* Minor version of the transaction log used
-				   for this record. */
-/*   8*/leMFT_REF mft_reference;/* The mft reference of the file (or
-				   directory) described by this record. */
-/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent
-				   directory of the file described by this
-				   record. */
-/*0x18*/leUSN usn;		/* The usn of this record.  Equals the offset
-				   within the $DATA/$J attribute. */
-/*0x20*/sle64 time;		/* Time when this record was created. */
-/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */
-/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */
-/*0x30*/le32 security_id;	/* File security_id copied from
-				   $STANDARD_INFORMATION. */
-/*0x34*/FILE_ATTR_FLAGS file_attributes;	/* File attributes copied from
-				   $STANDARD_INFORMATION or $FILE_NAME (not
-				   sure which). */
-/*0x38*/le16 file_name_size;	/* Size of the file name in bytes. */
-/*0x3a*/le16 file_name_offset;	/* Offset to the file name in bytes from the
-				   start of this record. */
-/*0x3c*/ntfschar file_name[0];	/* Use when creating only.  When reading use
-				   file_name_offset to determine the location
-				   of the name. */
-/* sizeof() = 60 (0x3c) bytes */
-} __attribute__ ((__packed__)) USN_RECORD;
-
-extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_USNJRNL_H */
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
deleted file mode 100644
index 930a9ae8a053..000000000000
--- a/fs/ntfs/volume.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part
- *	      of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2006 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_VOLUME_H
-#define _LINUX_NTFS_VOLUME_H
-
-#include <linux/rwsem.h>
-#include <linux/uidgid.h>
-
-#include "types.h"
-#include "layout.h"
-
-/*
- * The NTFS in memory super block structure.
- */
-typedef struct {
-	/*
-	 * FIXME: Reorder to have commonly used together element within the
-	 * same cache line, aiming at a cache line size of 32 bytes. Aim for
-	 * 64 bytes for less commonly used together elements. Put most commonly
-	 * used elements to front of structure. Obviously do this only when the
-	 * structure has stabilized... (AIA)
-	 */
-	/* Device specifics. */
-	struct super_block *sb;		/* Pointer back to the super_block. */
-	LCN nr_blocks;			/* Number of sb->s_blocksize bytes
-					   sized blocks on the device. */
-	/* Configuration provided by user at mount time. */
-	unsigned long flags;		/* Miscellaneous flags, see below. */
-	kuid_t uid;			/* uid that files will be mounted as. */
-	kgid_t gid;			/* gid that files will be mounted as. */
-	umode_t fmask;			/* The mask for file permissions. */
-	umode_t dmask;			/* The mask for directory
-					   permissions. */
-	u8 mft_zone_multiplier;		/* Initial mft zone multiplier. */
-	u8 on_errors;			/* What to do on filesystem errors. */
-	/* NTFS bootsector provided information. */
-	u16 sector_size;		/* in bytes */
-	u8 sector_size_bits;		/* log2(sector_size) */
-	u32 cluster_size;		/* in bytes */
-	u32 cluster_size_mask;		/* cluster_size - 1 */
-	u8 cluster_size_bits;		/* log2(cluster_size) */
-	u32 mft_record_size;		/* in bytes */
-	u32 mft_record_size_mask;	/* mft_record_size - 1 */
-	u8 mft_record_size_bits;	/* log2(mft_record_size) */
-	u32 index_record_size;		/* in bytes */
-	u32 index_record_size_mask;	/* index_record_size - 1 */
-	u8 index_record_size_bits;	/* log2(index_record_size) */
-	LCN nr_clusters;		/* Volume size in clusters == number of
-					   bits in lcn bitmap. */
-	LCN mft_lcn;			/* Cluster location of mft data. */
-	LCN mftmirr_lcn;		/* Cluster location of copy of mft. */
-	u64 serial_no;			/* The volume serial number. */
-	/* Mount specific NTFS information. */
-	u32 upcase_len;			/* Number of entries in upcase[]. */
-	ntfschar *upcase;		/* The upcase table. */
-
-	s32 attrdef_size;		/* Size of the attribute definition
-					   table in bytes. */
-	ATTR_DEF *attrdef;		/* Table of attribute definitions.
-					   Obtained from FILE_AttrDef. */
-
-#ifdef NTFS_RW
-	/* Variables used by the cluster and mft allocators. */
-	s64 mft_data_pos;		/* Mft record number at which to
-					   allocate the next mft record. */
-	LCN mft_zone_start;		/* First cluster of the mft zone. */
-	LCN mft_zone_end;		/* First cluster beyond the mft zone. */
-	LCN mft_zone_pos;		/* Current position in the mft zone. */
-	LCN data1_zone_pos;		/* Current position in the first data
-					   zone. */
-	LCN data2_zone_pos;		/* Current position in the second data
-					   zone. */
-#endif /* NTFS_RW */
-
-	struct inode *mft_ino;		/* The VFS inode of $MFT. */
-
-	struct inode *mftbmp_ino;	/* Attribute inode for $MFT/$BITMAP. */
-	struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the
-					    mft record bitmap ($MFT/$BITMAP). */
-#ifdef NTFS_RW
-	struct inode *mftmirr_ino;	/* The VFS inode of $MFTMirr. */
-	int mftmirr_size;		/* Size of mft mirror in mft records. */
-
-	struct inode *logfile_ino;	/* The VFS inode of $LogFile. */
-#endif /* NTFS_RW */
-
-	struct inode *lcnbmp_ino;	/* The VFS inode of $Bitmap. */
-	struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the
-					    cluster bitmap ($Bitmap/$DATA). */
-
-	struct inode *vol_ino;		/* The VFS inode of $Volume. */
-	VOLUME_FLAGS vol_flags;		/* Volume flags. */
-	u8 major_ver;			/* Ntfs major version of volume. */
-	u8 minor_ver;			/* Ntfs minor version of volume. */
-
-	struct inode *root_ino;		/* The VFS inode of the root
-					   directory. */
-	struct inode *secure_ino;	/* The VFS inode of $Secure (NTFS3.0+
-					   only, otherwise NULL). */
-	struct inode *extend_ino;	/* The VFS inode of $Extend (NTFS3.0+
-					   only, otherwise NULL). */
-#ifdef NTFS_RW
-	/* $Quota stuff is NTFS3.0+ specific.  Unused/NULL otherwise. */
-	struct inode *quota_ino;	/* The VFS inode of $Quota. */
-	struct inode *quota_q_ino;	/* Attribute inode for $Quota/$Q. */
-	/* $UsnJrnl stuff is NTFS3.0+ specific.  Unused/NULL otherwise. */
-	struct inode *usnjrnl_ino;	/* The VFS inode of $UsnJrnl. */
-	struct inode *usnjrnl_max_ino;	/* Attribute inode for $UsnJrnl/$Max. */
-	struct inode *usnjrnl_j_ino;	/* Attribute inode for $UsnJrnl/$J. */
-#endif /* NTFS_RW */
-	struct nls_table *nls_map;
-} ntfs_volume;
-
-/*
- * Defined bits for the flags field in the ntfs_volume structure.
- */
-typedef enum {
-	NV_Errors,		/* 1: Volume has errors, prevent remount rw. */
-	NV_ShowSystemFiles,	/* 1: Return system files in ntfs_readdir(). */
-	NV_CaseSensitive,	/* 1: Treat file names as case sensitive and
-				      create filenames in the POSIX namespace.
-				      Otherwise be case insensitive but still
-				      create file names in POSIX namespace. */
-	NV_LogFileEmpty,	/* 1: $LogFile journal is empty. */
-	NV_QuotaOutOfDate,	/* 1: $Quota is out of date. */
-	NV_UsnJrnlStamped,	/* 1: $UsnJrnl has been stamped. */
-	NV_SparseEnabled,	/* 1: May create sparse files. */
-} ntfs_volume_flags;
-
-/*
- * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo()
- * functions.
- */
-#define DEFINE_NVOL_BIT_OPS(flag)					\
-static inline int NVol##flag(ntfs_volume *vol)		\
-{							\
-	return test_bit(NV_##flag, &(vol)->flags);	\
-}							\
-static inline void NVolSet##flag(ntfs_volume *vol)	\
-{							\
-	set_bit(NV_##flag, &(vol)->flags);		\
-}							\
-static inline void NVolClear##flag(ntfs_volume *vol)	\
-{							\
-	clear_bit(NV_##flag, &(vol)->flags);		\
-}
-
-/* Emit the ntfs volume bitops functions. */
-DEFINE_NVOL_BIT_OPS(Errors)
-DEFINE_NVOL_BIT_OPS(ShowSystemFiles)
-DEFINE_NVOL_BIT_OPS(CaseSensitive)
-DEFINE_NVOL_BIT_OPS(LogFileEmpty)
-DEFINE_NVOL_BIT_OPS(QuotaOutOfDate)
-DEFINE_NVOL_BIT_OPS(UsnJrnlStamped)
-DEFINE_NVOL_BIT_OPS(SparseEnabled)
-
-#endif /* _LINUX_NTFS_VOLUME_H */
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index 63f70259edc0..7aadf5010999 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -886,7 +886,7 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
 	struct runs_tree *run = &ni->file.run;
 	struct ntfs_sb_info *sbi;
 	u8 cluster_bits;
-	struct ATTRIB *attr = NULL, *attr_b;
+	struct ATTRIB *attr, *attr_b;
 	struct ATTR_LIST_ENTRY *le, *le_b;
 	struct mft_inode *mi, *mi_b;
 	CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end, vcn0, alen;
@@ -904,12 +904,8 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
 		*len = 0;
 	up_read(&ni->file.run_lock);
 
-	if (*len) {
-		if (*lcn != SPARSE_LCN || !new)
-			return 0; /* Fast normal way without allocation. */
-		else if (clen > *len)
-			clen = *len;
-	}
+	if (*len && (*lcn != SPARSE_LCN || !new))
+		return 0; /* Fast normal way without allocation. */
 
 	/* No cluster in cache or we need to allocate cluster in hole. */
 	sbi = ni->mi.sbi;
@@ -918,6 +914,17 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
 	ni_lock(ni);
 	down_write(&ni->file.run_lock);
 
+	/* Repeat the code above (under write lock). */
+	if (!run_lookup_entry(run, vcn, lcn, len, NULL))
+		*len = 0;
+
+	if (*len) {
+		if (*lcn != SPARSE_LCN || !new)
+			goto out; /* normal way without allocation. */
+		if (clen > *len)
+			clen = *len;
+	}
+
 	le_b = NULL;
 	attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b);
 	if (!attr_b) {
@@ -1736,8 +1743,10 @@ repack:
 			le_b = NULL;
 			attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL,
 					      0, NULL, &mi_b);
-			if (!attr_b)
-				return -ENOENT;
+			if (!attr_b) {
+				err = -ENOENT;
+				goto out;
+			}
 
 			attr = attr_b;
 			le = le_b;
@@ -1818,13 +1827,15 @@ ins_ext:
 ok:
 	run_truncate_around(run, vcn);
 out:
-	if (new_valid > data_size)
-		new_valid = data_size;
+	if (attr_b) {
+		if (new_valid > data_size)
+			new_valid = data_size;
 
-	valid_size = le64_to_cpu(attr_b->nres.valid_size);
-	if (new_valid != valid_size) {
-		attr_b->nres.valid_size = cpu_to_le64(valid_size);
-		mi_b->dirty = true;
+		valid_size = le64_to_cpu(attr_b->nres.valid_size);
+		if (new_valid != valid_size) {
+			attr_b->nres.valid_size = cpu_to_le64(valid_size);
+			mi_b->dirty = true;
+		}
 	}
 
 	return err;
@@ -2073,7 +2084,7 @@ next_attr:
 
 	/* Update inode size. */
 	ni->i_valid = valid_size;
-	ni->vfs_inode.i_size = data_size;
+	i_size_write(&ni->vfs_inode, data_size);
 	inode_set_bytes(&ni->vfs_inode, total_size);
 	ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
 	mark_inode_dirty(&ni->vfs_inode);
@@ -2488,7 +2499,7 @@ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
 	mi_b->dirty = true;
 
 done:
-	ni->vfs_inode.i_size += bytes;
+	i_size_write(&ni->vfs_inode, ni->vfs_inode.i_size + bytes);
 	ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
 	mark_inode_dirty(&ni->vfs_inode);
 
diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c
index 7c01735d1219..9f4bd8d26090 100644
--- a/fs/ntfs3/attrlist.c
+++ b/fs/ntfs3/attrlist.c
@@ -29,7 +29,7 @@ static inline bool al_is_valid_le(const struct ntfs_inode *ni,
 void al_destroy(struct ntfs_inode *ni)
 {
 	run_close(&ni->attr_list.run);
-	kfree(ni->attr_list.le);
+	kvfree(ni->attr_list.le);
 	ni->attr_list.le = NULL;
 	ni->attr_list.size = 0;
 	ni->attr_list.dirty = false;
@@ -127,12 +127,13 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni,
 {
 	size_t off;
 	u16 sz;
+	const unsigned le_min_size = le_size(0);
 
 	if (!le) {
 		le = ni->attr_list.le;
 	} else {
 		sz = le16_to_cpu(le->size);
-		if (sz < sizeof(struct ATTR_LIST_ENTRY)) {
+		if (sz < le_min_size) {
 			/* Impossible 'cause we should not return such le. */
 			return NULL;
 		}
@@ -141,7 +142,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni,
 
 	/* Check boundary. */
 	off = PtrOffset(ni->attr_list.le, le);
-	if (off + sizeof(struct ATTR_LIST_ENTRY) > ni->attr_list.size) {
+	if (off + le_min_size > ni->attr_list.size) {
 		/* The regular end of list. */
 		return NULL;
 	}
@@ -149,8 +150,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni,
 	sz = le16_to_cpu(le->size);
 
 	/* Check le for errors. */
-	if (sz < sizeof(struct ATTR_LIST_ENTRY) ||
-	    off + sz > ni->attr_list.size ||
+	if (sz < le_min_size || off + sz > ni->attr_list.size ||
 	    sz < le->name_off + le->name_len * sizeof(short)) {
 		return NULL;
 	}
@@ -318,7 +318,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name,
 		memcpy(ptr, al->le, off);
 		memcpy(Add2Ptr(ptr, off + sz), le, old_size - off);
 		le = Add2Ptr(ptr, off);
-		kfree(al->le);
+		kvfree(al->le);
 		al->le = ptr;
 	} else {
 		memmove(Add2Ptr(le, sz), le, old_size - off);
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index 63f14a0232f6..845f9b22deef 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -124,7 +124,7 @@ void wnd_close(struct wnd_bitmap *wnd)
 {
 	struct rb_node *node, *next;
 
-	kfree(wnd->free_bits);
+	kvfree(wnd->free_bits);
 	wnd->free_bits = NULL;
 	run_close(&wnd->run);
 
@@ -1360,7 +1360,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
 		memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short));
 		memset(new_free + wnd->nwnd, 0,
 		       (new_wnd - wnd->nwnd) * sizeof(short));
-		kfree(wnd->free_bits);
+		kvfree(wnd->free_bits);
 		wnd->free_bits = new_free;
 	}
 
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index ec0566b322d5..5cf3d9decf64 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -309,11 +309,31 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
 		return 0;
 	}
 
-	/* NTFS: symlinks are "dir + reparse" or "file + reparse" */
-	if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT)
-		dt_type = DT_LNK;
-	else
-		dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
+	/*
+	 * NTFS: symlinks are "dir + reparse" or "file + reparse".
+	 * Unfortunately the reparse attribute is used for many purposes (several dozen).
+	 * It is not possible to know here whether this name is a symlink or not.
+	 * To get the exact type of the name we would have to open the inode (read mft).
+	 * getattr for an opened file (fstat) correctly returns symlink.
+	 */
+	dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
+
+	/*
+	 * It is not reliable to detect the type of a name using the duplicated
+	 * information stored in the parent directory.
+	 * The only correct way to get the type is to read the MFT record and find ATTR_STD.
+	 * The code below is not a good idea:
+	 * it takes additional locks and does additional reads just to get the type.
+	 * Should we use an additional mount option to enable the branch below?
+	 */
+	if ((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) &&
+	    ino != ni->mi.rno) {
+		struct inode *inode = ntfs_iget5(sbi->sb, &e->ref, NULL);
+		if (!IS_ERR_OR_NULL(inode)) {
+			dt_type = fs_umode_to_dtype(inode->i_mode);
+			iput(inode);
+		}
+	}
 
 	return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type);
 }
@@ -495,11 +515,9 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
 	struct INDEX_HDR *hdr;
 	const struct ATTR_FILE_NAME *fname;
 	u32 e_size, off, end;
-	u64 vbo = 0;
 	size_t drs = 0, fles = 0, bit = 0;
-	loff_t i_size = ni->vfs_inode.i_size;
 	struct indx_node *node = NULL;
-	u8 index_bits = ni->dir.index_bits;
+	size_t max_indx = i_size_read(&ni->vfs_inode) >> ni->dir.index_bits;
 
 	if (is_empty)
 		*is_empty = true;
@@ -518,8 +536,10 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
 			e = Add2Ptr(hdr, off);
 			e_size = le16_to_cpu(e->size);
 			if (e_size < sizeof(struct NTFS_DE) ||
-			    off + e_size > end)
+			    off + e_size > end) {
+				/* Looks like corruption. */
 				break;
+			}
 
 			if (de_is_last(e))
 				break;
@@ -543,7 +563,7 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
 				fles += 1;
 		}
 
-		if (vbo >= i_size)
+		if (bit >= max_indx)
 			goto out;
 
 		err = indx_used_bit(&ni->dir, ni, &bit);
@@ -553,8 +573,7 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
 		if (bit == MINUS_ONE_T)
 			goto out;
 
-		vbo = (u64)bit << index_bits;
-		if (vbo >= i_size)
+		if (bit >= max_indx)
 			goto out;
 
 		err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits,
@@ -564,7 +583,6 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
 
 		hdr = &node->index->ihdr;
 		bit += 1;
-		vbo = (u64)bit << ni->dir.idx2vbn_bits;
 	}
 
 out:
@@ -593,5 +611,9 @@ const struct file_operations ntfs_dir_operations = {
 	.iterate_shared	= ntfs_readdir,
 	.fsync		= generic_file_fsync,
 	.open		= ntfs_file_open,
+	.unlocked_ioctl = ntfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ntfs_compat_ioctl,
+#endif
 };
 // clang-format on
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index a5a30a24ce5d..5418662c80d8 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -48,7 +48,7 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
 	return 0;
 }
 
-static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
+long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
 	struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
@@ -61,7 +61,7 @@ static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
 }
 
 #ifdef CONFIG_COMPAT
-static long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg)
+long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg)
 
 {
 	return ntfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
@@ -188,6 +188,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
 	u32 bh_next, bh_off, to;
 	sector_t iblock;
 	struct folio *folio;
+	bool dirty = false;
 
 	for (; idx < idx_end; idx += 1, from = 0) {
 		page_off = (loff_t)idx << PAGE_SHIFT;
@@ -223,29 +224,27 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
 			/* Ok, it's mapped. Make sure it's up-to-date. */
 			if (folio_test_uptodate(folio))
 				set_buffer_uptodate(bh);
-
-			if (!buffer_uptodate(bh)) {
-				err = bh_read(bh, 0);
-				if (err < 0) {
-					folio_unlock(folio);
-					folio_put(folio);
-					goto out;
-				}
+			else if (bh_read(bh, 0) < 0) {
+				err = -EIO;
+				folio_unlock(folio);
+				folio_put(folio);
+				goto out;
 			}
 
 			mark_buffer_dirty(bh);
-
 		} while (bh_off = bh_next, iblock += 1,
 			 head != (bh = bh->b_this_page));
 
 		folio_zero_segment(folio, from, to);
+		dirty = true;
 
 		folio_unlock(folio);
 		folio_put(folio);
 		cond_resched();
 	}
 out:
-	mark_inode_dirty(inode);
+	if (dirty)
+		mark_inode_dirty(inode);
 	return err;
 }
 
@@ -261,6 +260,9 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	bool rw = vma->vm_flags & VM_WRITE;
 	int err;
 
+	if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
 	if (is_encrypted(ni)) {
 		ntfs_inode_warn(inode, "mmap encrypted not supported");
 		return -EOPNOTSUPP;
@@ -499,10 +501,14 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
 		ni_lock(ni);
 		err = attr_punch_hole(ni, vbo, len, &frame_size);
 		ni_unlock(ni);
+		if (!err)
+			goto ok;
+
 		if (err != E_NTFS_NOTALIGNED)
 			goto out;
 
 		/* Process not aligned punch. */
+		err = 0;
 		mask = frame_size - 1;
 		vbo_a = (vbo + mask) & ~mask;
 		end_a = end & ~mask;
@@ -525,6 +531,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
 			ni_lock(ni);
 			err = attr_punch_hole(ni, vbo_a, end_a - vbo_a, NULL);
 			ni_unlock(ni);
+			if (err)
+				goto out;
 		}
 	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
 		/*
@@ -564,6 +572,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
 		ni_lock(ni);
 		err = attr_insert_range(ni, vbo, len);
 		ni_unlock(ni);
+		if (err)
+			goto out;
 	} else {
 		/* Check new size. */
 		u8 cluster_bits = sbi->cluster_bits;
@@ -633,11 +643,18 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
 					    &ni->file.run, i_size, &ni->i_valid,
 					    true, NULL);
 			ni_unlock(ni);
+			if (err)
+				goto out;
 		} else if (new_size > i_size) {
-			inode->i_size = new_size;
+			i_size_write(inode, new_size);
 		}
 	}
 
+ok:
+	err = file_modified(file);
+	if (err)
+		goto out;
+
 out:
 	if (map_locked)
 		filemap_invalidate_unlock(mapping);
@@ -663,6 +680,9 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 	umode_t mode = inode->i_mode;
 	int err;
 
+	if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
 	err = setattr_prepare(idmap, dentry, attr);
 	if (err)
 		goto out;
@@ -676,7 +696,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 			goto out;
 		}
 		inode_dio_wait(inode);
-		oldsize = inode->i_size;
+		oldsize = i_size_read(inode);
 		newsize = attr->ia_size;
 
 		if (newsize <= oldsize)
@@ -688,7 +708,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 			goto out;
 
 		ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
-		inode->i_size = newsize;
+		i_size_write(inode, newsize);
 	}
 
 	setattr_copy(idmap, inode, attr);
@@ -718,6 +738,9 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 	struct inode *inode = file->f_mapping->host;
 	struct ntfs_inode *ni = ntfs_i(inode);
 
+	if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
 	if (is_encrypted(ni)) {
 		ntfs_inode_warn(inode, "encrypted i/o not supported");
 		return -EOPNOTSUPP;
@@ -752,6 +775,9 @@ static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos,
 	struct inode *inode = in->f_mapping->host;
 	struct ntfs_inode *ni = ntfs_i(inode);
 
+	if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
 	if (is_encrypted(ni)) {
 		ntfs_inode_warn(inode, "encrypted i/o not supported");
 		return -EOPNOTSUPP;
@@ -821,7 +847,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
 	size_t count = iov_iter_count(from);
 	loff_t pos = iocb->ki_pos;
 	struct inode *inode = file_inode(file);
-	loff_t i_size = inode->i_size;
+	loff_t i_size = i_size_read(inode);
 	struct address_space *mapping = inode->i_mapping;
 	struct ntfs_inode *ni = ntfs_i(inode);
 	u64 valid = ni->i_valid;
@@ -1028,6 +1054,8 @@ out:
 	iocb->ki_pos += written;
 	if (iocb->ki_pos > ni->i_valid)
 		ni->i_valid = iocb->ki_pos;
+	if (iocb->ki_pos > i_size)
+		i_size_write(inode, iocb->ki_pos);
 
 	return written;
 }
@@ -1041,8 +1069,12 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	ssize_t ret;
+	int err;
 	struct ntfs_inode *ni = ntfs_i(inode);
 
+	if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
 	if (is_encrypted(ni)) {
 		ntfs_inode_warn(inode, "encrypted i/o not supported");
 		return -EOPNOTSUPP;
@@ -1068,6 +1100,12 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret <= 0)
 		goto out;
 
+	err = file_modified(iocb->ki_filp);
+	if (err) {
+		ret = err;
+		goto out;
+	}
+
 	if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
 		/* Should never be here, see ntfs_file_open(). */
 		ret = -EOPNOTSUPP;
@@ -1097,6 +1135,9 @@ int ntfs_file_open(struct inode *inode, struct file *file)
 {
 	struct ntfs_inode *ni = ntfs_i(inode);
 
+	if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
 	if (unlikely((is_compressed(ni) || is_encrypted(ni)) &&
 		     (file->f_flags & O_DIRECT))) {
 		return -EOPNOTSUPP;
@@ -1138,7 +1179,8 @@ static int ntfs_file_release(struct inode *inode, struct file *file)
 		down_write(&ni->file.run_lock);
 
 		err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run,
-				    inode->i_size, &ni->i_valid, false, NULL);
+				    i_size_read(inode), &ni->i_valid, false,
+				    NULL);
 
 		up_write(&ni->file.run_lock);
 		ni_unlock(ni);
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 3df2d9e34b91..7f27382e0ce2 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -778,7 +778,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
 	run_deallocate(sbi, &ni->attr_list.run, true);
 	run_close(&ni->attr_list.run);
 	ni->attr_list.size = 0;
-	kfree(ni->attr_list.le);
+	kvfree(ni->attr_list.le);
 	ni->attr_list.le = NULL;
 	ni->attr_list.dirty = false;
 
@@ -927,7 +927,7 @@ int ni_create_attr_list(struct ntfs_inode *ni)
 	return 0;
 
 out:
-	kfree(ni->attr_list.le);
+	kvfree(ni->attr_list.le);
 	ni->attr_list.le = NULL;
 	ni->attr_list.size = 0;
 	return err;
@@ -2099,7 +2099,7 @@ int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page)
 	gfp_t gfp_mask;
 	struct page *pg;
 
-	if (vbo >= ni->vfs_inode.i_size) {
+	if (vbo >= i_size_read(&ni->vfs_inode)) {
 		SetPageUptodate(page);
 		err = 0;
 		goto out;
@@ -2173,7 +2173,7 @@ int ni_decompress_file(struct ntfs_inode *ni)
 {
 	struct ntfs_sb_info *sbi = ni->mi.sbi;
 	struct inode *inode = &ni->vfs_inode;
-	loff_t i_size = inode->i_size;
+	loff_t i_size = i_size_read(inode);
 	struct address_space *mapping = inode->i_mapping;
 	gfp_t gfp_mask = mapping_gfp_mask(mapping);
 	struct page **pages = NULL;
@@ -2508,6 +2508,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
 		err = -EOPNOTSUPP;
 		goto out1;
 #else
+		loff_t i_size = i_size_read(&ni->vfs_inode);
 		u32 frame_bits = ni_ext_compress_bits(ni);
 		u64 frame64 = frame_vbo >> frame_bits;
 		u64 frames, vbo_data;
@@ -2548,7 +2549,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
 			}
 		}
 
-		frames = (ni->vfs_inode.i_size - 1) >> frame_bits;
+		frames = (i_size - 1) >> frame_bits;
 
 		err = attr_wof_frame_info(ni, attr, run, frame64, frames,
 					  frame_bits, &ondisk_size, &vbo_data);
@@ -2556,8 +2557,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
 			goto out2;
 
 		if (frame64 == frames) {
-			unc_size = 1 + ((ni->vfs_inode.i_size - 1) &
-					(frame_size - 1));
+			unc_size = 1 + ((i_size - 1) & (frame_size - 1));
 			ondisk_size = attr_size(attr) - vbo_data;
 		} else {
 			unc_size = frame_size;
@@ -3259,6 +3259,9 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
 	if (is_bad_inode(inode) || sb_rdonly(sb))
 		return 0;
 
+	if (unlikely(ntfs3_forced_shutdown(sb)))
+		return -EIO;
+
 	if (!ni_trylock(ni)) {
 		/* 'ni' is under modification, skip for now. */
 		mark_inode_dirty_sync(inode);
@@ -3288,7 +3291,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
 			modified = true;
 		}
 
-		ts = inode_get_mtime(inode);
+		ts = inode_get_ctime(inode);
 		dup.c_time = kernel2nt(&ts);
 		if (std->c_time != dup.c_time) {
 			std->c_time = dup.c_time;
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index 98ccb6650858..855519713bf7 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -465,7 +465,7 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr)
 {
 	const struct RESTART_AREA *ra;
 	u16 cl, fl, ul;
-	u32 off, l_size, file_dat_bits, file_size_round;
+	u32 off, l_size, seq_bits;
 	u16 ro = le16_to_cpu(rhdr->ra_off);
 	u32 sys_page = le32_to_cpu(rhdr->sys_page_size);
 
@@ -511,13 +511,15 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr)
 	/* Make sure the sequence number bits match the log file size. */
 	l_size = le64_to_cpu(ra->l_size);
 
-	file_dat_bits = sizeof(u64) * 8 - le32_to_cpu(ra->seq_num_bits);
-	file_size_round = 1u << (file_dat_bits + 3);
-	if (file_size_round != l_size &&
-	    (file_size_round < l_size || (file_size_round / 2) > l_size)) {
-		return false;
+	seq_bits = sizeof(u64) * 8 + 3;
+	while (l_size) {
+		l_size >>= 1;
+		seq_bits -= 1;
 	}
 
+	if (seq_bits != le32_to_cpu(ra->seq_num_bits)) /* on-disk value is little-endian */
+		return false;
+
 	/* The log page data offset and record header length must be quad-aligned. */
 	if (!IS_ALIGNED(le16_to_cpu(ra->data_off), 8) ||
 	    !IS_ALIGNED(le16_to_cpu(ra->rec_hdr_len), 8))
@@ -974,6 +976,16 @@ skip_looking:
 	return e;
 }
 
+struct restart_info {
+	u64 last_lsn;
+	struct RESTART_HDR *r_page;
+	u32 vbo;
+	bool chkdsk_was_run;
+	bool valid_page;
+	bool initialized;
+	bool restart;
+};
+
 #define RESTART_SINGLE_PAGE_IO cpu_to_le16(0x0001)
 
 #define NTFSLOG_WRAPPED 0x00000001
@@ -987,6 +999,7 @@ struct ntfs_log {
 	struct ntfs_inode *ni;
 
 	u32 l_size;
+	u32 orig_file_size;
 	u32 sys_page_size;
 	u32 sys_page_mask;
 	u32 page_size;
@@ -1040,6 +1053,8 @@ struct ntfs_log {
 
 	struct CLIENT_ID client_id;
 	u32 client_undo_commit;
+
+	struct restart_info rst_info, rst_info2;
 };
 
 static inline u32 lsn_to_vbo(struct ntfs_log *log, const u64 lsn)
@@ -1105,16 +1120,6 @@ static inline bool verify_client_lsn(struct ntfs_log *log,
 	       lsn <= le64_to_cpu(log->ra->current_lsn) && lsn;
 }
 
-struct restart_info {
-	u64 last_lsn;
-	struct RESTART_HDR *r_page;
-	u32 vbo;
-	bool chkdsk_was_run;
-	bool valid_page;
-	bool initialized;
-	bool restart;
-};
-
 static int read_log_page(struct ntfs_log *log, u32 vbo,
 			 struct RECORD_PAGE_HDR **buffer, bool *usa_error)
 {
@@ -1176,7 +1181,7 @@ out:
  * restart page header. It will stop the first time we find a
  * valid page header.
  */
-static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
+static int log_read_rst(struct ntfs_log *log, bool first,
 			struct restart_info *info)
 {
 	u32 skip, vbo;
@@ -1192,7 +1197,7 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
 	}
 
 	/* Loop continuously until we succeed. */
-	for (; vbo < l_size; vbo = 2 * vbo + skip, skip = 0) {
+	for (; vbo < log->l_size; vbo = 2 * vbo + skip, skip = 0) {
 		bool usa_error;
 		bool brst, bchk;
 		struct RESTART_AREA *ra;
@@ -1285,22 +1290,17 @@ check_result:
 /*
  * Ilog_init_pg_hdr - Init @log from restart page header.
  */
-static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size,
-			    u32 page_size, u16 major_ver, u16 minor_ver)
+static void log_init_pg_hdr(struct ntfs_log *log, u16 major_ver, u16 minor_ver)
 {
-	log->sys_page_size = sys_page_size;
-	log->sys_page_mask = sys_page_size - 1;
-	log->page_size = page_size;
-	log->page_mask = page_size - 1;
-	log->page_bits = blksize_bits(page_size);
+	log->sys_page_size = log->page_size;
+	log->sys_page_mask = log->page_mask;
 
 	log->clst_per_page = log->page_size >> log->ni->mi.sbi->cluster_bits;
 	if (!log->clst_per_page)
 		log->clst_per_page = 1;
 
-	log->first_page = major_ver >= 2 ?
-				  0x22 * page_size :
-				  ((sys_page_size << 1) + (page_size << 1));
+	log->first_page = major_ver >= 2 ? 0x22 * log->page_size :
+					   4 * log->page_size;
 	log->major_ver = major_ver;
 	log->minor_ver = minor_ver;
 }
@@ -1308,12 +1308,11 @@ static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size,
 /*
  * log_create - Init @log in cases when we don't have a restart area to use.
  */
-static void log_create(struct ntfs_log *log, u32 l_size, const u64 last_lsn,
+static void log_create(struct ntfs_log *log, const u64 last_lsn,
 		       u32 open_log_count, bool wrapped, bool use_multi_page)
 {
-	log->l_size = l_size;
 	/* All file offsets must be quadword aligned. */
-	log->file_data_bits = blksize_bits(l_size) - 3;
+	log->file_data_bits = blksize_bits(log->l_size) - 3;
 	log->seq_num_mask = (8 << log->file_data_bits) - 1;
 	log->seq_num_bits = sizeof(u64) * 8 - log->file_data_bits;
 	log->seq_num = (last_lsn >> log->file_data_bits) + 2;
@@ -3720,10 +3719,8 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
 	struct ntfs_sb_info *sbi = ni->mi.sbi;
 	struct ntfs_log *log;
 
-	struct restart_info rst_info, rst_info2;
-	u64 rec_lsn, ra_lsn, checkpt_lsn = 0, rlsn = 0;
+	u64 rec_lsn, checkpt_lsn = 0, rlsn = 0;
 	struct ATTR_NAME_ENTRY *attr_names = NULL;
-	struct ATTR_NAME_ENTRY *ane;
 	struct RESTART_TABLE *dptbl = NULL;
 	struct RESTART_TABLE *trtbl = NULL;
 	const struct RESTART_TABLE *rt;
@@ -3741,9 +3738,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
 	struct TRANSACTION_ENTRY *tr;
 	struct DIR_PAGE_ENTRY *dp;
 	u32 i, bytes_per_attr_entry;
-	u32 l_size = ni->vfs_inode.i_size;
-	u32 orig_file_size = l_size;
-	u32 page_size, vbo, tail, off, dlen;
+	u32 vbo, tail, off, dlen;
 	u32 saved_len, rec_len, transact_id;
 	bool use_second_page;
 	struct RESTART_AREA *ra2, *ra = NULL;
@@ -3758,52 +3753,50 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
 	u16 t16;
 	u32 t32;
 
-	/* Get the size of page. NOTE: To replay we can use default page. */
-#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2
-	page_size = norm_file_page(PAGE_SIZE, &l_size, true);
-#else
-	page_size = norm_file_page(PAGE_SIZE, &l_size, false);
-#endif
-	if (!page_size)
-		return -EINVAL;
-
 	log = kzalloc(sizeof(struct ntfs_log), GFP_NOFS);
 	if (!log)
 		return -ENOMEM;
 
 	log->ni = ni;
-	log->l_size = l_size;
-	log->one_page_buf = kmalloc(page_size, GFP_NOFS);
+	log->l_size = log->orig_file_size = ni->vfs_inode.i_size;
 
+	/* Get the size of page. NOTE: To replay we can use default page. */
+#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2
+	log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, true);
+#else
+	log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, false);
+#endif
+	if (!log->page_size) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	log->one_page_buf = kmalloc(log->page_size, GFP_NOFS);
 	if (!log->one_page_buf) {
 		err = -ENOMEM;
 		goto out;
 	}
 
-	log->page_size = page_size;
-	log->page_mask = page_size - 1;
-	log->page_bits = blksize_bits(page_size);
+	log->page_mask = log->page_size - 1;
+	log->page_bits = blksize_bits(log->page_size);
 
 	/* Look for a restart area on the disk. */
-	memset(&rst_info, 0, sizeof(struct restart_info));
-	err = log_read_rst(log, l_size, true, &rst_info);
+	err = log_read_rst(log, true, &log->rst_info);
 	if (err)
 		goto out;
 
 	/* remember 'initialized' */
-	*initialized = rst_info.initialized;
+	*initialized = log->rst_info.initialized;
 
-	if (!rst_info.restart) {
-		if (rst_info.initialized) {
+	if (!log->rst_info.restart) {
+		if (log->rst_info.initialized) {
 			/* No restart area but the file is not initialized. */
 			err = -EINVAL;
 			goto out;
 		}
 
-		log_init_pg_hdr(log, page_size, page_size, 1, 1);
-		log_create(log, l_size, 0, get_random_u32(), false, false);
-
-		log->ra = ra;
+		log_init_pg_hdr(log, 1, 1);
+		log_create(log, 0, get_random_u32(), false, false);
 
 		ra = log_create_ra(log);
 		if (!ra) {
@@ -3820,25 +3813,26 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
 	 * If the restart offset above wasn't zero then we won't
 	 * look for a second restart.
 	 */
-	if (rst_info.vbo)
+	if (log->rst_info.vbo)
 		goto check_restart_area;
 
-	memset(&rst_info2, 0, sizeof(struct restart_info));
-	err = log_read_rst(log, l_size, false, &rst_info2);
+	err = log_read_rst(log, false, &log->rst_info2);
 	if (err)
 		goto out;
 
 	/* Determine which restart area to use. */
-	if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn)
+	if (!log->rst_info2.restart ||
+	    log->rst_info2.last_lsn <= log->rst_info.last_lsn)
 		goto use_first_page;
 
 	use_second_page = true;
 
-	if (rst_info.chkdsk_was_run && page_size != rst_info.vbo) {
+	if (log->rst_info.chkdsk_was_run &&
+	    log->page_size != log->rst_info.vbo) {
 		struct RECORD_PAGE_HDR *sp = NULL;
 		bool usa_error;
 
-		if (!read_log_page(log, page_size, &sp, &usa_error) &&
+		if (!read_log_page(log, log->page_size, &sp, &usa_error) &&
 		    sp->rhdr.sign == NTFS_CHKD_SIGNATURE) {
 			use_second_page = false;
 		}
@@ -3846,52 +3840,43 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
 	}
 
 	if (use_second_page) {
-		kfree(rst_info.r_page);
-		memcpy(&rst_info, &rst_info2, sizeof(struct restart_info));
-		rst_info2.r_page = NULL;
+		kfree(log->rst_info.r_page);
+		memcpy(&log->rst_info, &log->rst_info2,
+		       sizeof(struct restart_info));
+		log->rst_info2.r_page = NULL;
 	}
 
 use_first_page:
-	kfree(rst_info2.r_page);
+	kfree(log->rst_info2.r_page);
 
 check_restart_area:
 	/*
 	 * If the restart area is at offset 0, we want
 	 * to write the second restart area first.
 	 */
-	log->init_ra = !!rst_info.vbo;
+	log->init_ra = !!log->rst_info.vbo;
 
 	/* If we have a valid page then grab a pointer to the restart area. */
-	ra2 = rst_info.valid_page ?
-		      Add2Ptr(rst_info.r_page,
-			      le16_to_cpu(rst_info.r_page->ra_off)) :
+	ra2 = log->rst_info.valid_page ?
+		      Add2Ptr(log->rst_info.r_page,
+			      le16_to_cpu(log->rst_info.r_page->ra_off)) :
 		      NULL;
 
-	if (rst_info.chkdsk_was_run ||
+	if (log->rst_info.chkdsk_was_run ||
 	    (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) {
 		bool wrapped = false;
 		bool use_multi_page = false;
 		u32 open_log_count;
 
 		/* Do some checks based on whether we have a valid log page. */
-		if (!rst_info.valid_page) {
-			open_log_count = get_random_u32();
-			goto init_log_instance;
-		}
-		open_log_count = le32_to_cpu(ra2->open_log_count);
-
-		/*
-		 * If the restart page size isn't changing then we want to
-		 * check how much work we need to do.
-		 */
-		if (page_size != le32_to_cpu(rst_info.r_page->sys_page_size))
-			goto init_log_instance;
+		open_log_count = log->rst_info.valid_page ?
+					 le32_to_cpu(ra2->open_log_count) :
+					 get_random_u32();
 
-init_log_instance:
-		log_init_pg_hdr(log, page_size, page_size, 1, 1);
+		log_init_pg_hdr(log, 1, 1);
 
-		log_create(log, l_size, rst_info.last_lsn, open_log_count,
-			   wrapped, use_multi_page);
+		log_create(log, log->rst_info.last_lsn, open_log_count, wrapped,
+			   use_multi_page);
 
 		ra = log_create_ra(log);
 		if (!ra) {
@@ -3916,28 +3901,27 @@ init_log_instance:
 	 * use the log file. We must use the system page size instead of the
 	 * default size if there is not a clean shutdown.
 	 */
-	t32 = le32_to_cpu(rst_info.r_page->sys_page_size);
-	if (page_size != t32) {
-		l_size = orig_file_size;
-		page_size =
-			norm_file_page(t32, &l_size, t32 == DefaultLogPageSize);
+	t32 = le32_to_cpu(log->rst_info.r_page->sys_page_size);
+	if (log->page_size != t32) {
+		log->l_size = log->orig_file_size;
+		log->page_size = norm_file_page(t32, &log->l_size,
+						t32 == DefaultLogPageSize);
 	}
 
-	if (page_size != t32 ||
-	    page_size != le32_to_cpu(rst_info.r_page->page_size)) {
+	if (log->page_size != t32 ||
+	    log->page_size != le32_to_cpu(log->rst_info.r_page->page_size)) {
 		err = -EINVAL;
 		goto out;
 	}
 
 	/* If the file size has shrunk then we won't mount it. */
-	if (l_size < le64_to_cpu(ra2->l_size)) {
+	if (log->l_size < le64_to_cpu(ra2->l_size)) {
 		err = -EINVAL;
 		goto out;
 	}
 
-	log_init_pg_hdr(log, page_size, page_size,
-			le16_to_cpu(rst_info.r_page->major_ver),
-			le16_to_cpu(rst_info.r_page->minor_ver));
+	log_init_pg_hdr(log, le16_to_cpu(log->rst_info.r_page->major_ver),
+			le16_to_cpu(log->rst_info.r_page->minor_ver));
 
 	log->l_size = le64_to_cpu(ra2->l_size);
 	log->seq_num_bits = le32_to_cpu(ra2->seq_num_bits);
@@ -3945,7 +3929,7 @@ init_log_instance:
 	log->seq_num_mask = (8 << log->file_data_bits) - 1;
 	log->last_lsn = le64_to_cpu(ra2->current_lsn);
 	log->seq_num = log->last_lsn >> log->file_data_bits;
-	log->ra_off = le16_to_cpu(rst_info.r_page->ra_off);
+	log->ra_off = le16_to_cpu(log->rst_info.r_page->ra_off);
 	log->restart_size = log->sys_page_size - log->ra_off;
 	log->record_header_len = le16_to_cpu(ra2->rec_hdr_len);
 	log->ra_size = le16_to_cpu(ra2->ra_len);
@@ -4045,7 +4029,7 @@ find_oldest:
 	log->current_avail = current_log_avail(log);
 
 	/* Remember which restart area to write first. */
-	log->init_ra = rst_info.vbo;
+	log->init_ra = log->rst_info.vbo;
 
 process_log:
 	/* 1.0, 1.1, 2.0 log->major_ver/minor_ver - short values. */
@@ -4105,7 +4089,7 @@ process_log:
 	log->client_id.seq_num = cr->seq_num;
 	log->client_id.client_idx = client;
 
-	err = read_rst_area(log, &rst, &ra_lsn);
+	err = read_rst_area(log, &rst, &checkpt_lsn);
 	if (err)
 		goto out;
 
@@ -4114,9 +4098,8 @@ process_log:
 
 	bytes_per_attr_entry = !rst->major_ver ? 0x2C : 0x28;
 
-	checkpt_lsn = le64_to_cpu(rst->check_point_start);
-	if (!checkpt_lsn)
-		checkpt_lsn = ra_lsn;
+	if (rst->check_point_start)
+		checkpt_lsn = le64_to_cpu(rst->check_point_start);
 
 	/* Allocate and Read the Transaction Table. */
 	if (!rst->transact_table_len)
@@ -4330,23 +4313,20 @@ check_attr_table:
 	lcb = NULL;
 
 check_attribute_names2:
-	if (!rst->attr_names_len)
-		goto trace_attribute_table;
-
-	ane = attr_names;
-	if (!oatbl)
-		goto trace_attribute_table;
-	while (ane->off) {
-		/* TODO: Clear table on exit! */
-		oe = Add2Ptr(oatbl, le16_to_cpu(ane->off));
-		t16 = le16_to_cpu(ane->name_bytes);
-		oe->name_len = t16 / sizeof(short);
-		oe->ptr = ane->name;
-		oe->is_attr_name = 2;
-		ane = Add2Ptr(ane, sizeof(struct ATTR_NAME_ENTRY) + t16);
-	}
-
-trace_attribute_table:
+	if (rst->attr_names_len && oatbl) {
+		struct ATTR_NAME_ENTRY *ane = attr_names;
+		while (ane->off) {
+			/* TODO: Clear table on exit! */
+			oe = Add2Ptr(oatbl, le16_to_cpu(ane->off));
+			t16 = le16_to_cpu(ane->name_bytes);
+			oe->name_len = t16 / sizeof(short);
+			oe->ptr = ane->name;
+			oe->is_attr_name = 2;
+			ane = Add2Ptr(ane,
+				      sizeof(struct ATTR_NAME_ENTRY) + t16);
+		}
+	}
+
 	/*
 	 * If the checkpt_lsn is zero, then this is a freshly
 	 * formatted disk and we have no work to do.
@@ -5189,7 +5169,7 @@ out:
 	kfree(oatbl);
 	kfree(dptbl);
 	kfree(attr_names);
-	kfree(rst_info.r_page);
+	kfree(log->rst_info.r_page);
 
 	kfree(ra);
 	kfree(log->one_page_buf);
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index fbfe21dbb425..ae2ef5c11868 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -853,7 +853,8 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
 	/*
 	 * sb can be NULL here. In this case sbi->flags should be 0 too.
 	 */
-	if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR))
+	if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR) ||
+	    unlikely(ntfs3_forced_shutdown(sb)))
 		return;
 
 	blocksize = sb->s_blocksize;
@@ -1006,6 +1007,30 @@ static inline __le32 security_hash(const void *sd, size_t bytes)
 	return cpu_to_le32(hash);
 }
 
+/*
+ * Wrapper for sb_bread_unmovable(): rejects out-of-volume blocks, logs errors.
+ */
+struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block)
+{
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	struct buffer_head *bh;
+
+	if (unlikely(block >= sbi->volume.blocks)) {
+		/* Avoid the generic "attempt to access beyond end of device" message. */
+		ntfs_err(sb, "try to read out of volume at offset 0x%llx",
+			 (u64)block << sb->s_blocksize_bits);
+		return NULL;
+	}
+
+	bh = sb_bread_unmovable(sb, block);
+	if (bh)
+		return bh;
+
+	ntfs_err(sb, "failed to read volume at offset 0x%llx",
+		 (u64)block << sb->s_blocksize_bits);
+	return NULL;
+}
+
 int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer)
 {
 	struct block_device *bdev = sb->s_bdev;
@@ -2128,8 +2153,8 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi,
 			if (le32_to_cpu(d_security->size) == new_sec_size &&
 			    d_security->key.hash == hash_key.hash &&
 			    !memcmp(d_security + 1, sd, size_sd)) {
-				*security_id = d_security->key.sec_id;
 				/* Such security already exists. */
+				*security_id = d_security->key.sec_id;
 				err = 0;
 				goto out;
 			}
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index cf92b2433f7a..daabaad63aaf 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -1462,7 +1462,7 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
 		goto out2;
 
 	if (in->name == I30_NAME) {
-		ni->vfs_inode.i_size = data_size;
+		i_size_write(&ni->vfs_inode, data_size);
 		inode_set_bytes(&ni->vfs_inode, alloc_size);
 	}
 
@@ -1544,7 +1544,7 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
 	}
 
 	if (in->name == I30_NAME)
-		ni->vfs_inode.i_size = data_size;
+		i_size_write(&ni->vfs_inode, data_size);
 
 	*vbn = bit << indx->idx2vbn_bits;
 
@@ -2090,7 +2090,7 @@ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni,
 		return err;
 
 	if (in->name == I30_NAME)
-		ni->vfs_inode.i_size = new_data;
+		i_size_write(&ni->vfs_inode, new_data);
 
 	bpb = bitmap_size(bit);
 	if (bpb * 8 == nbits)
@@ -2576,7 +2576,7 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
 		err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len,
 				    &indx->alloc_run, 0, NULL, false, NULL);
 		if (in->name == I30_NAME)
-			ni->vfs_inode.i_size = 0;
+			i_size_write(&ni->vfs_inode, 0);
 
 		err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len,
 				     false, NULL);
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 5e3d71374918..eb7a8c9fba01 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -345,9 +345,7 @@ next_attr:
 			inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer
 							    .PrintNameLength) /
 					sizeof(u16);
-
 			ni->i_valid = inode->i_size;
-
 			/* Clear directory bit. */
 			if (ni->ni_flags & NI_FLAG_DIR) {
 				indx_clear(&ni->dir);
@@ -412,7 +410,6 @@ end_enum:
 		goto out;
 
 	if (!is_match && name) {
-		/* Reuse rec as buffer for ascii name. */
 		err = -ENOENT;
 		goto out;
 	}
@@ -427,6 +424,7 @@ end_enum:
 
 	if (names != le16_to_cpu(rec->hard_links)) {
 		/* Correct minor error on the fly. Do not mark inode as dirty. */
+		ntfs_inode_warn(inode, "Correct links count -> %u.", names);
 		rec->hard_links = cpu_to_le16(names);
 		ni->mi.dirty = true;
 	}
@@ -653,9 +651,10 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
 			off = vbo & (PAGE_SIZE - 1);
 			folio_set_bh(bh, folio, off);
 
-			err = bh_read(bh, 0);
-			if (err < 0)
+			if (bh_read(bh, 0) < 0) {
+				err = -EIO;
 				goto out;
+			}
 			folio_zero_segment(folio, off + voff, off + block_size);
 		}
 	}
@@ -853,9 +852,13 @@ static int ntfs_resident_writepage(struct folio *folio,
 				   struct writeback_control *wbc, void *data)
 {
 	struct address_space *mapping = data;
-	struct ntfs_inode *ni = ntfs_i(mapping->host);
+	struct inode *inode = mapping->host;
+	struct ntfs_inode *ni = ntfs_i(inode);
 	int ret;
 
+	if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
 	ni_lock(ni);
 	ret = attr_data_write_resident(ni, &folio->page);
 	ni_unlock(ni);
@@ -869,7 +872,12 @@ static int ntfs_resident_writepage(struct folio *folio,
 static int ntfs_writepages(struct address_space *mapping,
 			   struct writeback_control *wbc)
 {
-	if (is_resident(ntfs_i(mapping->host)))
+	struct inode *inode = mapping->host;
+
+	if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
+	if (is_resident(ntfs_i(inode)))
 		return write_cache_pages(mapping, wbc, ntfs_resident_writepage,
 					 mapping);
 	return mpage_writepages(mapping, wbc, ntfs_get_block);
@@ -889,6 +897,9 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	struct ntfs_inode *ni = ntfs_i(inode);
 
+	if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
 	*pagep = NULL;
 	if (is_resident(ni)) {
 		struct page *page =
@@ -974,7 +985,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
 		}
 
 		if (pos + err > inode->i_size) {
-			inode->i_size = pos + err;
+			i_size_write(inode, pos + err);
 			dirty = true;
 		}
 
@@ -1306,6 +1317,11 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
 		goto out1;
 	}
 
+	if (unlikely(ntfs3_forced_shutdown(sb))) {
+		err = -EIO;
+		goto out2;
+	}
+
 	/* Mark rw ntfs as dirty. it will be cleared at umount. */
 	ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
 
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index ee3093be5170..084d19d78397 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -181,6 +181,9 @@ static int ntfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct ntfs_inode *ni = ntfs_i(dir);
 	int err;
 
+	if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
+		return -EIO;
+
 	ni_lock_dir(ni);
 
 	err = ntfs_unlink_inode(dir, dentry);
@@ -199,6 +202,9 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	u32 size = strlen(symname);
 	struct inode *inode;
 
+	if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
+		return -EIO;
+
 	inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0,
 				  symname, size, NULL);
 
@@ -227,6 +233,9 @@ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct ntfs_inode *ni = ntfs_i(dir);
 	int err;
 
+	if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
+		return -EIO;
+
 	ni_lock_dir(ni);
 
 	err = ntfs_unlink_inode(dir, dentry);
@@ -264,6 +273,9 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir,
 		      1024);
 	static_assert(PATH_MAX >= 4 * 1024);
 
+	if (unlikely(ntfs3_forced_shutdown(sb)))
+		return -EIO;
+
 	if (flags & ~RENAME_NOREPLACE)
 		return -EINVAL;
 
@@ -419,7 +431,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	 * fnd contains tree's path to insert to.
 	 * If fnd is not NULL then dir is locked.
 	 */
-	inode = ntfs_create_inode(mnt_idmap(file->f_path.mnt), dir, dentry, uni,
+	inode = ntfs_create_inode(file_mnt_idmap(file), dir, dentry, uni,
 				  mode, 0, NULL, 0, fnd);
 	err = IS_ERR(inode) ? PTR_ERR(inode) :
 			      finish_open(file, dentry, ntfs_file_open);
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 86aecbb01a92..9c7478150a03 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -523,12 +523,10 @@ struct ATTR_LIST_ENTRY {
 	__le64 vcn;		// 0x08: Starting VCN of this attribute.
 	struct MFT_REF ref;	// 0x10: MFT record number with attribute.
 	__le16 id;		// 0x18: struct ATTRIB ID.
-	__le16 name[3];		// 0x1A: Just to align. To get real name can use bNameOffset.
+	__le16 name[];		// 0x1A: To get real name use name_off.
 
 }; // sizeof(0x20)
 
-static_assert(sizeof(struct ATTR_LIST_ENTRY) == 0x20);
-
 static inline u32 le_size(u8 name_len)
 {
 	return ALIGN(offsetof(struct ATTR_LIST_ENTRY, name) +
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index f6706143d14b..79356fd29a14 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -61,6 +61,8 @@ enum utf16_endian;
 
 /* sbi->flags */
 #define NTFS_FLAGS_NODISCARD		0x00000001
+/* ntfs in shutdown state. */
+#define NTFS_FLAGS_SHUTDOWN_BIT		0x00000002  /* Bit number for test_bit/set_bit (mask 0x4). */
 /* Set when LogFile is replaying. */
 #define NTFS_FLAGS_LOG_REPLAYING	0x00000008
 /* Set when we changed first MFT's which copy must be updated in $MftMirr. */
@@ -226,7 +228,7 @@ struct ntfs_sb_info {
 	u64 maxbytes; // Maximum size for normal files.
 	u64 maxbytes_sparse; // Maximum size for sparse file.
 
-	u32 flags; // See NTFS_FLAGS_XXX.
+	unsigned long flags; // See NTFS_FLAGS_XXX.
 
 	CLST zone_max; // Maximum MFT zone length in clusters
 	CLST bad_clusters; // The count of marked bad clusters.
@@ -473,7 +475,7 @@ bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn,
 int al_update(struct ntfs_inode *ni, int sync);
 static inline size_t al_aligned(size_t size)
 {
-	return (size + 1023) & ~(size_t)1023;
+	return size_add(size, 1023) & ~(size_t)1023;
 }
 
 /* Globals from bitfunc.c */
@@ -500,6 +502,8 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 int ntfs_file_open(struct inode *inode, struct file *file);
 int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len);
+long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg);
+long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg);
 extern const struct inode_operations ntfs_special_inode_operations;
 extern const struct inode_operations ntfs_file_inode_operations;
 extern const struct file_operations ntfs_file_operations;
@@ -584,6 +588,7 @@ bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes);
 int log_replay(struct ntfs_inode *ni, bool *initialized);
 
 /* Globals from fsntfs.c */
+struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block);
 bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes);
 int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes,
 		       bool simple);
@@ -872,7 +877,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode,
 
 int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry);
 ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern const struct xattr_handler * const ntfs_xattr_handlers[];
+extern const struct xattr_handler *const ntfs_xattr_handlers[];
 
 int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size);
 void ntfs_get_wsl_perm(struct inode *inode);
@@ -999,6 +1004,11 @@ static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
+static inline int ntfs3_forced_shutdown(struct super_block *sb)
+{
+	return test_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags);
+}
+
 /*
  * ntfs_up_cluster - Align up on cluster boundary.
  */
@@ -1025,19 +1035,6 @@ static inline u64 bytes_to_block(const struct super_block *sb, u64 size)
 	return (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
 }
 
-static inline struct buffer_head *ntfs_bread(struct super_block *sb,
-					     sector_t block)
-{
-	struct buffer_head *bh = sb_bread(sb, block);
-
-	if (bh)
-		return bh;
-
-	ntfs_err(sb, "failed to read volume at offset 0x%llx",
-		 (u64)block << sb->s_blocksize_bits);
-	return NULL;
-}
-
 static inline struct ntfs_inode *ntfs_i(struct inode *inode)
 {
 	return container_of(inode, struct ntfs_inode, vfs_inode);
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index 53629b1f65e9..6aa3a9d44df1 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -279,7 +279,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
 		if (t16 > asize)
 			return NULL;
 
-		if (t16 + le32_to_cpu(attr->res.data_size) > asize)
+		if (le32_to_cpu(attr->res.data_size) > asize - t16)
 			return NULL;
 
 		t32 = sizeof(short) * attr->name_len;
@@ -535,8 +535,20 @@ bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi,
 		return false;
 
 	if (ni && is_attr_indexed(attr)) {
-		le16_add_cpu(&ni->mi.mrec->hard_links, -1);
-		ni->mi.dirty = true;
+		u16 links = le16_to_cpu(ni->mi.mrec->hard_links);
+		struct ATTR_FILE_NAME *fname =
+			attr->type != ATTR_NAME ?
+				NULL :
+				resident_data_ex(attr,
+						 SIZEOF_ATTRIBUTE_FILENAME);
+		if (fname && fname->type == FILE_NAME_DOS) {
+			/* Do not decrease links count deleting DOS name. */
+		} else if (!links) {
+			/* Links count is already 0: minor error, not critical. */
+		} else {
+			ni->mi.mrec->hard_links = cpu_to_le16(links - 1);
+			ni->mi.dirty = true;
+		}
 	}
 
 	used -= asize;
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 9153dffde950..cef5467fd928 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -122,13 +122,12 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...)
 
 	if (name) {
 		struct dentry *de = d_find_alias(inode);
-		const u32 name_len = ARRAY_SIZE(s_name_buf) - 1;
 
 		if (de) {
 			spin_lock(&de->d_lock);
-			snprintf(name, name_len, " \"%s\"", de->d_name.name);
+			snprintf(name, sizeof(s_name_buf), " \"%s\"",
+				 de->d_name.name);
 			spin_unlock(&de->d_lock);
-			name[name_len] = 0; /* To be sure. */
 		} else {
 			name[0] = 0;
 		}
@@ -625,7 +624,7 @@ static void ntfs3_free_sbi(struct ntfs_sb_info *sbi)
 {
 	kfree(sbi->new_rec);
 	kvfree(ntfs_put_shared(sbi->upcase));
-	kfree(sbi->def_table);
+	kvfree(sbi->def_table);
 	kfree(sbi->compress.lznt);
 #ifdef CONFIG_NTFS3_LZX_XPRESS
 	xpress_free_decompressor(sbi->compress.xpress);
@@ -715,6 +714,14 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root)
 }
 
 /*
+ * ntfs_shutdown - super_operations::shutdown
+ */
+static void ntfs_shutdown(struct super_block *sb)
+{
+	set_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags);
+}
+
+/*
  * ntfs_sync_fs - super_operations::sync_fs
  */
 static int ntfs_sync_fs(struct super_block *sb, int wait)
@@ -724,6 +731,9 @@ static int ntfs_sync_fs(struct super_block *sb, int wait)
 	struct ntfs_inode *ni;
 	struct inode *inode;
 
+	if (unlikely(ntfs3_forced_shutdown(sb)))
+		return -EIO;
+
 	ni = sbi->security.ni;
 	if (ni) {
 		inode = &ni->vfs_inode;
@@ -763,6 +773,7 @@ static const struct super_operations ntfs_sops = {
 	.put_super = ntfs_put_super,
 	.statfs = ntfs_statfs,
 	.show_options = ntfs_show_options,
+	.shutdown = ntfs_shutdown,
 	.sync_fs = ntfs_sync_fs,
 	.write_inode = ntfs3_write_inode,
 };
@@ -866,6 +877,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
 	u16 fn, ao;
 	u8 cluster_bits;
 	u32 boot_off = 0;
+	sector_t boot_block = 0;
 	const char *hint = "Primary boot";
 
 	/* Save original dev_size. Used with alternative boot. */
@@ -873,11 +885,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
 
 	sbi->volume.blocks = dev_size >> PAGE_SHIFT;
 
-	bh = ntfs_bread(sb, 0);
+read_boot:
+	bh = ntfs_bread(sb, boot_block);
 	if (!bh)
-		return -EIO;
+		return boot_block ? -EINVAL : -EIO;
 
-check_boot:
 	err = -EINVAL;
 
 	/* Corrupted image; do not read OOB */
@@ -1108,26 +1120,24 @@ check_boot:
 	}
 
 out:
-	if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) {
+	brelse(bh);
+
+	if (err == -EINVAL && !boot_block && dev_size0 > PAGE_SHIFT) {
 		u32 block_size = min_t(u32, sector_size, PAGE_SIZE);
 		u64 lbo = dev_size0 - sizeof(*boot);
 
-		/*
-	 	 * Try alternative boot (last sector)
-		 */
-		brelse(bh);
-
-		sb_set_blocksize(sb, block_size);
-		bh = ntfs_bread(sb, lbo >> blksize_bits(block_size));
-		if (!bh)
-			return -EINVAL;
-
+		boot_block = lbo >> blksize_bits(block_size);
 		boot_off = lbo & (block_size - 1);
-		hint = "Alternative boot";
-		dev_size = dev_size0; /* restore original size. */
-		goto check_boot;
+		if (boot_block && block_size >= boot_off + sizeof(*boot)) {
+			/*
+			 * Try alternative boot (last sector)
+			 */
+			sb_set_blocksize(sb, block_size);
+			hint = "Alternative boot";
+			dev_size = dev_size0; /* restore original size. */
+			goto read_boot;
+		}
 	}
-	brelse(bh);
 
 	return err;
 }
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 4274b6f31cfa..53e7d1fa036a 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -219,6 +219,9 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer,
 		if (!ea->name_len)
 			break;
 
+		if (ea->name_len > ea_size)
+			break;
+
 		if (buffer) {
 			/* Check if we can use field ea->name */
 			if (off + ea_size > size)
@@ -744,6 +747,9 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
 	int err;
 	struct ntfs_inode *ni = ntfs_i(inode);
 
+	if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
 	/* Dispatch request. */
 	if (!strcmp(name, SYSTEM_DOS_ATTRIB)) {
 		/* system.dos_attrib */
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index c4b65a6d41cc..4a0779e3ef79 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -446,7 +446,7 @@ static int __init init_openprom_fs(void)
 					    sizeof(struct op_inode_info),
 					    0,
 					    (SLAB_RECLAIM_ACCOUNT |
-					     SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+					     SLAB_ACCOUNT),
 					    op_inode_init_once);
 	if (!op_inode_cachep)
 		return -ENOMEM;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index b8e25ca51016..8586e2f5d243 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -265,20 +265,18 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
 	if (IS_ERR(old_file))
 		return PTR_ERR(old_file);
 
+	/* Try to use clone_file_range to clone up within the same fs */
+	cloned = vfs_clone_file_range(old_file, 0, new_file, 0, len, 0);
+	if (cloned == len)
+		goto out_fput;
+
+	/* Couldn't clone, so now we try to copy the data */
 	error = rw_verify_area(READ, old_file, &old_pos, len);
 	if (!error)
 		error = rw_verify_area(WRITE, new_file, &new_pos, len);
 	if (error)
 		goto out_fput;
 
-	/* Try to use clone_file_range to clone up within the same fs */
-	ovl_start_write(dentry);
-	cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
-	ovl_end_write(dentry);
-	if (cloned == len)
-		goto out_fput;
-	/* Couldn't clone, so now we try to copy the data */
-
 	/* Check if lower fs supports seek operation */
 	if (old_file->f_mode & FMODE_LSEEK)
 		skip_hole = true;
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 984ffdaeed6c..5764f91d283e 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -18,10 +18,11 @@
 
 struct ovl_lookup_data {
 	struct super_block *sb;
-	struct vfsmount *mnt;
+	const struct ovl_layer *layer;
 	struct qstr name;
 	bool is_dir;
 	bool opaque;
+	bool xwhiteouts;
 	bool stop;
 	bool last;
 	char *redirect;
@@ -201,17 +202,13 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
 	return real;
 }
 
-static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path)
-{
-	return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE);
-}
-
 static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d,
 						   const char *name,
 						   struct dentry *base, int len,
 						   bool drop_negative)
 {
-	struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->mnt), name, base, len);
+	struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name,
+						 base, len);
 
 	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
 		if (drop_negative && ret->d_lockref.count == 1) {
@@ -232,10 +229,13 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
 			     size_t prelen, const char *post,
 			     struct dentry **ret, bool drop_negative)
 {
+	struct ovl_fs *ofs = OVL_FS(d->sb);
 	struct dentry *this;
 	struct path path;
 	int err;
 	bool last_element = !post[0];
+	bool is_upper = d->layer->idx == 0;
+	char val;
 
 	this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative);
 	if (IS_ERR(this)) {
@@ -253,8 +253,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
 	}
 
 	path.dentry = this;
-	path.mnt = d->mnt;
-	if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) {
+	path.mnt = d->layer->mnt;
+	if (ovl_path_is_whiteout(ofs, &path)) {
 		d->stop = d->opaque = true;
 		goto put_and_out;
 	}
@@ -272,7 +272,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
 			d->stop = true;
 			goto put_and_out;
 		}
-		err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path, NULL);
+		err = ovl_check_metacopy_xattr(ofs, &path, NULL);
 		if (err < 0)
 			goto out_err;
 
@@ -292,7 +292,12 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
 		if (d->last)
 			goto out;
 
-		if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) {
+		/* overlay.opaque=x means xwhiteouts directory */
+		val = ovl_get_opaquedir_val(ofs, &path);
+		if (last_element && !is_upper && val == 'x') {
+			d->xwhiteouts = true;
+			ovl_layer_set_xwhiteouts(ofs, d->layer);
+		} else if (val == 'y') {
 			d->stop = true;
 			if (last_element)
 				d->opaque = true;
@@ -863,7 +868,8 @@ fail:
  * Returns next layer in stack starting from top.
  * Returns -1 if this is the last layer.
  */
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path,
+		  const struct ovl_layer **layer)
 {
 	struct ovl_entry *oe = OVL_E(dentry);
 	struct ovl_path *lowerstack = ovl_lowerstack(oe);
@@ -871,13 +877,16 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
 	BUG_ON(idx < 0);
 	if (idx == 0) {
 		ovl_path_upper(dentry, path);
-		if (path->dentry)
+		if (path->dentry) {
+			*layer = &OVL_FS(dentry->d_sb)->layers[0];
 			return ovl_numlower(oe) ? 1 : -1;
+		}
 		idx++;
 	}
 	BUG_ON(idx > ovl_numlower(oe));
 	path->dentry = lowerstack[idx - 1].dentry;
-	path->mnt = lowerstack[idx - 1].layer->mnt;
+	*layer = lowerstack[idx - 1].layer;
+	path->mnt = (*layer)->mnt;
 
 	return (idx < ovl_numlower(oe)) ? idx + 1 : -1;
 }
@@ -1055,7 +1064,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 	old_cred = ovl_override_creds(dentry->d_sb);
 	upperdir = ovl_dentry_upper(dentry->d_parent);
 	if (upperdir) {
-		d.mnt = ovl_upper_mnt(ofs);
+		d.layer = &ofs->layers[0];
 		err = ovl_lookup_layer(upperdir, &d, &upperdentry, true);
 		if (err)
 			goto out;
@@ -1111,7 +1120,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 		else if (d.is_dir || !ofs->numdatalayer)
 			d.last = lower.layer->idx == ovl_numlower(roe);
 
-		d.mnt = lower.layer->mnt;
+		d.layer = lower.layer;
 		err = ovl_lookup_layer(lower.dentry, &d, &this, false);
 		if (err)
 			goto out_put;
@@ -1278,6 +1287,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 
 	if (upperopaque)
 		ovl_dentry_set_opaque(dentry);
+	if (d.xwhiteouts)
+		ovl_dentry_set_xwhiteouts(dentry);
 
 	if (upperdentry)
 		ovl_dentry_set_upper_alias(dentry);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 5ba11eb43767..ee949f3e7c77 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -50,7 +50,6 @@ enum ovl_xattr {
 	OVL_XATTR_METACOPY,
 	OVL_XATTR_PROTATTR,
 	OVL_XATTR_XWHITEOUT,
-	OVL_XATTR_XWHITEOUTS,
 };
 
 enum ovl_inode_flag {
@@ -70,6 +69,8 @@ enum ovl_entry_flag {
 	OVL_E_UPPER_ALIAS,
 	OVL_E_OPAQUE,
 	OVL_E_CONNECTED,
+	/* Lower stack may contain xwhiteout entries */
+	OVL_E_XWHITEOUTS,
 };
 
 enum {
@@ -477,6 +478,10 @@ bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry);
 bool ovl_dentry_is_opaque(struct dentry *dentry);
 bool ovl_dentry_is_whiteout(struct dentry *dentry);
 void ovl_dentry_set_opaque(struct dentry *dentry);
+bool ovl_dentry_has_xwhiteouts(struct dentry *dentry);
+void ovl_dentry_set_xwhiteouts(struct dentry *dentry);
+void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs,
+			      const struct ovl_layer *layer);
 bool ovl_dentry_has_upper_alias(struct dentry *dentry);
 void ovl_dentry_set_upper_alias(struct dentry *dentry);
 bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags);
@@ -494,11 +499,10 @@ struct file *ovl_path_open(const struct path *path, int flags);
 int ovl_copy_up_start(struct dentry *dentry, int flags);
 void ovl_copy_up_end(struct dentry *dentry);
 bool ovl_already_copied_up(struct dentry *dentry, int flags);
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
-			      enum ovl_xattr ox);
+char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path,
+			   enum ovl_xattr ox);
 bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
 bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path);
-bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path);
 bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
 			 const struct path *upperpath);
 
@@ -573,7 +577,13 @@ static inline bool ovl_is_impuredir(struct super_block *sb,
 		.mnt = ovl_upper_mnt(ofs),
 	};
 
-	return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE);
+	return ovl_get_dir_xattr_val(ofs, &upperpath, OVL_XATTR_IMPURE) == 'y';
+}
+
+static inline char ovl_get_opaquedir_val(struct ovl_fs *ofs,
+					 const struct path *path)
+{
+	return ovl_get_dir_xattr_val(ofs, path, OVL_XATTR_OPAQUE);
 }
 
 static inline bool ovl_redirect_follow(struct ovl_fs *ofs)
@@ -680,7 +690,8 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
 struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
 struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
 				struct dentry *origin, bool verify);
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path,
+		  const struct ovl_layer **layer);
 int ovl_verify_lowerdata(struct dentry *dentry);
 struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 			  unsigned int flags);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 5fa9c58af65f..cb449ab310a7 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -40,6 +40,8 @@ struct ovl_layer {
 	int idx;
 	/* One fsid per unique underlying sb (upper fsid == 0) */
 	int fsid;
+	/* xwhiteouts were found on this layer */
+	bool has_xwhiteouts;
 };
 
 struct ovl_path {
@@ -59,7 +61,7 @@ struct ovl_fs {
 	unsigned int numfs;
 	/* Number of data-only lower layers */
 	unsigned int numdatalayer;
-	const struct ovl_layer *layers;
+	struct ovl_layer *layers;
 	struct ovl_sb *fs;
 	/* workbasedir is the path at workdir= mount option */
 	struct dentry *workbasedir;
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 112b4b12f825..36dcc530ac28 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -280,12 +280,20 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
 {
 	struct ovl_fs_context *ctx = fc->fs_private;
 
-	if (ovl_dentry_weird(path->dentry))
-		return invalfc(fc, "filesystem on %s not supported", name);
-
 	if (!d_is_dir(path->dentry))
 		return invalfc(fc, "%s is not a directory", name);
 
+	/*
+	 * Root dentries of case-insensitive capable filesystems might
+	 * not have the dentry operations set, but still be incompatible
+	 * with overlayfs.  Check explicitly to prevent post-mount
+	 * failures.
+	 */
+	if (sb_has_encoding(path->mnt->mnt_sb))
+		return invalfc(fc, "case-insensitive capable filesystem on %s not supported", name);
+
+	if (ovl_dentry_weird(path->dentry))
+		return invalfc(fc, "filesystem on %s not supported", name);
 
 	/*
 	 * Check whether upper path is read-only here to report failures
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index e71156baa7bc..0ca8af060b0c 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -305,8 +305,6 @@ static inline int ovl_dir_read(const struct path *realpath,
 	if (IS_ERR(realfile))
 		return PTR_ERR(realfile);
 
-	rdd->in_xwhiteouts_dir = rdd->dentry &&
-		ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath);
 	rdd->first_maybe_whiteout = NULL;
 	rdd->ctx.pos = 0;
 	do {
@@ -359,10 +357,13 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
 		.is_lowest = false,
 	};
 	int idx, next;
+	const struct ovl_layer *layer;
 
 	for (idx = 0; idx != -1; idx = next) {
-		next = ovl_path_next(idx, dentry, &realpath);
+		next = ovl_path_next(idx, dentry, &realpath, &layer);
 		rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry;
+		rdd.in_xwhiteouts_dir = layer->has_xwhiteouts &&
+					ovl_dentry_has_xwhiteouts(dentry);
 
 		if (next != -1) {
 			err = ovl_dir_read(&realpath, &rdd);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 4ab66e3d4cff..36d4b8b1f784 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -28,41 +28,38 @@ MODULE_LICENSE("GPL");
 
 struct ovl_dir_cache;
 
-static struct dentry *ovl_d_real(struct dentry *dentry,
-				 const struct inode *inode)
+static struct dentry *ovl_d_real(struct dentry *dentry, enum d_real_type type)
 {
-	struct dentry *real = NULL, *lower;
+	struct dentry *upper, *lower;
 	int err;
 
-	/*
-	 * vfs is only expected to call d_real() with NULL from d_real_inode()
-	 * and with overlay inode from file_dentry() on an overlay file.
-	 *
-	 * TODO: remove @inode argument from d_real() API, remove code in this
-	 * function that deals with non-NULL @inode and remove d_real() call
-	 * from file_dentry().
-	 */
-	if (inode && d_inode(dentry) == inode)
-		return dentry;
-	else if (inode)
+	switch (type) {
+	case D_REAL_DATA:
+	case D_REAL_METADATA:
+		break;
+	default:
 		goto bug;
+	}
 
 	if (!d_is_reg(dentry)) {
 		/* d_real_inode() is only relevant for regular files */
 		return dentry;
 	}
 
-	real = ovl_dentry_upper(dentry);
-	if (real && (inode == d_inode(real)))
-		return real;
+	upper = ovl_dentry_upper(dentry);
+	if (upper && (type == D_REAL_METADATA ||
+		      ovl_has_upperdata(d_inode(dentry))))
+		return upper;
 
-	if (real && !inode && ovl_has_upperdata(d_inode(dentry)))
-		return real;
+	if (type == D_REAL_METADATA) {
+		lower = ovl_dentry_lower(dentry);
+		goto real_lower;
+	}
 
 	/*
-	 * Best effort lazy lookup of lowerdata for !inode case to return
+	 * Best effort lazy lookup of lowerdata for D_REAL_DATA case to return
 	 * the real lowerdata dentry.  The only current caller of d_real() with
-	 * NULL inode is d_real_inode() from trace_uprobe and this caller is
+	 * D_REAL_DATA is d_real_inode() from trace_uprobe and this caller is
 	 * likely going to be followed reading from the file, before placing
 	 * uprobes on offset within the file, so lowerdata should be available
 	 * when setting the uprobe.
@@ -73,18 +70,13 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
 	lower = ovl_dentry_lowerdata(dentry);
 	if (!lower)
 		goto bug;
-	real = lower;
 
-	/* Handle recursion */
-	real = d_real(real, inode);
+real_lower:
+	/* Handle recursion into stacked lower fs */
+	return d_real(lower, type);
 
-	if (!inode || inode == d_inode(real))
-		return real;
 bug:
-	WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n",
-	     __func__, dentry, inode ? inode->i_sb->s_id : "NULL",
-	     inode ? inode->i_ino : 0, real,
-	     real && d_inode(real) ? d_inode(real)->i_ino : 0);
+	WARN(1, "%s(%pd4, %d): real dentry not found\n", __func__, dentry, type);
 	return dentry;
 }
 
@@ -1249,6 +1241,7 @@ static struct dentry *ovl_get_root(struct super_block *sb,
 				   struct ovl_entry *oe)
 {
 	struct dentry *root;
+	struct ovl_fs *ofs = OVL_FS(sb);
 	struct ovl_path *lowerpath = ovl_lowerstack(oe);
 	unsigned long ino = d_inode(lowerpath->dentry)->i_ino;
 	int fsid = lowerpath->layer->fsid;
@@ -1270,6 +1263,20 @@ static struct dentry *ovl_get_root(struct super_block *sb,
 			ovl_set_flag(OVL_IMPURE, d_inode(root));
 	}
 
+	/* Look for xwhiteouts marker except in the lowermost layer */
+	for (int i = 0; i < ovl_numlower(oe) - 1; i++, lowerpath++) {
+		struct path path = {
+			.mnt = lowerpath->layer->mnt,
+			.dentry = lowerpath->dentry,
+		};
+
+		/* overlay.opaque=x means xwhiteouts directory */
+		if (ovl_get_opaquedir_val(ofs, &path) == 'x') {
+			ovl_layer_set_xwhiteouts(ofs, lowerpath->layer);
+			ovl_dentry_set_xwhiteouts(root);
+		}
+	}
+
 	/* Root is always merge -> can have whiteouts */
 	ovl_set_flag(OVL_WHITEOUTS, d_inode(root));
 	ovl_dentry_set_flag(OVL_E_CONNECTED, root);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 0217094c23ea..a8e17f14d7a2 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -461,6 +461,33 @@ void ovl_dentry_set_opaque(struct dentry *dentry)
 	ovl_dentry_set_flag(OVL_E_OPAQUE, dentry);
 }
 
+bool ovl_dentry_has_xwhiteouts(struct dentry *dentry)
+{
+	return ovl_dentry_test_flag(OVL_E_XWHITEOUTS, dentry);
+}
+
+void ovl_dentry_set_xwhiteouts(struct dentry *dentry)
+{
+	ovl_dentry_set_flag(OVL_E_XWHITEOUTS, dentry);
+}
+
+/*
+ * ovl_layer_set_xwhiteouts() is called before adding the overlay dir
+ * dentry to dcache, while readdir of that same directory happens after
+ * the overlay dir dentry is in dcache, so if some cpu observes that
+ * ovl_dentry_has_xwhiteouts(), it will also observe layer->has_xwhiteouts
+ * for the layers where xwhiteouts marker was found in that merge dir.
+ */
+void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs,
+			      const struct ovl_layer *layer)
+{
+	if (layer->has_xwhiteouts)
+		return;
+
+	/* Write once to read-mostly layer properties */
+	ofs->layers[layer->idx].has_xwhiteouts = true;
+}
+
 /*
  * For hard links and decoded file handles, it's possible for ovl_dentry_upper()
  * to return positive, while there's no actual upper alias for the inode.
@@ -739,19 +766,6 @@ bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path)
 	return res >= 0;
 }
 
-bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path)
-{
-	struct dentry *dentry = path->dentry;
-	int res;
-
-	/* xattr.whiteouts must be a directory */
-	if (!d_is_dir(dentry))
-		return false;
-
-	res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0);
-	return res >= 0;
-}
-
 /*
  * Load persistent uuid from xattr into s_uuid if found, or store a new
  * random generated value in s_uuid and in xattr.
@@ -811,20 +825,17 @@ fail:
 	return false;
 }
 
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
-			       enum ovl_xattr ox)
+char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path,
+			   enum ovl_xattr ox)
 {
 	int res;
 	char val;
 
 	if (!d_is_dir(path->dentry))
-		return false;
+		return 0;
 
 	res = ovl_path_getxattr(ofs, path, ox, &val, 1);
-	if (res == 1 && val == 'y')
-		return true;
-
-	return false;
+	return res == 1 ? val : 0;
 }
 
 #define OVL_XATTR_OPAQUE_POSTFIX	"opaque"
@@ -837,7 +848,6 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
 #define OVL_XATTR_METACOPY_POSTFIX	"metacopy"
 #define OVL_XATTR_PROTATTR_POSTFIX	"protattr"
 #define OVL_XATTR_XWHITEOUT_POSTFIX	"whiteout"
-#define OVL_XATTR_XWHITEOUTS_POSTFIX	"whiteouts"
 
 #define OVL_XATTR_TAB_ENTRY(x) \
 	[x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \
@@ -854,7 +864,6 @@ const char *const ovl_xattr_table[][2] = {
 	OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY),
 	OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR),
 	OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT),
-	OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS),
 };
 
 int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
diff --git a/fs/pidfs.c b/fs/pidfs.c
new file mode 100644
index 000000000000..8fd71a00be9c
--- /dev/null
+++ b/fs/pidfs.c
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <linux/mount.h>
+#include <linux/pid.h>
+#include <linux/pidfs.h>
+#include <linux/pid_namespace.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
+#include <linux/pseudo_fs.h>
+#include <linux/seq_file.h>
+#include <uapi/linux/pidfd.h>
+
+#include "internal.h"
+
+static int pidfd_release(struct inode *inode, struct file *file)
+{
+#ifndef CONFIG_FS_PID
+	struct pid *pid = file->private_data;
+
+	file->private_data = NULL;
+	put_pid(pid);
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+/**
+ * pidfd_show_fdinfo - print information about a pidfd
+ * @m: proc fdinfo file
+ * @f: file referencing a pidfd
+ *
+ * Pid:
+ * This function will print the pid that a given pidfd refers to in the
+ * pid namespace of the procfs instance.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its pid. This is
+ * similar to calling getppid() on a process whose parent is outside of
+ * its pid namespace.
+ *
+ * NSpid:
+ * If pid namespaces are supported then this function will also print
+ * the pid that a given pidfd refers to for all descendant pid namespaces
+ * starting from the current pid namespace of the instance, i.e. the
+ * Pid field and the first entry in the NSpid field will be identical.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its first NSpid
+ * entry and no others will be shown.
+ * Note that this differs from the Pid and NSpid fields in
+ * /proc/<pid>/status where Pid and NSpid are always shown relative to
+ * the pid namespace of the procfs instance. The difference becomes
+ * obvious when sending around a pidfd between pid namespaces from a
+ * different branch of the tree, i.e. where no ancestral relation is
+ * present between the pid namespaces:
+ * - create two new pid namespaces ns1 and ns2 in the initial pid
+ *   namespace (also take care to create new mount namespaces in the
+ *   new pid namespace and mount procfs)
+ * - create a process with a pidfd in ns1
+ * - send pidfd from ns1 to ns2
+ * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
+ *   have exactly one entry, which is 0
+ */
+static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	struct pid *pid = pidfd_pid(f);
+	struct pid_namespace *ns;
+	pid_t nr = -1;
+
+	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
+		ns = proc_pid_ns(file_inode(m->file)->i_sb);
+		nr = pid_nr_ns(pid, ns);
+	}
+
+	seq_put_decimal_ll(m, "Pid:\t", nr);
+
+#ifdef CONFIG_PID_NS
+	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
+	if (nr > 0) {
+		int i;
+
+		/* If nr is positive it means that 'pid' is valid and that
+		 * ns, i.e. the pid namespace associated with the procfs
+		 * instance, is in the pid namespace hierarchy of pid.
+		 * Start at one below the already printed level.
+		 */
+		for (i = ns->level + 1; i <= pid->level; i++)
+			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
+	}
+#endif
+	seq_putc(m, '\n');
+}
+#endif
+
+/*
+ * Poll support for process exit notification.
+ */
+static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+	struct pid *pid = pidfd_pid(file);
+	bool thread = file->f_flags & PIDFD_THREAD;
+	struct task_struct *task;
+	__poll_t poll_flags = 0;
+
+	poll_wait(file, &pid->wait_pidfd, pts);
+	/*
+	 * Depending on PIDFD_THREAD, inform pollers when the thread
+	 * or the whole thread-group exits.
+	 */
+	guard(rcu)();
+	task = pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
+	else if (task->exit_state && (thread || thread_group_empty(task)))
+		poll_flags = EPOLLIN | EPOLLRDNORM;
+
+	return poll_flags;
+}
+
+static const struct file_operations pidfs_file_operations = {
+	.release	= pidfd_release,
+	.poll		= pidfd_poll,
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= pidfd_show_fdinfo,
+#endif
+};
+
+struct pid *pidfd_pid(const struct file *file)
+{
+	if (file->f_op != &pidfs_file_operations)
+		return ERR_PTR(-EBADF);
+#ifdef CONFIG_FS_PID
+	return file_inode(file)->i_private;
+#else
+	return file->private_data;
+#endif
+}
+
+#ifdef CONFIG_FS_PID
+static struct vfsmount *pidfs_mnt __ro_after_init;
+
+/*
+ * The vfs falls back to simple_setattr() if i_op->setattr() isn't
+ * implemented. Let's reject it completely until we have a clean
+ * permission concept for pidfds.
+ */
+static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+			 struct iattr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
+static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
+			 struct kstat *stat, u32 request_mask,
+			 unsigned int query_flags)
+{
+	struct inode *inode = d_inode(path->dentry);
+
+	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+	return 0;
+}
+
+static const struct inode_operations pidfs_inode_operations = {
+	.getattr = pidfs_getattr,
+	.setattr = pidfs_setattr,
+};
+
+static void pidfs_evict_inode(struct inode *inode)
+{
+	struct pid *pid = inode->i_private;
+
+	clear_inode(inode);
+	put_pid(pid);
+}
+
+static const struct super_operations pidfs_sops = {
+	.drop_inode	= generic_delete_inode,
+	.evict_inode	= pidfs_evict_inode,
+	.statfs		= simple_statfs,
+};
+
+static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	return dynamic_dname(buffer, buflen, "pidfd:[%lu]",
+			     d_inode(dentry)->i_ino);
+}
+
+static const struct dentry_operations pidfs_dentry_operations = {
+	.d_delete	= always_delete_dentry,
+	.d_dname	= pidfs_dname,
+	.d_prune	= stashed_dentry_prune,
+};
+
+static void pidfs_init_inode(struct inode *inode, void *data)
+{
+	inode->i_private = data;
+	inode->i_flags |= S_PRIVATE;
+	inode->i_mode |= S_IRWXU;
+	inode->i_op = &pidfs_inode_operations;
+	inode->i_fop = &pidfs_file_operations;
+}
+
+static void pidfs_put_data(void *data)
+{
+	struct pid *pid = data;
+	put_pid(pid);
+}
+
+static const struct stashed_operations pidfs_stashed_ops = {
+	.init_inode = pidfs_init_inode,
+	.put_data = pidfs_put_data,
+};
+
+static int pidfs_init_fs_context(struct fs_context *fc)
+{
+	struct pseudo_fs_context *ctx;
+
+	ctx = init_pseudo(fc, PID_FS_MAGIC);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->ops = &pidfs_sops;
+	ctx->dops = &pidfs_dentry_operations;
+	fc->s_fs_info = (void *)&pidfs_stashed_ops;
+	return 0;
+}
+
+static struct file_system_type pidfs_type = {
+	.name			= "pidfs",
+	.init_fs_context	= pidfs_init_fs_context,
+	.kill_sb		= kill_anon_super,
+};
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+{
+
+	struct file *pidfd_file;
+	struct path path;
+	int ret;
+
+	/*
+	 * Inode numbering for pidfs starts at RESERVED_PIDS + 1.
+	 * This avoids collisions with the root inode, which is 1
+	 * for pseudo filesystems.
+	 */
+	ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt,
+				get_pid(pid), &path);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	pidfd_file = dentry_open(&path, flags, current_cred());
+	path_put(&path);
+	return pidfd_file;
+}
+
+void __init pidfs_init(void)
+{
+	pidfs_mnt = kern_mount(&pidfs_type);
+	if (IS_ERR(pidfs_mnt))
+		panic("Failed to mount pidfs pseudo filesystem");
+}
+
+bool is_pidfs_sb(const struct super_block *sb)
+{
+	return sb == pidfs_mnt->mnt_sb;
+}
+
+#else /* !CONFIG_FS_PID */
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+{
+	struct file *pidfd_file;
+
+	pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid,
+					flags | O_RDWR);
+	if (IS_ERR(pidfd_file))
+		return pidfd_file;
+
+	get_pid(pid);
+	return pidfd_file;
+}
+
+void __init pidfs_init(void) { }
+bool is_pidfs_sb(const struct super_block *sb)
+{
+	return false;
+}
+#endif
diff --git a/fs/pipe.c b/fs/pipe.c
index f1adbfe743d4..50c8a8596b52 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -76,18 +76,20 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
  */
 
-static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
+#define cmp_int(l, r)		((l > r) - (l < r))
+
+#ifdef CONFIG_PROVE_LOCKING
+static int pipe_lock_cmp_fn(const struct lockdep_map *a,
+			    const struct lockdep_map *b)
 {
-	if (pipe->files)
-		mutex_lock_nested(&pipe->mutex, subclass);
+	return cmp_int((unsigned long) a, (unsigned long) b);
 }
+#endif
 
 void pipe_lock(struct pipe_inode_info *pipe)
 {
-	/*
-	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
-	 */
-	pipe_lock_nested(pipe, I_MUTEX_PARENT);
+	if (pipe->files)
+		mutex_lock(&pipe->mutex);
 }
 EXPORT_SYMBOL(pipe_lock);
 
@@ -98,28 +100,16 @@ void pipe_unlock(struct pipe_inode_info *pipe)
 }
 EXPORT_SYMBOL(pipe_unlock);
 
-static inline void __pipe_lock(struct pipe_inode_info *pipe)
-{
-	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
-}
-
-static inline void __pipe_unlock(struct pipe_inode_info *pipe)
-{
-	mutex_unlock(&pipe->mutex);
-}
-
 void pipe_double_lock(struct pipe_inode_info *pipe1,
 		      struct pipe_inode_info *pipe2)
 {
 	BUG_ON(pipe1 == pipe2);
 
-	if (pipe1 < pipe2) {
-		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
-		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
-	} else {
-		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
-		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
-	}
+	if (pipe1 > pipe2)
+		swap(pipe1, pipe2);
+
+	pipe_lock(pipe1);
+	pipe_lock(pipe2);
 }
 
 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
@@ -271,7 +261,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 		return 0;
 
 	ret = 0;
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 
 	/*
 	 * We only wake up writers if the pipe was full when we started
@@ -368,7 +358,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 			ret = -EAGAIN;
 			break;
 		}
-		__pipe_unlock(pipe);
+		mutex_unlock(&pipe->mutex);
 
 		/*
 		 * We only get here if we didn't actually read anything.
@@ -400,13 +390,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
 			return -ERESTARTSYS;
 
-		__pipe_lock(pipe);
+		mutex_lock(&pipe->mutex);
 		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
 		wake_next_reader = true;
 	}
 	if (pipe_empty(pipe->head, pipe->tail))
 		wake_next_reader = false;
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 
 	if (was_full)
 		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
@@ -462,7 +452,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 	if (unlikely(total_len == 0))
 		return 0;
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 
 	if (!pipe->readers) {
 		send_sig(SIGPIPE, current, 0);
@@ -582,19 +572,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 		 * after waiting we need to re-check whether the pipe
 		 * become empty while we dropped the lock.
 		 */
-		__pipe_unlock(pipe);
+		mutex_unlock(&pipe->mutex);
 		if (was_empty)
 			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
-		__pipe_lock(pipe);
+		mutex_lock(&pipe->mutex);
 		was_empty = pipe_empty(pipe->head, pipe->tail);
 		wake_next_writer = true;
 	}
 out:
 	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
 		wake_next_writer = false;
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 
 	/*
 	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
@@ -629,7 +619,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 	switch (cmd) {
 	case FIONREAD:
-		__pipe_lock(pipe);
+		mutex_lock(&pipe->mutex);
 		count = 0;
 		head = pipe->head;
 		tail = pipe->tail;
@@ -639,16 +629,16 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			count += pipe->bufs[tail & mask].len;
 			tail++;
 		}
-		__pipe_unlock(pipe);
+		mutex_unlock(&pipe->mutex);
 
 		return put_user(count, (int __user *)arg);
 
 #ifdef CONFIG_WATCH_QUEUE
 	case IOC_WATCH_QUEUE_SET_SIZE: {
 		int ret;
-		__pipe_lock(pipe);
+		mutex_lock(&pipe->mutex);
 		ret = watch_queue_set_size(pipe, arg);
-		__pipe_unlock(pipe);
+		mutex_unlock(&pipe->mutex);
 		return ret;
 	}
 
@@ -734,7 +724,7 @@ pipe_release(struct inode *inode, struct file *file)
 {
 	struct pipe_inode_info *pipe = file->private_data;
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 	if (file->f_mode & FMODE_READ)
 		pipe->readers--;
 	if (file->f_mode & FMODE_WRITE)
@@ -747,7 +737,7 @@ pipe_release(struct inode *inode, struct file *file)
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 
 	put_pipe_info(inode, pipe);
 	return 0;
@@ -759,7 +749,7 @@ pipe_fasync(int fd, struct file *filp, int on)
 	struct pipe_inode_info *pipe = filp->private_data;
 	int retval = 0;
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 	if (filp->f_mode & FMODE_READ)
 		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
 	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
@@ -768,7 +758,7 @@ pipe_fasync(int fd, struct file *filp, int on)
 			/* this can happen only if on == T */
 			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
 	}
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 	return retval;
 }
 
@@ -834,6 +824,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
 		pipe->nr_accounted = pipe_bufs;
 		pipe->user = user;
 		mutex_init(&pipe->mutex);
+		lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL);
 		return pipe;
 	}
 
@@ -1144,7 +1135,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	filp->private_data = pipe;
 	/* OK, we have a pipe and it's pinned down */
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 
 	/* We can only do regular read/write on fifos */
 	stream_open(inode, filp);
@@ -1214,7 +1205,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	}
 
 	/* Ok! */
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 	return 0;
 
 err_rd:
@@ -1230,7 +1221,7 @@ err_wr:
 	goto err;
 
 err:
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 
 	put_pipe_info(inode, pipe);
 	return ret;
@@ -1411,7 +1402,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
 	if (!pipe)
 		return -EBADF;
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 
 	switch (cmd) {
 	case F_SETPIPE_SZ:
@@ -1425,7 +1416,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
 		break;
 	}
 
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 	return ret;
 }
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ff08a8957552..34a47fb0c57f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -477,13 +477,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	int permitted;
 	struct mm_struct *mm;
 	unsigned long long start_time;
-	unsigned long cmin_flt = 0, cmaj_flt = 0;
-	unsigned long  min_flt = 0,  maj_flt = 0;
-	u64 cutime, cstime, utime, stime;
-	u64 cgtime, gtime;
+	unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt;
+	u64 cutime, cstime, cgtime, utime, stime, gtime;
 	unsigned long rsslim = 0;
 	unsigned long flags;
 	int exit_code = task->exit_code;
+	struct signal_struct *sig = task->signal;
+	unsigned int seq = 1;
 
 	state = *get_task_state(task);
 	vsize = eip = esp = 0;
@@ -511,12 +511,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	sigemptyset(&sigign);
 	sigemptyset(&sigcatch);
-	cutime = cstime = utime = stime = 0;
-	cgtime = gtime = 0;
 
 	if (lock_task_sighand(task, &flags)) {
-		struct signal_struct *sig = task->signal;
-
 		if (sig->tty) {
 			struct pid *pgrp = tty_get_pgrp(sig->tty);
 			tty_pgrp = pid_nr_ns(pgrp, ns);
@@ -527,28 +523,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		num_threads = get_nr_threads(task);
 		collect_sigign_sigcatch(task, &sigign, &sigcatch);
 
-		cmin_flt = sig->cmin_flt;
-		cmaj_flt = sig->cmaj_flt;
-		cutime = sig->cutime;
-		cstime = sig->cstime;
-		cgtime = sig->cgtime;
 		rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
 
-		/* add up live thread stats at the group level */
 		if (whole) {
-			struct task_struct *t;
-
-			__for_each_thread(sig, t) {
-				min_flt += t->min_flt;
-				maj_flt += t->maj_flt;
-				gtime += task_gtime(t);
-			}
-
-			min_flt += sig->min_flt;
-			maj_flt += sig->maj_flt;
-			thread_group_cputime_adjusted(task, &utime, &stime);
-			gtime += sig->gtime;
-
 			if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
 				exit_code = sig->group_exit_code;
 		}
@@ -562,10 +539,41 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	if (permitted && (!whole || num_threads < 2))
 		wchan = !task_is_running(task);
-	if (!whole) {
+
+	do {
+		seq++; /* 2 on the 1st/lockless path, otherwise odd */
+		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+
+		cmin_flt = sig->cmin_flt;
+		cmaj_flt = sig->cmaj_flt;
+		cutime = sig->cutime;
+		cstime = sig->cstime;
+		cgtime = sig->cgtime;
+
+		if (whole) {
+			struct task_struct *t;
+
+			min_flt = sig->min_flt;
+			maj_flt = sig->maj_flt;
+			gtime = sig->gtime;
+
+			rcu_read_lock();
+			__for_each_thread(sig, t) {
+				min_flt += t->min_flt;
+				maj_flt += t->maj_flt;
+				gtime += task_gtime(t);
+			}
+			rcu_read_unlock();
+		}
+	} while (need_seqretry(&sig->stats_lock, seq));
+	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+
+	if (whole) {
+		thread_group_cputime_adjusted(task, &utime, &stime);
+	} else {
+		task_cputime_adjusted(task, &utime, &stime);
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
-		task_cputime_adjusted(task, &utime, &stime);
 		gtime = task_gtime(task);
 	}
 
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 98a031ac2648..18550c071d71 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1878,8 +1878,6 @@ void proc_pid_evict_inode(struct proc_inode *ei)
 		hlist_del_init_rcu(&ei->sibling_inodes);
 		spin_unlock(&pid->lock);
 	}
-
-	put_pid(pid);
 }
 
 struct inode *proc_pid_make_inode(struct super_block *sb,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index b33e490e3fd9..dcd513dccf55 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -30,7 +30,6 @@
 
 static void proc_evict_inode(struct inode *inode)
 {
-	struct proc_dir_entry *de;
 	struct ctl_table_header *head;
 	struct proc_inode *ei = PROC_I(inode);
 
@@ -38,17 +37,8 @@ static void proc_evict_inode(struct inode *inode)
 	clear_inode(inode);
 
 	/* Stop tracking associated processes */
-	if (ei->pid) {
+	if (ei->pid)
 		proc_pid_evict_inode(ei);
-		ei->pid = NULL;
-	}
-
-	/* Let go of any associated proc directory entry */
-	de = ei->pde;
-	if (de) {
-		pde_put(de);
-		ei->pde = NULL;
-	}
 
 	head = ei->sysctl;
 	if (head) {
@@ -80,6 +70,13 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
 
 static void proc_free_inode(struct inode *inode)
 {
+	struct proc_inode *ei = PROC_I(inode);
+
+	if (ei->pid)
+		put_pid(ei->pid);
+	/* Let go of any associated proc directory entry */
+	if (ei->pde)
+		pde_put(ei->pde);
 	kmem_cache_free(proc_inode_cachep, PROC_I(inode));
 }
 
@@ -95,7 +92,7 @@ void __init proc_init_kmemcache(void)
 	proc_inode_cachep = kmem_cache_create("proc_inode_cache",
 					     sizeof(struct proc_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+						SLAB_ACCOUNT|
 						SLAB_PANIC),
 					     init_once);
 	pde_opener_cache =
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b55dbc70287b..06a297a27ba3 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -271,7 +271,7 @@ static void proc_kill_sb(struct super_block *sb)
 
 	kill_anon_super(sb);
 	put_pid_ns(fs_info->pid_ns);
-	kfree(fs_info);
+	kfree_rcu(fs_info, rcu);
 }
 
 static struct file_system_type proc_fs_type = {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 6eb9bb369b57..7b5711f76709 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -21,6 +21,7 @@
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/statfs.h>
+#include <linux/fs_context.h>
 #include "qnx4.h"
 
 #define QNX4_VERSION  4
@@ -30,28 +31,33 @@ static const struct super_operations qnx4_sops;
 
 static struct inode *qnx4_alloc_inode(struct super_block *sb);
 static void qnx4_free_inode(struct inode *inode);
-static int qnx4_remount(struct super_block *sb, int *flags, char *data);
 static int qnx4_statfs(struct dentry *, struct kstatfs *);
+static int qnx4_get_tree(struct fs_context *fc);
 
 static const struct super_operations qnx4_sops =
 {
 	.alloc_inode	= qnx4_alloc_inode,
 	.free_inode	= qnx4_free_inode,
 	.statfs		= qnx4_statfs,
-	.remount_fs	= qnx4_remount,
 };
 
-static int qnx4_remount(struct super_block *sb, int *flags, char *data)
+static int qnx4_reconfigure(struct fs_context *fc)
 {
+	struct super_block *sb = fc->root->d_sb;
 	struct qnx4_sb_info *qs;
 
 	sync_filesystem(sb);
 	qs = qnx4_sb(sb);
 	qs->Version = QNX4_VERSION;
-	*flags |= SB_RDONLY;
+	fc->sb_flags |= SB_RDONLY;
 	return 0;
 }
 
+static const struct fs_context_operations qnx4_context_opts = {
+	.get_tree	= qnx4_get_tree,
+	.reconfigure	= qnx4_reconfigure,
+};
+
 static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create )
 {
 	unsigned long phys;
@@ -183,12 +189,13 @@ static const char *qnx4_checkroot(struct super_block *sb,
 	return "bitmap file not found.";
 }
 
-static int qnx4_fill_super(struct super_block *s, void *data, int silent)
+static int qnx4_fill_super(struct super_block *s, struct fs_context *fc)
 {
 	struct buffer_head *bh;
 	struct inode *root;
 	const char *errmsg;
 	struct qnx4_sb_info *qs;
+	int silent = fc->sb_flags & SB_SILENT;
 
 	qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL);
 	if (!qs)
@@ -216,7 +223,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 	errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data);
 	brelse(bh);
 	if (errmsg != NULL) {
- 		if (!silent)
+		if (!silent)
 			printk(KERN_ERR "qnx4: %s\n", errmsg);
 		return -EINVAL;
 	}
@@ -235,6 +242,18 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 	return 0;
 }
 
+static int qnx4_get_tree(struct fs_context *fc)
+{
+	return get_tree_bdev(fc, qnx4_fill_super);
+}
+
+static int qnx4_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &qnx4_context_opts;
+
+	return 0;
+}
+
 static void qnx4_kill_sb(struct super_block *sb)
 {
 	struct qnx4_sb_info *qs = qnx4_sb(sb);
@@ -376,18 +395,12 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(qnx4_inode_cachep);
 }
 
-static struct dentry *qnx4_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
-{
-	return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
-}
-
 static struct file_system_type qnx4_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "qnx4",
-	.mount		= qnx4_mount,
-	.kill_sb	= qnx4_kill_sb,
-	.fs_flags	= FS_REQUIRES_DEV,
+	.owner			= THIS_MODULE,
+	.name			= "qnx4",
+	.kill_sb		= qnx4_kill_sb,
+	.fs_flags		= FS_REQUIRES_DEV,
+	.init_fs_context	= qnx4_init_fs_context,
 };
 MODULE_ALIAS_FS("qnx4");
 
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index a286c545717f..405913f4faff 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -615,7 +615,7 @@ static int init_inodecache(void)
 	qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
 					     sizeof(struct qnx6_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					     init_once);
 	if (!qnx6_inode_cachep)
 		return -ENOMEM;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 67b5510beded..2cc469d481a2 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -670,7 +670,6 @@ static int __init init_inodecache(void)
 						  sizeof(struct
 							 reiserfs_inode_info),
 						  0, (SLAB_RECLAIM_ACCOUNT|
-						      SLAB_MEM_SPREAD|
 						      SLAB_ACCOUNT),
 						  init_once);
 	if (reiserfs_inode_cachep == NULL)
diff --git a/fs/remap_range.c b/fs/remap_range.c
index f8c1120b8311..de07f978ce3e 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -373,9 +373,9 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 }
 EXPORT_SYMBOL(generic_remap_file_range_prep);
 
-loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
-			   struct file *file_out, loff_t pos_out,
-			   loff_t len, unsigned int remap_flags)
+loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+			    struct file *file_out, loff_t pos_out,
+			    loff_t len, unsigned int remap_flags)
 {
 	loff_t ret;
 
@@ -391,23 +391,6 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
 	if (!file_in->f_op->remap_file_range)
 		return -EOPNOTSUPP;
 
-	ret = file_in->f_op->remap_file_range(file_in, pos_in,
-			file_out, pos_out, len, remap_flags);
-	if (ret < 0)
-		return ret;
-
-	fsnotify_access(file_in);
-	fsnotify_modify(file_out);
-	return ret;
-}
-EXPORT_SYMBOL(do_clone_file_range);
-
-loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
-			    struct file *file_out, loff_t pos_out,
-			    loff_t len, unsigned int remap_flags)
-{
-	loff_t ret;
-
 	ret = remap_verify_area(file_in, pos_in, len, false);
 	if (ret)
 		return ret;
@@ -417,10 +400,14 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
 		return ret;
 
 	file_start_write(file_out);
-	ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
-				  remap_flags);
+	ret = file_in->f_op->remap_file_range(file_in, pos_in,
+			file_out, pos_out, len, remap_flags);
 	file_end_write(file_out);
+	if (ret < 0)
+		return ret;
 
+	fsnotify_access(file_in);
+	fsnotify_modify(file_out);
 	return ret;
 }
 EXPORT_SYMBOL(vfs_clone_file_range);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 545ad44f96b8..85641c3f5782 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -630,8 +630,8 @@ static int __init init_romfs_fs(void)
 	romfs_inode_cachep =
 		kmem_cache_create("romfs_i",
 				  sizeof(struct romfs_inode_info), 0,
-				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
-				  SLAB_ACCOUNT, romfs_i_init_once);
+				  SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
+				  romfs_i_init_once);
 
 	if (!romfs_inode_cachep) {
 		pr_err("Failed to initialise inode cache\n");
diff --git a/fs/select.c b/fs/select.c
index 0ee55af1a55c..9515c3fa1a03 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -476,7 +476,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
 		wait->_key |= POLLOUT_SET;
 }
 
-static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
+static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
 {
 	ktime_t expire, *to = NULL;
 	struct poll_wqueues table;
@@ -839,7 +839,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
 
 struct poll_list {
 	struct poll_list *next;
-	int len;
+	unsigned int len;
 	struct pollfd entries[];
 };
 
@@ -975,14 +975,15 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 		struct timespec64 *end_time)
 {
 	struct poll_wqueues table;
-	int err = -EFAULT, fdcount, len;
+	int err = -EFAULT, fdcount;
 	/* Allocate small arguments on the stack to save memory and be
 	   faster - use long to make sure the buffer is aligned properly
 	   on 64 bit archs to avoid unaligned access */
 	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
 	struct poll_list *const head = (struct poll_list *)stack_pps;
  	struct poll_list *walk = head;
- 	unsigned long todo = nfds;
+	unsigned int todo = nfds;
+	unsigned int len;
 
 	if (nfds > rlimit(RLIMIT_NOFILE))
 		return -EINVAL;
@@ -998,9 +999,9 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 					sizeof(struct pollfd) * walk->len))
 			goto out_fds;
 
-		todo -= walk->len;
-		if (!todo)
+		if (walk->len >= todo)
 			break;
+		todo -= walk->len;
 
 		len = min(todo, POLLFD_PER_PAGE);
 		walk = walk->next = kmalloc(struct_size(walk, entries, len),
@@ -1020,7 +1021,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 
 	for (walk = head; walk; walk = walk->next) {
 		struct pollfd *fds = walk->entries;
-		int j;
+		unsigned int j;
 
 		for (j = walk->len; j; fds++, ufds++, j--)
 			unsafe_put_user(fds->revents, &ufds->revents, Efault);
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index 971892620504..3de5047a7ff9 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -145,21 +145,27 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
 	struct cached_fid *cfid;
 	struct cached_fids *cfids;
 	const char *npath;
+	int retries = 0, cur_sleep = 1;
 
 	if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache ||
 	    is_smb1_server(tcon->ses->server) || (dir_cache_timeout == 0))
 		return -EOPNOTSUPP;
 
 	ses = tcon->ses;
-	server = cifs_pick_channel(ses);
 	cfids = tcon->cfids;
 
-	if (!server->ops->new_lease_key)
-		return -EIO;
-
 	if (cifs_sb->root == NULL)
 		return -ENOENT;
 
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	oplock = SMB2_OPLOCK_LEVEL_II;
+	server = cifs_pick_channel(ses);
+
+	if (!server->ops->new_lease_key)
+		return -EIO;
+
 	utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
 	if (!utf16_path)
 		return -ENOMEM;
@@ -236,6 +242,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
 		.desired_access =  FILE_READ_DATA | FILE_READ_ATTRIBUTES,
 		.disposition = FILE_OPEN,
 		.fid = pfid,
+		.replay = !!(retries),
 	};
 
 	rc = SMB2_open_init(tcon, server,
@@ -268,6 +275,11 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
 	 */
 	cfid->has_lease = true;
 
+	if (retries) {
+		smb2_set_replay(server, &rqst[0]);
+		smb2_set_replay(server, &rqst[1]);
+	}
+
 	rc = compound_send_recv(xid, ses, server,
 				flags, 2, rqst,
 				resp_buftype, rsp_iov);
@@ -367,6 +379,11 @@ out:
 		atomic_inc(&tcon->num_remote_opens);
 	}
 	kfree(utf16_path);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index ef4c2e3c9fa6..6322f0f68a17 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -572,7 +572,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 		len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
 		UniStrupr(user);
 	} else {
-		memset(user, '\0', 2);
+		*(u16 *)user = 0;
 	}
 
 	rc = crypto_shash_update(ses->server->secmech.hmacmd5,
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index cbcb98d5f2d7..fb368b191eef 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -396,7 +396,7 @@ cifs_alloc_inode(struct super_block *sb)
 	spin_lock_init(&cifs_inode->writers_lock);
 	cifs_inode->writers = 0;
 	cifs_inode->netfs.inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
-	cifs_inode->server_eof = 0;
+	cifs_inode->netfs.remote_i_size = 0;
 	cifs_inode->uniqueid = 0;
 	cifs_inode->createtime = 0;
 	cifs_inode->epoch = 0;
@@ -1169,6 +1169,9 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
 {
 	char *target_path;
 
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+
 	target_path = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (!target_path)
 		return ERR_PTR(-ENOMEM);
@@ -1377,6 +1380,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
 	struct inode *src_inode = file_inode(src_file);
 	struct inode *target_inode = file_inode(dst_file);
 	struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode);
+	struct cifsInodeInfo *target_cifsi = CIFS_I(target_inode);
 	struct cifsFileInfo *smb_file_src;
 	struct cifsFileInfo *smb_file_target;
 	struct cifs_tcon *src_tcon;
@@ -1425,7 +1429,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
 	 * Advance the EOF marker after the flush above to the end of the range
 	 * if it's short of that.
 	 */
-	if (src_cifsi->server_eof < off + len) {
+	if (src_cifsi->netfs.remote_i_size < off + len) {
 		rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len);
 		if (rc < 0)
 			goto unlock;
@@ -1449,12 +1453,22 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
 	/* Discard all the folios that overlap the destination region. */
 	truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
 
+	fscache_invalidate(cifs_inode_cookie(target_inode), NULL,
+			   i_size_read(target_inode), 0);
+
 	rc = file_modified(dst_file);
 	if (!rc) {
 		rc = target_tcon->ses->server->ops->copychunk_range(xid,
 			smb_file_src, smb_file_target, off, len, destoff);
-		if (rc > 0 && destoff + rc > i_size_read(target_inode))
+		if (rc > 0 && destoff + rc > i_size_read(target_inode)) {
 			truncate_setsize(target_inode, destoff + rc);
+			netfs_resize_file(&target_cifsi->netfs,
+					  i_size_read(target_inode), true);
+			fscache_resize_cookie(cifs_inode_cookie(target_inode),
+					      i_size_read(target_inode));
+		}
+		if (rc > 0 && destoff + rc > target_cifsi->netfs.zero_point)
+			target_cifsi->netfs.zero_point = destoff + rc;
 	}
 
 	file_accessed(src_file);
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 20036fb16cec..53c75cfb33ab 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -50,6 +50,11 @@
 #define CIFS_DEF_ACTIMEO (1 * HZ)
 
 /*
+ * max sleep time before retry to server
+ */
+#define CIFS_MAX_SLEEP 2000
+
+/*
  * max attribute cache timeout (jiffies) - 2^30
  */
 #define CIFS_MAX_ACTIMEO (1 << 30)
@@ -82,7 +87,7 @@
 #define SMB_INTERFACE_POLL_INTERVAL	600
 
 /* maximum number of PDUs in one compound */
-#define MAX_COMPOUND 5
+#define MAX_COMPOUND 7
 
 /*
  * Default number of credits to keep available for SMB3.
@@ -1027,6 +1032,8 @@ struct cifs_chan {
 	__u8 signkey[SMB3_SIGN_KEY_SIZE];
 };
 
+#define CIFS_SES_FLAG_SCALE_CHANNELS (0x1)
+
 /*
  * Session structure.  One of these for each uid session with a particular host
  */
@@ -1059,6 +1066,7 @@ struct cifs_ses {
 	enum securityEnum sectype; /* what security flavor was specified? */
 	bool sign;		/* is signing required? */
 	bool domainAuto:1;
+	unsigned int flags;
 	__u16 session_flags;
 	__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
 	__u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE];
@@ -1370,6 +1378,7 @@ struct cifs_open_parms {
 	struct cifs_fid *fid;
 	umode_t mode;
 	bool reconnect:1;
+	bool replay:1; /* indicates that this open is for a replay */
 };
 
 struct cifs_fid {
@@ -1501,6 +1510,7 @@ struct cifs_writedata {
 	struct smbd_mr			*mr;
 #endif
 	struct cifs_credits		credits;
+	bool				replay;
 };
 
 /*
@@ -1561,7 +1571,6 @@ struct cifsInodeInfo {
 	spinlock_t writers_lock;
 	unsigned int writers;		/* Number of writers on this inode */
 	unsigned long time;		/* jiffies of last update of inode */
-	u64  server_eof;		/* current file size on server -- protected by i_lock */
 	u64  uniqueid;			/* server inode number */
 	u64  createtime;		/* creation time on server */
 	__u8 lease_key[SMB2_LEASE_KEY_SIZE];	/* lease key for this inode */
@@ -1831,6 +1840,13 @@ static inline bool is_retryable_error(int error)
 	return false;
 }
 
+static inline bool is_replayable_error(int error)
+{
+	if (error == -EAGAIN || error == -ECONNABORTED)
+		return true;
+	return false;
+}
+
 
 /* cifs_get_writable_file() flags */
 #define FIND_WR_ANY         0
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index bfd568f89710..ac9595504f4b 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -233,6 +233,12 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
 	list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
 		/* check if iface is still active */
 		spin_lock(&ses->chan_lock);
+		if (cifs_ses_get_chan_index(ses, server) ==
+		    CIFS_INVAL_CHAN_INDEX) {
+			spin_unlock(&ses->chan_lock);
+			continue;
+		}
+
 		if (!cifs_chan_is_iface_active(ses, server)) {
 			spin_unlock(&ses->chan_lock);
 			cifs_chan_update_iface(ses, server);
@@ -3438,8 +3444,18 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx)
 	 * the user on mount
 	 */
 	if ((cifs_sb->ctx->wsize == 0) ||
-	    (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx)))
-		cifs_sb->ctx->wsize = server->ops->negotiate_wsize(tcon, ctx);
+	    (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx))) {
+		cifs_sb->ctx->wsize =
+			round_down(server->ops->negotiate_wsize(tcon, ctx), PAGE_SIZE);
+		/*
+		 * in the very unlikely event that the server sent a max write size under PAGE_SIZE,
+		 * (which would get rounded down to 0) then reset wsize to absolute minimum eg 4096
+		 */
+		if (cifs_sb->ctx->wsize == 0) {
+			cifs_sb->ctx->wsize = PAGE_SIZE;
+			cifs_dbg(VFS, "wsize too small, reset to minimum ie PAGE_SIZE, usually 4096\n");
+		}
+	}
 	if ((cifs_sb->ctx->rsize == 0) ||
 	    (cifs_sb->ctx->rsize > server->ops->negotiate_rsize(tcon, ctx)))
 		cifs_sb->ctx->rsize = server->ops->negotiate_rsize(tcon, ctx);
@@ -4228,6 +4244,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
 
 	/* only send once per connect */
 	spin_lock(&tcon->tc_lock);
+
+	/* if tcon is marked for needing reconnect, update state */
+	if (tcon->need_reconnect)
+		tcon->status = TID_NEED_TCON;
+
 	if (tcon->status == TID_GOOD) {
 		spin_unlock(&tcon->tc_lock);
 		return 0;
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index a8a1d386da65..449c59830039 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -565,6 +565,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
 
 	/* only send once per connect */
 	spin_lock(&tcon->tc_lock);
+
+	/* if tcon is marked for needing reconnect, update state */
+	if (tcon->need_reconnect)
+		tcon->status = TID_NEED_TCON;
+
 	if (tcon->status == TID_GOOD) {
 		spin_unlock(&tcon->tc_lock);
 		return 0;
@@ -625,8 +630,8 @@ out:
 		spin_lock(&tcon->tc_lock);
 		if (tcon->status == TID_IN_TCON)
 			tcon->status = TID_GOOD;
-		spin_unlock(&tcon->tc_lock);
 		tcon->need_reconnect = false;
+		spin_unlock(&tcon->tc_lock);
 	}
 
 	return rc;
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index ee8b264fbc31..c3b8e7091a4d 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -87,7 +87,7 @@ void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len
 			continue;
 		if (!folio_test_writeback(folio)) {
 			WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
-				  len, start, folio_index(folio), end);
+				  len, start, folio->index, end);
 			continue;
 		}
 
@@ -120,7 +120,7 @@ void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len
 			continue;
 		if (!folio_test_writeback(folio)) {
 			WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
-				  len, start, folio_index(folio), end);
+				  len, start, folio->index, end);
 			continue;
 		}
 
@@ -151,7 +151,7 @@ void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int le
 	xas_for_each(&xas, folio, end) {
 		if (!folio_test_writeback(folio)) {
 			WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
-				  len, start, folio_index(folio), end);
+				  len, start, folio->index, end);
 			continue;
 		}
 
@@ -175,6 +175,9 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
 
 	/* only send once per connect */
 	spin_lock(&tcon->tc_lock);
+	if (tcon->need_reconnect)
+		tcon->status = TID_NEED_RECON;
+
 	if (tcon->status != TID_NEED_RECON) {
 		spin_unlock(&tcon->tc_lock);
 		return;
@@ -2124,8 +2127,8 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
 {
 	loff_t end_of_write = offset + bytes_written;
 
-	if (end_of_write > cifsi->server_eof)
-		cifsi->server_eof = end_of_write;
+	if (end_of_write > cifsi->netfs.remote_i_size)
+		netfs_resize_file(&cifsi->netfs, end_of_write, true);
 }
 
 static ssize_t
@@ -2655,7 +2658,7 @@ static void cifs_extend_writeback(struct address_space *mapping,
 				continue;
 			if (xa_is_value(folio))
 				break;
-			if (folio_index(folio) != index)
+			if (folio->index != index)
 				break;
 			if (!folio_try_get_rcu(folio)) {
 				xas_reset(&xas);
@@ -2903,7 +2906,7 @@ redo_folio:
 					goto skip_write;
 			}
 
-			if (folio_mapping(folio) != mapping ||
+			if (folio->mapping != mapping ||
 			    !folio_test_dirty(folio)) {
 				start += folio_size(folio);
 				folio_unlock(folio);
@@ -3251,8 +3254,8 @@ cifs_uncached_writev_complete(struct work_struct *work)
 
 	spin_lock(&inode->i_lock);
 	cifs_update_eof(cifsi, wdata->offset, wdata->bytes);
-	if (cifsi->server_eof > inode->i_size)
-		i_size_write(inode, cifsi->server_eof);
+	if (cifsi->netfs.remote_i_size > inode->i_size)
+		i_size_write(inode, cifsi->netfs.remote_i_size);
 	spin_unlock(&inode->i_lock);
 
 	complete(&wdata->done);
@@ -3304,6 +3307,7 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
 			if (wdata->cfile->invalidHandle)
 				rc = -EAGAIN;
 			else {
+				wdata->replay = true;
 #ifdef CONFIG_CIFS_SMB_DIRECT
 				if (wdata->mr) {
 					wdata->mr->need_invalidate = true;
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 52cbef2eeb28..4b2f5aa2ea0e 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -211,7 +211,7 @@ cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_c
 
 	switch (match_token(value, cifs_secflavor_tokens, args)) {
 	case Opt_sec_krb5p:
-		cifs_errorf(fc, "sec=krb5p is not supported!\n");
+		cifs_errorf(fc, "sec=krb5p is not supported. Use sec=krb5,seal instead\n");
 		return 1;
 	case Opt_sec_krb5i:
 		ctx->sign = true;
@@ -1111,6 +1111,17 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
 	case Opt_wsize:
 		ctx->wsize = result.uint_32;
 		ctx->got_wsize = true;
+		if (ctx->wsize % PAGE_SIZE != 0) {
+			ctx->wsize = round_down(ctx->wsize, PAGE_SIZE);
+			if (ctx->wsize == 0) {
+				ctx->wsize = PAGE_SIZE;
+				cifs_dbg(VFS, "wsize too small, reset to minimum %ld\n", PAGE_SIZE);
+			} else {
+				cifs_dbg(VFS,
+					 "wsize rounded down to %d to multiple of PAGE_SIZE %ld\n",
+					 ctx->wsize, PAGE_SIZE);
+			}
+		}
 		break;
 	case Opt_acregmax:
 		ctx->acregmax = HZ * result.uint_32;
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index f0989484f2c6..d02f8ba29cb5 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -104,7 +104,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
 	fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode);
 	mtime = inode_get_mtime(inode);
 	if (timespec64_equal(&mtime, &fattr->cf_mtime) &&
-	    cifs_i->server_eof == fattr->cf_eof) {
+	    cifs_i->netfs.remote_i_size == fattr->cf_eof) {
 		cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
 			 __func__, cifs_i->uniqueid);
 		return;
@@ -194,7 +194,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	else
 		clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags);
 
-	cifs_i->server_eof = fattr->cf_eof;
+	cifs_i->netfs.remote_i_size = fattr->cf_eof;
 	/*
 	 * Can't safely change the file size here if the client is writing to
 	 * it due to potential races.
@@ -2858,7 +2858,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 
 set_size_out:
 	if (rc == 0) {
-		cifsInode->server_eof = attrs->ia_size;
+		netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true);
 		cifs_setsize(inode, attrs->ia_size);
 		/*
 		 * i_blocks is not related to (i_size / i_blksize), but instead
@@ -3011,6 +3011,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	if ((attrs->ia_valid & ATTR_SIZE) &&
 	    attrs->ia_size != i_size_read(inode)) {
 		truncate_setsize(inode, attrs->ia_size);
+		netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true);
 		fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size);
 	}
 
@@ -3210,6 +3211,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 	if ((attrs->ia_valid & ATTR_SIZE) &&
 	    attrs->ia_size != i_size_read(inode)) {
 		truncate_setsize(inode, attrs->ia_size);
+		netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true);
 		fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size);
 	}
 
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index a6968573b775..4a517b280f2b 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -168,6 +168,21 @@ static char *automount_fullpath(struct dentry *dentry, void *page)
 	return s;
 }
 
+static void fs_context_set_ids(struct smb3_fs_context *ctx)
+{
+	kuid_t uid = current_fsuid();
+	kgid_t gid = current_fsgid();
+
+	if (ctx->multiuser) {
+		if (!ctx->uid_specified)
+			ctx->linux_uid = uid;
+		if (!ctx->gid_specified)
+			ctx->linux_gid = gid;
+	}
+	if (!ctx->cruid_specified)
+		ctx->cred_uid = uid;
+}
+
 /*
  * Create a vfsmount that we can automount
  */
@@ -205,6 +220,7 @@ static struct vfsmount *cifs_do_automount(struct path *path)
 	tmp.leaf_fullpath = NULL;
 	tmp.UNC = tmp.prepath = NULL;
 	tmp.dfs_root_ses = NULL;
+	fs_context_set_ids(&tmp);
 
 	rc = smb3_fs_context_dup(ctx, &tmp);
 	if (rc) {
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 94255401b38d..b520eea7bfce 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -141,7 +141,7 @@ retry:
 					if (likely(reparse_inode_match(inode, fattr))) {
 						fattr->cf_mode = inode->i_mode;
 						fattr->cf_rdev = inode->i_rdev;
-						fattr->cf_eof = CIFS_I(inode)->server_eof;
+						fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size;
 						fattr->cf_symlink_target = NULL;
 					} else {
 						CIFS_I(inode)->time = 0;
@@ -307,14 +307,16 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
 }
 
 static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr,
-				       SEARCH_ID_FULL_DIR_INFO *info,
+				       const void *info,
 				       struct cifs_sb_info *cifs_sb)
 {
+	const FILE_FULL_DIRECTORY_INFO *di = info;
+
 	__dir_info_to_fattr(fattr, info);
 
-	/* See MS-FSCC 2.4.19 FileIdFullDirectoryInformation */
+	/* See MS-FSCC 2.4.14, 2.4.19 */
 	if (fattr->cf_cifsattrs & ATTR_REPARSE)
-		fattr->cf_cifstag = le32_to_cpu(info->EaSize);
+		fattr->cf_cifstag = le32_to_cpu(di->EaSize);
 	cifs_fill_common_info(fattr, cifs_sb);
 }
 
@@ -396,7 +398,7 @@ ffirst_retry:
 	} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
 		cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
 	} else /* not srvinos - BB fixme add check for backlevel? */ {
-		cifsFile->srch_inf.info_level = SMB_FIND_FILE_DIRECTORY_INFO;
+		cifsFile->srch_inf.info_level = SMB_FIND_FILE_FULL_DIRECTORY_INFO;
 	}
 
 	search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME;
@@ -987,10 +989,9 @@ static int cifs_filldir(char *find_entry, struct file *file,
 				       (FIND_FILE_STANDARD_INFO *)find_entry,
 				       cifs_sb);
 		break;
+	case SMB_FIND_FILE_FULL_DIRECTORY_INFO:
 	case SMB_FIND_FILE_ID_FULL_DIR_INFO:
-		cifs_fulldir_info_to_fattr(&fattr,
-					   (SEARCH_ID_FULL_DIR_INFO *)find_entry,
-					   cifs_sb);
+		cifs_fulldir_info_to_fattr(&fattr, find_entry, cifs_sb);
 		break;
 	default:
 		cifs_dir_info_to_fattr(&fattr,
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index cde81042bebd..8f37373fd333 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -75,6 +75,10 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,
 {
 	unsigned int i;
 
+	/* if the channel is waiting for termination */
+	if (server && server->terminate)
+		return CIFS_INVAL_CHAN_INDEX;
+
 	for (i = 0; i < ses->chan_count; i++) {
 		if (ses->chans[i].server == server)
 			return i;
@@ -84,7 +88,6 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,
 	if (server)
 		cifs_dbg(VFS, "unable to get chan index for server: 0x%llx",
 			 server->conn_id);
-	WARN_ON(1);
 	return CIFS_INVAL_CHAN_INDEX;
 }
 
@@ -269,6 +272,8 @@ int cifs_try_adding_channels(struct cifs_ses *ses)
 					 &iface->sockaddr,
 					 rc);
 				kref_put(&iface->refcount, release_iface);
+				/* failure to add chan should increase weight */
+				iface->weight_fulfilled++;
 				continue;
 			}
 
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index a652200540c8..05818cd6d932 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -120,6 +120,14 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 	unsigned int size[2];
 	void *data[2];
 	int len;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	oplock = SMB2_OPLOCK_LEVEL_NONE;
+	num_rqst = 0;
+	server = cifs_pick_channel(ses);
 
 	vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
 	if (vars == NULL)
@@ -127,8 +135,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 	rqst = &vars->rqst[0];
 	rsp_iov = &vars->rsp_iov[0];
 
-	server = cifs_pick_channel(ses);
-
 	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
@@ -463,15 +469,24 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 	num_rqst++;
 
 	if (cfile) {
+		if (retries)
+			for (i = 1; i < num_rqst - 2; i++)
+				smb2_set_replay(server, &rqst[i]);
+
 		rc = compound_send_recv(xid, ses, server,
 					flags, num_rqst - 2,
 					&rqst[1], &resp_buftype[1],
 					&rsp_iov[1]);
-	} else
+	} else {
+		if (retries)
+			for (i = 0; i < num_rqst; i++)
+				smb2_set_replay(server, &rqst[i]);
+
 		rc = compound_send_recv(xid, ses, server,
 					flags, num_rqst,
 					rqst, resp_buftype,
 					rsp_iov);
+	}
 
 finished:
 	num_rqst = 0;
@@ -620,9 +635,6 @@ finished:
 	}
 	SMB2_close_free(&rqst[num_rqst]);
 
-	if (cfile)
-		cifsFileInfo_put(cfile);
-
 	num_cmds += 2;
 	if (out_iov && out_buftype) {
 		memcpy(out_iov, rsp_iov, num_cmds * sizeof(*out_iov));
@@ -632,7 +644,16 @@ finished:
 		for (i = 0; i < num_cmds; i++)
 			free_rsp_buf(resp_buftype[i], rsp_iov[i].iov_base);
 	}
+	num_cmds -= 2; /* correct num_cmds as there could be a retry */
 	kfree(vars);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
+	if (cfile)
+		cifsFileInfo_put(cfile);
+
 	return rc;
 }
 
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index d9553c2556a2..4695433fcf39 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -619,7 +619,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
 		goto out;
 	}
 
-	while (bytes_left >= sizeof(*p)) {
+	while (bytes_left >= (ssize_t)sizeof(*p)) {
 		memset(&tmp_iface, 0, sizeof(tmp_iface));
 		tmp_iface.speed = le64_to_cpu(p->LinkSpeed);
 		tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
@@ -1108,7 +1108,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
 {
 	struct smb2_compound_vars *vars;
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
 	struct smb_rqst *rqst;
 	struct kvec *rsp_iov;
 	__le16 *utf16_path = NULL;
@@ -1124,6 +1124,13 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
 	struct smb2_file_full_ea_info *ea = NULL;
 	struct smb2_query_info_rsp *rsp;
 	int rc, used_len = 0;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = CIFS_CP_CREATE_CLOSE_OP;
+	oplock = SMB2_OPLOCK_LEVEL_NONE;
+	server = cifs_pick_channel(ses);
 
 	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
@@ -1197,6 +1204,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
 		.disposition = FILE_OPEN,
 		.create_options = cifs_create_options(cifs_sb, 0),
 		.fid = &fid,
+		.replay = !!(retries),
 	};
 
 	rc = SMB2_open_init(tcon, server,
@@ -1244,6 +1252,12 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
 		goto sea_exit;
 	smb2_set_related(&rqst[2]);
 
+	if (retries) {
+		smb2_set_replay(server, &rqst[0]);
+		smb2_set_replay(server, &rqst[1]);
+		smb2_set_replay(server, &rqst[2]);
+	}
+
 	rc = compound_send_recv(xid, ses, server,
 				flags, 3, rqst,
 				resp_buftype, rsp_iov);
@@ -1260,6 +1274,11 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
 	kfree(vars);
 out_free_path:
 	kfree(utf16_path);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 #endif
@@ -1484,7 +1503,7 @@ smb2_ioctl_query_info(const unsigned int xid,
 	struct smb_rqst *rqst;
 	struct kvec *rsp_iov;
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
 	char __user *arg = (char __user *)p;
 	struct smb_query_info qi;
 	struct smb_query_info __user *pqi;
@@ -1501,6 +1520,13 @@ smb2_ioctl_query_info(const unsigned int xid,
 	void *data[2];
 	int create_options = is_dir ? CREATE_NOT_FILE : CREATE_NOT_DIR;
 	void (*free_req1_func)(struct smb_rqst *r);
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = CIFS_CP_CREATE_CLOSE_OP;
+	oplock = SMB2_OPLOCK_LEVEL_NONE;
+	server = cifs_pick_channel(ses);
 
 	vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
 	if (vars == NULL)
@@ -1544,6 +1570,7 @@ smb2_ioctl_query_info(const unsigned int xid,
 		.disposition = FILE_OPEN,
 		.create_options = cifs_create_options(cifs_sb, create_options),
 		.fid = &fid,
+		.replay = !!(retries),
 	};
 
 	if (qi.flags & PASSTHRU_FSCTL) {
@@ -1641,6 +1668,12 @@ smb2_ioctl_query_info(const unsigned int xid,
 		goto free_req_1;
 	smb2_set_related(&rqst[2]);
 
+	if (retries) {
+		smb2_set_replay(server, &rqst[0]);
+		smb2_set_replay(server, &rqst[1]);
+		smb2_set_replay(server, &rqst[2]);
+	}
+
 	rc = compound_send_recv(xid, ses, server,
 				flags, 3, rqst,
 				resp_buftype, rsp_iov);
@@ -1701,6 +1734,11 @@ free_output_buffer:
 	kfree(buffer);
 free_vars:
 	kfree(vars);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -2227,8 +2265,14 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 	struct cifs_open_parms oparms;
 	struct smb2_query_directory_rsp *qd_rsp = NULL;
 	struct smb2_create_rsp *op_rsp = NULL;
-	struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
-	int retry_count = 0;
+	struct TCP_Server_Info *server;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	oplock = SMB2_OPLOCK_LEVEL_NONE;
+	server = cifs_pick_channel(tcon->ses);
 
 	utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
 	if (!utf16_path)
@@ -2253,6 +2297,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 		.disposition = FILE_OPEN,
 		.create_options = cifs_create_options(cifs_sb, 0),
 		.fid = fid,
+		.replay = !!(retries),
 	};
 
 	rc = SMB2_open_init(tcon, server,
@@ -2278,14 +2323,15 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 
 	smb2_set_related(&rqst[1]);
 
-again:
+	if (retries) {
+		smb2_set_replay(server, &rqst[0]);
+		smb2_set_replay(server, &rqst[1]);
+	}
+
 	rc = compound_send_recv(xid, tcon->ses, server,
 				flags, 2, rqst,
 				resp_buftype, rsp_iov);
 
-	if (rc == -EAGAIN && retry_count++ < 10)
-		goto again;
-
 	/* If the open failed there is nothing to do */
 	op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
 	if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) {
@@ -2333,6 +2379,11 @@ again:
 	SMB2_query_directory_free(&rqst[1]);
 	free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
 	free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -2458,6 +2509,22 @@ smb2_oplock_response(struct cifs_tcon *tcon, __u64 persistent_fid,
 }
 
 void
+smb2_set_replay(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+{
+	struct smb2_hdr *shdr;
+
+	if (server->dialect < SMB30_PROT_ID)	/* flag is only set for SMB 3.0 and later dialects */
+		return;
+
+	shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base);
+	if (shdr == NULL) {
+		cifs_dbg(FYI, "shdr NULL in smb2_set_replay\n");
+		return;
+	}
+	shdr->Flags |= SMB2_FLAGS_REPLAY_OPERATION;	/* mark the request header as a replay */
+}
+
+void
 smb2_set_related(struct smb_rqst *rqst)
 {
 	struct smb2_hdr *shdr;
@@ -2530,6 +2597,27 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
 }
 
 /*
+ * Sleep with capped exponential backoff and report whether the caller should replay
+ */
+bool smb2_should_replay(struct cifs_tcon *tcon,
+				int *pretries,
+				int *pcur_sleep)
+{
+	if (!pretries || !pcur_sleep)
+		return false;
+	/* NOTE(review): || short-circuits, so *pretries is not bumped when tcon->retry is set */
+	if (tcon->retry || (*pretries)++ < tcon->ses->server->retrans) {
+		msleep(*pcur_sleep);
+		(*pcur_sleep) = ((*pcur_sleep) << 1);	/* double the delay for the next attempt */
+		if ((*pcur_sleep) > CIFS_MAX_SLEEP)
+			(*pcur_sleep) = CIFS_MAX_SLEEP;
+		return true;
+	}
+
+	return false;
+}
+
+/*
  * Passes the query info response back to the caller on success.
  * Caller need to free this with free_rsp_buf().
  */
@@ -2542,7 +2630,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
 {
 	struct smb2_compound_vars *vars;
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
 	int flags = CIFS_CP_CREATE_CLOSE_OP;
 	struct smb_rqst *rqst;
 	int resp_buftype[3];
@@ -2553,6 +2641,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc;
 	__le16 *utf16_path;
 	struct cached_fid *cfid = NULL;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = CIFS_CP_CREATE_CLOSE_OP;
+	oplock = SMB2_OPLOCK_LEVEL_NONE;
+	server = cifs_pick_channel(ses);
 
 	if (!path)
 		path = "";
@@ -2589,6 +2684,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
 		.disposition = FILE_OPEN,
 		.create_options = cifs_create_options(cifs_sb, 0),
 		.fid = &fid,
+		.replay = !!(retries),
 	};
 
 	rc = SMB2_open_init(tcon, server,
@@ -2633,6 +2729,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
 		goto qic_exit;
 	smb2_set_related(&rqst[2]);
 
+	if (retries) {
+		if (!cfid) {
+			smb2_set_replay(server, &rqst[0]);
+			smb2_set_replay(server, &rqst[2]);
+		}
+		smb2_set_replay(server, &rqst[1]);
+	}
+
 	if (cfid) {
 		rc = compound_send_recv(xid, ses, server,
 					flags, 1, &rqst[1],
@@ -2665,6 +2769,11 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
 	kfree(vars);
 out_free_path:
 	kfree(utf16_path);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -3213,6 +3322,9 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
 				  cfile->fid.volatile_fid, cfile->pid, new_size);
 		if (rc >= 0) {
 			truncate_setsize(inode, new_size);
+			netfs_resize_file(&cifsi->netfs, new_size, true);
+			if (offset < cifsi->netfs.zero_point)
+				cifsi->netfs.zero_point = offset;
 			fscache_resize_cookie(cifs_inode_cookie(inode), new_size);
 		}
 	}
@@ -3436,7 +3548,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
 		rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
 				  cfile->fid.volatile_fid, cfile->pid, new_eof);
 		if (rc == 0) {
-			cifsi->server_eof = new_eof;
+			netfs_resize_file(&cifsi->netfs, new_eof, true);
 			cifs_setsize(inode, new_eof);
 			cifs_truncate_page(inode->i_mapping, inode->i_size);
 			truncate_setsize(inode, new_eof);
@@ -3528,8 +3640,9 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
 	int rc;
 	unsigned int xid;
 	struct inode *inode = file_inode(file);
-	struct cifsFileInfo *cfile = file->private_data;
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct cifsFileInfo *cfile = file->private_data;
+	struct netfs_inode *ictx = &cifsi->netfs;
 	loff_t old_eof, new_eof;
 
 	xid = get_xid();
@@ -3549,6 +3662,7 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
 		goto out_2;
 
 	truncate_pagecache_range(inode, off, old_eof);
+	ictx->zero_point = old_eof;
 
 	rc = smb2_copychunk_range(xid, cfile, cfile, off + len,
 				  old_eof - off - len, off);
@@ -3563,9 +3677,10 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
 
 	rc = 0;
 
-	cifsi->server_eof = i_size_read(inode) - len;
-	truncate_setsize(inode, cifsi->server_eof);
-	fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof);
+	truncate_setsize(inode, new_eof);
+	netfs_resize_file(&cifsi->netfs, new_eof, true);
+	ictx->zero_point = new_eof;
+	fscache_resize_cookie(cifs_inode_cookie(inode), new_eof);
 out_2:
 	filemap_invalidate_unlock(inode->i_mapping);
  out:
@@ -3581,6 +3696,7 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
 	unsigned int xid;
 	struct cifsFileInfo *cfile = file->private_data;
 	struct inode *inode = file_inode(file);
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
 	__u64 count, old_eof, new_eof;
 
 	xid = get_xid();
@@ -3608,6 +3724,7 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
 		goto out_2;
 
 	truncate_setsize(inode, new_eof);
+	netfs_resize_file(&cifsi->netfs, i_size_read(inode), true);
 	fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode));
 
 	rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len);
@@ -5100,7 +5217,7 @@ static int smb2_create_reparse_symlink(const unsigned int xid,
 	struct inode *new;
 	struct kvec iov;
 	__le16 *path;
-	char *sym;
+	char *sym, sep = CIFS_DIR_SEP(cifs_sb);
 	u16 len, plen;
 	int rc = 0;
 
@@ -5114,7 +5231,8 @@ static int smb2_create_reparse_symlink(const unsigned int xid,
 		.symlink_target = sym,
 	};
 
-	path = cifs_convert_path_to_utf16(symname, cifs_sb);
+	convert_delimiter(sym, sep);
+	path = cifs_convert_path_to_utf16(sym, cifs_sb);
 	if (!path) {
 		rc = -ENOMEM;
 		goto out;
@@ -5137,7 +5255,10 @@ static int smb2_create_reparse_symlink(const unsigned int xid,
 	buf->PrintNameLength = cpu_to_le16(plen);
 	memcpy(buf->PathBuffer, path, plen);
 	buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0);
+	if (*sym != sep)
+		buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE);
 
+	convert_delimiter(sym, '/');
 	iov.iov_base = buf;
 	iov.iov_len = len;
 	new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 288199f0b987..608ee05491e2 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -178,6 +178,7 @@ cifs_chan_skip_or_disable(struct cifs_ses *ses,
 		}
 
 		ses->chans[chan_index].server = NULL;
+		server->terminate = true;
 		spin_unlock(&ses->chan_lock);
 
 		/*
@@ -188,14 +189,12 @@ cifs_chan_skip_or_disable(struct cifs_ses *ses,
 		 */
 		cifs_put_tcp_session(server, from_reconnect);
 
-		server->terminate = true;
 		cifs_signal_cifsd_for_reconnect(server, false);
 
 		/* mark primary server as needing reconnect */
 		pserver = server->primary_server;
 		cifs_signal_cifsd_for_reconnect(pserver, false);
 skip_terminate:
-		mutex_unlock(&ses->session_mutex);
 		return -EHOSTDOWN;
 	}
 
@@ -400,6 +399,15 @@ skip_sess_setup:
 		goto out;
 	}
 
+	spin_lock(&ses->ses_lock);
+	if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) {
+		spin_unlock(&ses->ses_lock);
+		mutex_unlock(&ses->session_mutex);
+		goto skip_add_channels;
+	}
+	ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS;
+	spin_unlock(&ses->ses_lock);
+
 	if (!rc &&
 	    (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
 		mutex_unlock(&ses->session_mutex);
@@ -411,7 +419,7 @@ skip_sess_setup:
 		rc = SMB3_request_interfaces(xid, tcon, false);
 		free_xid(xid);
 
-		if (rc == -EOPNOTSUPP) {
+		if (rc == -EOPNOTSUPP && ses->chan_count > 1) {
 			/*
 			 * some servers like Azure SMB server do not advertise
 			 * that multichannel has been disabled with server
@@ -429,17 +437,22 @@ skip_sess_setup:
 		if (ses->chan_max > ses->chan_count &&
 		    ses->iface_count &&
 		    !SERVER_IS_CHAN(server)) {
-			if (ses->chan_count == 1)
+			if (ses->chan_count == 1) {
 				cifs_server_dbg(VFS, "supports multichannel now\n");
+				queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+						 (SMB_INTERFACE_POLL_INTERVAL * HZ));
+			}
 
 			cifs_try_adding_channels(ses);
-			queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
-					   (SMB_INTERFACE_POLL_INTERVAL * HZ));
 		}
 	} else {
 		mutex_unlock(&ses->session_mutex);
 	}
+
 skip_add_channels:
+	spin_lock(&ses->ses_lock);
+	ses->flags &= ~CIFS_SES_FLAG_SCALE_CHANNELS;
+	spin_unlock(&ses->ses_lock);
 
 	if (smb2_command != SMB2_INTERNAL_CMD)
 		mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
@@ -2391,8 +2404,13 @@ create_durable_v2_buf(struct cifs_open_parms *oparms)
 	 */
 	buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout);
 	buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
-	generate_random_uuid(buf->dcontext.CreateGuid);
-	memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
+
+	/* for replay, we should not overwrite the existing create guid */
+	if (!oparms->replay) {
+		generate_random_uuid(buf->dcontext.CreateGuid);
+		memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
+	} else
+		memcpy(buf->dcontext.CreateGuid, pfid->create_guid, 16);
 
 	/* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */
 	buf->Name[0] = 'D';
@@ -2765,7 +2783,14 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
 	int flags = 0;
 	unsigned int total_len;
 	__le16 *utf16_path = NULL;
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	n_iov = 2;
+	server = cifs_pick_channel(ses);
 
 	cifs_dbg(FYI, "mkdir\n");
 
@@ -2869,6 +2894,10 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
 	/* no need to inc num_remote_opens because we close it just below */
 	trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, full_path, CREATE_NOT_FILE,
 				    FILE_WRITE_ATTRIBUTES);
+
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	/* resource #4: response buffer */
 	rc = cifs_send_recv(xid, ses, server,
 			    &rqst, &resp_buftype, flags, &rsp_iov);
@@ -2906,6 +2935,11 @@ err_free_req:
 	cifs_small_buf_release(req);
 err_free_path:
 	kfree(utf16_path);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -3101,12 +3135,19 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 	struct smb2_create_rsp *rsp = NULL;
 	struct cifs_tcon *tcon = oparms->tcon;
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
 	struct kvec iov[SMB2_CREATE_IOV_SIZE];
 	struct kvec rsp_iov = {NULL, 0};
 	int resp_buftype = CIFS_NO_BUFFER;
 	int rc = 0;
 	int flags = 0;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	server = cifs_pick_channel(ses);
+	oparms->replay = !!(retries);
 
 	cifs_dbg(FYI, "create/open\n");
 	if (!ses || !server)
@@ -3128,6 +3169,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 	trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid, oparms->path,
 		oparms->create_options, oparms->desired_access);
 
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	rc = cifs_send_recv(xid, ses, server,
 			    &rqst, &resp_buftype, flags,
 			    &rsp_iov);
@@ -3181,6 +3225,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 creat_exit:
 	SMB2_open_free(&rqst);
 	free_rsp_buf(resp_buftype, rsp);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -3305,15 +3354,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	int resp_buftype = CIFS_NO_BUFFER;
 	int rc = 0;
 	int flags = 0;
-
-	cifs_dbg(FYI, "SMB2 IOCTL\n");
-
-	if (out_data != NULL)
-		*out_data = NULL;
-
-	/* zero out returned data len, in case of error */
-	if (plen)
-		*plen = 0;
+	int retries = 0, cur_sleep = 1;
 
 	if (!tcon)
 		return -EIO;
@@ -3322,10 +3363,23 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	if (!ses)
 		return -EIO;
 
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
 	server = cifs_pick_channel(ses);
+
 	if (!server)
 		return -EIO;
 
+	cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+	if (out_data != NULL)
+		*out_data = NULL;
+
+	/* zero out returned data len, in case of error */
+	if (plen)
+		*plen = 0;
+
 	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
@@ -3340,6 +3394,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	if (rc)
 		goto ioctl_exit;
 
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	rc = cifs_send_recv(xid, ses, server,
 			    &rqst, &resp_buftype, flags,
 			    &rsp_iov);
@@ -3409,6 +3466,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 ioctl_exit:
 	SMB2_ioctl_free(&rqst);
 	free_rsp_buf(resp_buftype, rsp);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -3480,13 +3542,20 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	struct smb_rqst rqst;
 	struct smb2_close_rsp *rsp = NULL;
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
 	struct kvec iov[1];
 	struct kvec rsp_iov;
 	int resp_buftype = CIFS_NO_BUFFER;
 	int rc = 0;
 	int flags = 0;
 	bool query_attrs = false;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	query_attrs = false;
+	server = cifs_pick_channel(ses);
 
 	cifs_dbg(FYI, "Close\n");
 
@@ -3512,6 +3581,9 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		goto close_exit;
 
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	rc = cifs_send_recv(xid, ses, server,
 			    &rqst, &resp_buftype, flags, &rsp_iov);
 	rsp = (struct smb2_close_rsp *)rsp_iov.iov_base;
@@ -3545,6 +3617,11 @@ close_exit:
 			cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n",
 				 persistent_fid, tmp_rc);
 	}
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -3675,12 +3752,19 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
 	struct TCP_Server_Info *server;
 	int flags = 0;
 	bool allocated = false;
+	int retries = 0, cur_sleep = 1;
 
 	cifs_dbg(FYI, "Query Info\n");
 
 	if (!ses)
 		return -EIO;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	allocated = false;
 	server = cifs_pick_channel(ses);
+
 	if (!server)
 		return -EIO;
 
@@ -3702,6 +3786,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
 	trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid,
 				    ses->Suid, info_class, (__u32)info_type);
 
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	rc = cifs_send_recv(xid, ses, server,
 			    &rqst, &resp_buftype, flags, &rsp_iov);
 	rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
@@ -3744,6 +3831,11 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
 qinf_exit:
 	SMB2_query_info_free(&rqst);
 	free_rsp_buf(resp_buftype, rsp);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -3844,7 +3936,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
 		u32 *plen /* returned data len */)
 {
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
 	struct smb_rqst rqst;
 	struct smb2_change_notify_rsp *smb_rsp;
 	struct kvec iov[1];
@@ -3852,6 +3944,12 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
 	int resp_buftype = CIFS_NO_BUFFER;
 	int flags = 0;
 	int rc = 0;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	server = cifs_pick_channel(ses);
 
 	cifs_dbg(FYI, "change notify\n");
 	if (!ses || !server)
@@ -3876,6 +3974,10 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
 
 	trace_smb3_notify_enter(xid, persistent_fid, tcon->tid, ses->Suid,
 				(u8)watch_tree, completion_filter);
+
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	rc = cifs_send_recv(xid, ses, server,
 			    &rqst, &resp_buftype, flags, &rsp_iov);
 
@@ -3910,6 +4012,11 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rqst.rq_iov)
 		cifs_small_buf_release(rqst.rq_iov[0].iov_base); /* request */
 	free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -4152,10 +4259,16 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	struct smb_rqst rqst;
 	struct kvec iov[1];
 	struct kvec rsp_iov = {NULL, 0};
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
 	int resp_buftype = CIFS_NO_BUFFER;
 	int flags = 0;
 	int rc = 0;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	server = cifs_pick_channel(ses);
 
 	cifs_dbg(FYI, "flush\n");
 	if (!ses || !(ses->server))
@@ -4175,6 +4288,10 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 		goto flush_exit;
 
 	trace_smb3_flush_enter(xid, persistent_fid, tcon->tid, ses->Suid);
+
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	rc = cifs_send_recv(xid, ses, server,
 			    &rqst, &resp_buftype, flags, &rsp_iov);
 
@@ -4189,6 +4306,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
  flush_exit:
 	SMB2_flush_free(&rqst);
 	free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -4668,7 +4790,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
 	struct cifs_io_parms *io_parms = NULL;
 	int credit_request;
 
-	if (!wdata->server)
+	if (!wdata->server || wdata->replay)
 		server = wdata->server = cifs_pick_channel(tcon->ses);
 
 	/*
@@ -4753,6 +4875,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
 	rqst.rq_nvec = 1;
 	rqst.rq_iter = wdata->iter;
 	rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter);
+	if (wdata->replay)
+		smb2_set_replay(server, &rqst);
 #ifdef CONFIG_CIFS_SMB_DIRECT
 	if (wdata->mr)
 		iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1);
@@ -4826,18 +4950,21 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
 	int flags = 0;
 	unsigned int total_len;
 	struct TCP_Server_Info *server;
+	int retries = 0, cur_sleep = 1;
 
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
 	*nbytes = 0;
-
-	if (n_vec < 1)
-		return rc;
-
 	if (!io_parms->server)
 		io_parms->server = cifs_pick_channel(io_parms->tcon->ses);
 	server = io_parms->server;
 	if (server == NULL)
 		return -ECONNABORTED;
 
+	if (n_vec < 1)
+		return rc;
+
 	rc = smb2_plain_req_init(SMB2_WRITE, io_parms->tcon, server,
 				 (void **) &req, &total_len);
 	if (rc)
@@ -4871,6 +4998,9 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
 	rqst.rq_iov = iov;
 	rqst.rq_nvec = n_vec + 1;
 
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	rc = cifs_send_recv(xid, io_parms->tcon->ses, server,
 			    &rqst,
 			    &resp_buftype, flags, &rsp_iov);
@@ -4895,6 +5025,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
 
 	cifs_small_buf_release(req);
 	free_rsp_buf(resp_buftype, rsp);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(io_parms->tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -5077,6 +5212,9 @@ int SMB2_query_directory_init(const unsigned int xid,
 	case SMB_FIND_FILE_POSIX_INFO:
 		req->FileInformationClass = SMB_FIND_FILE_POSIX_INFO;
 		break;
+	case SMB_FIND_FILE_FULL_DIRECTORY_INFO:
+		req->FileInformationClass = FILE_FULL_DIRECTORY_INFORMATION;
+		break;
 	default:
 		cifs_tcon_dbg(VFS, "info level %u isn't supported\n",
 			info_level);
@@ -5146,6 +5284,9 @@ smb2_parse_query_directory(struct cifs_tcon *tcon,
 		/* note that posix payload are variable size */
 		info_buf_size = sizeof(struct smb2_posix_info);
 		break;
+	case SMB_FIND_FILE_FULL_DIRECTORY_INFO:
+		info_buf_size = sizeof(FILE_FULL_DIRECTORY_INFO);
+		break;
 	default:
 		cifs_tcon_dbg(VFS, "info level %u isn't supported\n",
 			 srch_inf->info_level);
@@ -5206,8 +5347,14 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 	struct kvec rsp_iov;
 	int rc = 0;
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
 	int flags = 0;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	server = cifs_pick_channel(ses);
 
 	if (!ses || !(ses->server))
 		return -EIO;
@@ -5227,6 +5374,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		goto qdir_exit;
 
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	rc = cifs_send_recv(xid, ses, server,
 			    &rqst, &resp_buftype, flags, &rsp_iov);
 	rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base;
@@ -5261,6 +5411,11 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 qdir_exit:
 	SMB2_query_directory_free(&rqst);
 	free_rsp_buf(resp_buftype, rsp);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -5327,8 +5482,14 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc = 0;
 	int resp_buftype;
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
 	int flags = 0;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	server = cifs_pick_channel(ses);
 
 	if (!ses || !server)
 		return -EIO;
@@ -5356,6 +5517,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
 		return rc;
 	}
 
+	if (retries)
+		smb2_set_replay(server, &rqst);
 
 	rc = cifs_send_recv(xid, ses, server,
 			    &rqst, &resp_buftype, flags,
@@ -5371,6 +5534,11 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
 
 	free_rsp_buf(resp_buftype, rsp);
 	kfree(iov);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -5423,12 +5591,18 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc;
 	struct smb2_oplock_break *req = NULL;
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
 	int flags = CIFS_OBREAK_OP;
 	unsigned int total_len;
 	struct kvec iov[1];
 	struct kvec rsp_iov;
 	int resp_buf_type;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = CIFS_OBREAK_OP;
+	server = cifs_pick_channel(ses);
 
 	cifs_dbg(FYI, "SMB2_oplock_break\n");
 	rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, server,
@@ -5453,15 +5627,21 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
 	rqst.rq_iov = iov;
 	rqst.rq_nvec = 1;
 
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	rc = cifs_send_recv(xid, ses, server,
 			    &rqst, &resp_buf_type, flags, &rsp_iov);
 	cifs_small_buf_release(req);
-
 	if (rc) {
 		cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
 		cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc);
 	}
 
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -5547,9 +5727,15 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc = 0;
 	int resp_buftype;
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
 	FILE_SYSTEM_POSIX_INFO *info = NULL;
 	int flags = 0;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	server = cifs_pick_channel(ses);
 
 	rc = build_qfs_info_req(&iov, tcon, server,
 				FS_POSIX_INFORMATION,
@@ -5565,6 +5751,9 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
 	rqst.rq_iov = &iov;
 	rqst.rq_nvec = 1;
 
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	rc = cifs_send_recv(xid, ses, server,
 			    &rqst, &resp_buftype, flags, &rsp_iov);
 	free_qfs_info_req(&iov);
@@ -5584,6 +5773,11 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
 
 posix_qfsinf_exit:
 	free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -5598,9 +5792,15 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc = 0;
 	int resp_buftype;
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
 	struct smb2_fs_full_size_info *info = NULL;
 	int flags = 0;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	server = cifs_pick_channel(ses);
 
 	rc = build_qfs_info_req(&iov, tcon, server,
 				FS_FULL_SIZE_INFORMATION,
@@ -5616,6 +5816,9 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 	rqst.rq_iov = &iov;
 	rqst.rq_nvec = 1;
 
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	rc = cifs_send_recv(xid, ses, server,
 			    &rqst, &resp_buftype, flags, &rsp_iov);
 	free_qfs_info_req(&iov);
@@ -5635,6 +5838,11 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 
 qfsinf_exit:
 	free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -5649,9 +5857,15 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc = 0;
 	int resp_buftype, max_len, min_len;
 	struct cifs_ses *ses = tcon->ses;
-	struct TCP_Server_Info *server = cifs_pick_channel(ses);
+	struct TCP_Server_Info *server;
 	unsigned int rsp_len, offset;
 	int flags = 0;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = 0;
+	server = cifs_pick_channel(ses);
 
 	if (level == FS_DEVICE_INFORMATION) {
 		max_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
@@ -5683,6 +5897,9 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
 	rqst.rq_iov = &iov;
 	rqst.rq_nvec = 1;
 
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	rc = cifs_send_recv(xid, ses, server,
 			    &rqst, &resp_buftype, flags, &rsp_iov);
 	free_qfs_info_req(&iov);
@@ -5720,6 +5937,11 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
 
 qfsattr_exit:
 	free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
@@ -5737,7 +5959,13 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 	unsigned int count;
 	int flags = CIFS_NO_RSP_BUF;
 	unsigned int total_len;
-	struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
+	struct TCP_Server_Info *server;
+	int retries = 0, cur_sleep = 1;
+
+replay_again:
+	/* reinitialize for possible replay */
+	flags = CIFS_NO_RSP_BUF;
+	server = cifs_pick_channel(tcon->ses);
 
 	cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock);
 
@@ -5768,6 +5996,9 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 	rqst.rq_iov = iov;
 	rqst.rq_nvec = 2;
 
+	if (retries)
+		smb2_set_replay(server, &rqst);
+
 	rc = cifs_send_recv(xid, tcon->ses, server,
 			    &rqst, &resp_buf_type, flags,
 			    &rsp_iov);
@@ -5779,6 +6010,10 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 				    tcon->ses->Suid, rc);
 	}
 
+	if (is_replayable_error(rc) &&
+	    smb2_should_replay(tcon, &retries, &cur_sleep))
+		goto replay_again;
+
 	return rc;
 }
 
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 0034b537b0b3..b3069911e9dd 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -122,6 +122,11 @@ extern unsigned long smb_rqst_len(struct TCP_Server_Info *server,
 extern void smb2_set_next_command(struct cifs_tcon *tcon,
 				  struct smb_rqst *rqst);
 extern void smb2_set_related(struct smb_rqst *rqst);
+extern void smb2_set_replay(struct TCP_Server_Info *server,
+			    struct smb_rqst *rqst);
+extern bool smb2_should_replay(struct cifs_tcon *tcon,
+			  int *pretries,
+			  int *pcur_sleep);
 
 /*
  * SMB2 Worker functions - most of protocol specific implementation details
diff --git a/fs/smb/client/smbencrypt.c b/fs/smb/client/smbencrypt.c
index f0ce26414f17..1d1ee9f18f37 100644
--- a/fs/smb/client/smbencrypt.c
+++ b/fs/smb/client/smbencrypt.c
@@ -26,13 +26,6 @@
 #include "cifsproto.h"
 #include "../common/md4.h"
 
-#ifndef false
-#define false 0
-#endif
-#ifndef true
-#define true 1
-#endif
-
 /* following came from the other byteorder.h to avoid include conflicts */
 #define CVAL(buf,pos) (((unsigned char *)(buf))[pos])
 #define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 4f717ad7c21b..994d70193432 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -400,10 +400,17 @@ unmask:
 						  server->conn_id, server->hostname);
 	}
 smbd_done:
-	if (rc < 0 && rc != -EINTR)
+	/*
+	 * there's hardly any use for the layers above to know the
+	 * actual error code here. All they should do at this point is
+	 * to retry the connection and hope it goes away.
+	 */
+	if (rc < 0 && rc != -EINTR && rc != -EAGAIN) {
 		cifs_server_dbg(VFS, "Error %d sending data on socket to server\n",
 			 rc);
-	else if (rc > 0)
+		rc = -ECONNABORTED;
+		cifs_signal_cifsd_for_reconnect(server, false);
+	} else if (rc > 0)
 		rc = 0;
 out:
 	cifs_in_send_dec(server);
@@ -428,8 +435,8 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
 	if (!(flags & CIFS_TRANSFORM_REQ))
 		return __smb_send_rqst(server, num_rqst, rqst);
 
-	if (num_rqst > MAX_COMPOUND - 1)
-		return -ENOMEM;
+	if (WARN_ON_ONCE(num_rqst > MAX_COMPOUND - 1))
+		return -EIO;
 
 	if (!server->ops->init_transform_rq) {
 		cifs_server_dbg(VFS, "Encryption requested but transform callback is missing\n");
@@ -1026,6 +1033,9 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
 		if (!server || server->terminate)
 			continue;
 
+		if (CIFS_CHAN_NEEDS_RECONNECT(ses, i))
+			continue;
+
 		/*
 		 * strictly speaking, we should pick up req_lock to read
 		 * server->in_flight. But it shouldn't matter much here if we
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index b7521e41402e..0ebf91ffa236 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -304,7 +304,8 @@ enum ksmbd_event {
 	KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST,
 	KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE	= 15,
 
-	KSMBD_EVENT_MAX
+	__KSMBD_EVENT_MAX,
+	KSMBD_EVENT_MAX = __KSMBD_EVENT_MAX - 1
 };
 
 /*
diff --git a/fs/smb/server/misc.c b/fs/smb/server/misc.c
index 9e8afaa686e3..1a5faa6f6e7b 100644
--- a/fs/smb/server/misc.c
+++ b/fs/smb/server/misc.c
@@ -261,6 +261,7 @@ out_ascii:
 
 /**
  * ksmbd_extract_sharename() - get share name from tree connect request
+ * @um: pointer to a unicode_map structure for character encoding handling
  * @treename:	buffer containing tree name and share name
  *
  * Return:      share name on success, otherwise error
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index bec0a846a8d5..089527a8b4ff 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -6173,8 +6173,10 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work)
 		err = ksmbd_iov_pin_rsp_read(work, (void *)rsp,
 					     offsetof(struct smb2_read_rsp, Buffer),
 					     aux_payload_buf, nbytes);
-		if (err)
+		if (err) {
+			kvfree(aux_payload_buf);
 			goto out;
+		}
 		kvfree(rpc_resp);
 	} else {
 		err = ksmbd_iov_pin_rsp(work, (void *)rsp,
@@ -6384,8 +6386,10 @@ int smb2_read(struct ksmbd_work *work)
 	err = ksmbd_iov_pin_rsp_read(work, (void *)rsp,
 				     offsetof(struct smb2_read_rsp, Buffer),
 				     aux_payload_buf, nbytes);
-	if (err)
+	if (err) {
+		kvfree(aux_payload_buf);
 		goto out;
+	}
 	ksmbd_fd_put(work, fp);
 	return 0;
 
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index b49d47bdafc9..f29bb03f0dc4 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -74,7 +74,7 @@ static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info)
 static int handle_generic_event(struct sk_buff *skb, struct genl_info *info);
 static int ksmbd_ipc_heartbeat_request(void);
 
-static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX] = {
+static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = {
 	[KSMBD_EVENT_UNSPEC] = {
 		.len = 0,
 	},
@@ -403,7 +403,7 @@ static int handle_generic_event(struct sk_buff *skb, struct genl_info *info)
 		return -EPERM;
 #endif
 
-	if (type >= KSMBD_EVENT_MAX) {
+	if (type > KSMBD_EVENT_MAX) {
 		WARN_ON(1);
 		return -EINVAL;
 	}
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index 9d4222154dcc..002a3f0dc7c5 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -365,6 +365,7 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig,
  * @t:		TCP transport instance
  * @buf:	buffer to store read data from socket
  * @to_read:	number of bytes to read from socket
+ * @max_retries: number of retries if reading from socket fails
  *
  * Return:	on success return number of bytes read from socket,
  *		otherwise return error number
@@ -416,6 +417,7 @@ static void tcp_destroy_socket(struct socket *ksmbd_socket)
 
 /**
  * create_socket - create socket for ksmbd/0
+ * @iface:      interface to bind the created socket to
  *
  * Return:	0 on success, error number otherwise
  */
diff --git a/fs/super.c b/fs/super.c
index d35e85295489..d6efeba0d0ce 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -274,9 +274,10 @@ static void destroy_super_work(struct work_struct *work)
 {
 	struct super_block *s = container_of(work, struct super_block,
 							destroy_work);
-	int i;
-
-	for (i = 0; i < SB_FREEZE_LEVELS; i++)
+	security_sb_free(s);
+	put_user_ns(s->s_user_ns);
+	kfree(s->s_subtype);
+	for (int i = 0; i < SB_FREEZE_LEVELS; i++)
 		percpu_free_rwsem(&s->s_writers.rw_sem[i]);
 	kfree(s);
 }
@@ -296,9 +297,6 @@ static void destroy_unused_super(struct super_block *s)
 	super_unlock_excl(s);
 	list_lru_destroy(&s->s_dentry_lru);
 	list_lru_destroy(&s->s_inode_lru);
-	security_sb_free(s);
-	put_user_ns(s->s_user_ns);
-	kfree(s->s_subtype);
 	shrinker_free(s->s_shrink);
 	/* no delays needed */
 	destroy_super_work(&s->destroy_work);
@@ -409,9 +407,6 @@ static void __put_super(struct super_block *s)
 		WARN_ON(s->s_dentry_lru.node);
 		WARN_ON(s->s_inode_lru.node);
 		WARN_ON(!list_empty(&s->s_mounts));
-		security_sb_free(s);
-		put_user_ns(s->s_user_ns);
-		kfree(s->s_subtype);
 		call_rcu(&s->rcu, destroy_super_rcu);
 	}
 }
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 5a915b2e68f5..76bc2d5e75a9 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -336,7 +336,7 @@ int __init sysv_init_icache(void)
 {
 	sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
 			sizeof(struct sysv_inode_info), 0,
-			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+			SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
 			init_once);
 	if (!sysv_inode_cachep)
 		return -ENOMEM;
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 410ab2a44d2f..19bcb51a2203 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -83,9 +83,6 @@ static inline sysv_zone_t *block_end(struct buffer_head *bh)
 	return (sysv_zone_t*)((char*)bh->b_data + bh->b_size);
 }
 
-/*
- * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock)
- */
 static Indirect *get_branch(struct inode *inode,
 			    int depth,
 			    int offsets[],
@@ -105,15 +102,18 @@ static Indirect *get_branch(struct inode *inode,
 		bh = sb_bread(sb, block);
 		if (!bh)
 			goto failure;
+		read_lock(&pointers_lock);
 		if (!verify_chain(chain, p))
 			goto changed;
 		add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets);
+		read_unlock(&pointers_lock);
 		if (!p->key)
 			goto no_block;
 	}
 	return NULL;
 
 changed:
+	read_unlock(&pointers_lock);
 	brelse(bh);
 	*err = -EAGAIN;
 	goto no_block;
@@ -219,9 +219,7 @@ static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *b
 		goto out;
 
 reread:
-	read_lock(&pointers_lock);
 	partial = get_branch(inode, depth, offsets, chain, &err);
-	read_unlock(&pointers_lock);
 
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
@@ -291,9 +289,9 @@ static Indirect *find_shared(struct inode *inode,
 	*top = 0;
 	for (k = depth; k > 1 && !offsets[k-1]; k--)
 		;
+	partial = get_branch(inode, k, offsets, chain, &err);
 
 	write_lock(&pointers_lock);
-	partial = get_branch(inode, k, offsets, chain, &err);
 	if (!partial)
 		partial = chain + k-1;
 	/*
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 6795fda2af19..110e8a272189 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -34,7 +34,15 @@ static DEFINE_MUTEX(eventfs_mutex);
 
 /* Choose something "unique" ;-) */
 #define EVENTFS_FILE_INODE_INO		0x12c4e37
-#define EVENTFS_DIR_INODE_INO		0x134b2f5
+
+/* Lazily give each directory ei its own inode number (cached in ei->ino) */
+static int eventfs_dir_ino(struct eventfs_inode *ei)
+{
+	/* 0 means "not assigned yet"; never hand out EVENTFS_FILE_INODE_INO */
+	while (!ei->ino || ei->ino == EVENTFS_FILE_INODE_INO)
+		ei->ino = get_next_ino();
+	return ei->ino;
+}
 
 /*
  * The eventfs_inode (ei) itself is protected by SRCU. It is released from
@@ -54,6 +62,46 @@ enum {
 
 #define EVENTFS_MODE_MASK	(EVENTFS_SAVE_MODE - 1)
 
+/*
+ * eventfs_inode reference count management.
+ *
+ * NOTE! We count only references from dentries, in the
+ * form 'dentry->d_fsdata'. There are also references from
+ * directory inodes ('ti->private'), but the dentry reference
+ * count is always a superset of the inode reference count.
+ */
+static void release_ei(struct kref *ref)
+{
+	struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref);
+
+	WARN_ON_ONCE(!ei->is_freed);	/* warn if released without is_freed set */
+	/* owned buffers are freed here; the ei itself is handed to kfree_rcu() */
+	kfree(ei->entry_attrs);
+	kfree_const(ei->name);
+	kfree_rcu(ei, rcu);
+}
+
+static inline void put_ei(struct eventfs_inode *ei)
+{
+	if (ei)	/* NULL is tolerated so callers need no guard */
+		kref_put(&ei->kref, release_ei);
+}
+
+static inline void free_ei(struct eventfs_inode *ei)
+{
+	if (ei) {
+		ei->is_freed = 1;	/* set before the put: release_ei() warns if unset */
+		put_ei(ei);
+	}
+}
+
+static inline struct eventfs_inode *get_ei(struct eventfs_inode *ei)
+{
+	if (ei)	/* NULL passes straight through without taking a reference */
+		kref_get(&ei->kref);
+	return ei;	/* returned so the call can sit on the right of an assignment */
+}
+
 static struct dentry *eventfs_root_lookup(struct inode *dir,
 					  struct dentry *dentry,
 					  unsigned int flags);
@@ -148,33 +196,30 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
 	return ret;
 }
 
-static void update_top_events_attr(struct eventfs_inode *ei, struct dentry *dentry)
+static void update_top_events_attr(struct eventfs_inode *ei, struct super_block *sb)
 {
-	struct inode *inode;
+	struct inode *root;
 
 	/* Only update if the "events" was on the top level */
 	if (!ei || !(ei->attr.mode & EVENTFS_TOPLEVEL))
 		return;
 
 	/* Get the tracefs root inode. */
-	inode = d_inode(dentry->d_sb->s_root);
-	ei->attr.uid = inode->i_uid;
-	ei->attr.gid = inode->i_gid;
+	root = d_inode(sb->s_root);
+	ei->attr.uid = root->i_uid;
+	ei->attr.gid = root->i_gid;
 }
 
 static void set_top_events_ownership(struct inode *inode)
 {
 	struct tracefs_inode *ti = get_tracefs(inode);
 	struct eventfs_inode *ei = ti->private;
-	struct dentry *dentry;
 
 	/* The top events directory doesn't get automatically updated */
 	if (!ei || !ei->is_events || !(ei->attr.mode & EVENTFS_TOPLEVEL))
 		return;
 
-	dentry = ei->dentry;
-
-	update_top_events_attr(ei, dentry);
+	update_top_events_attr(ei, inode->i_sb);
 
 	if (!(ei->attr.mode & EVENTFS_SAVE_UID))
 		inode->i_uid = ei->attr.uid;
@@ -225,10 +270,11 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
 {
 	struct eventfs_inode *ei;
 
-	mutex_lock(&eventfs_mutex);
 	do {
-		/* The parent always has an ei, except for events itself */
-		ei = dentry->d_parent->d_fsdata;
+		// The parent is stable because we do not do renames
+		dentry = dentry->d_parent;
+		// ... and directories always have d_fsdata
+		ei = dentry->d_fsdata;
 
 		/*
 		 * If the ei is being freed, the ownership of the children
@@ -238,12 +284,10 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
 			ei = NULL;
 			break;
 		}
-
-		dentry = ei->dentry;
+		// Walk upwards until you find the events inode
 	} while (!ei->is_events);
-	mutex_unlock(&eventfs_mutex);
 
-	update_top_events_attr(ei, dentry);
+	update_top_events_attr(ei, dentry->d_sb);
 
 	return ei;
 }
@@ -273,50 +317,11 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode,
 		inode->i_gid = attr->gid;
 }
 
-static void update_gid(struct eventfs_inode *ei, kgid_t gid, int level)
-{
-	struct eventfs_inode *ei_child;
-
-	/* at most we have events/system/event */
-	if (WARN_ON_ONCE(level > 3))
-		return;
-
-	ei->attr.gid = gid;
-
-	if (ei->entry_attrs) {
-		for (int i = 0; i < ei->nr_entries; i++) {
-			ei->entry_attrs[i].gid = gid;
-		}
-	}
-
-	/*
-	 * Only eventfs_inode with dentries are updated, make sure
-	 * all eventfs_inodes are updated. If one of the children
-	 * do not have a dentry, this function must traverse it.
-	 */
-	list_for_each_entry_srcu(ei_child, &ei->children, list,
-				 srcu_read_lock_held(&eventfs_srcu)) {
-		if (!ei_child->dentry)
-			update_gid(ei_child, gid, level + 1);
-	}
-}
-
-void eventfs_update_gid(struct dentry *dentry, kgid_t gid)
-{
-	struct eventfs_inode *ei = dentry->d_fsdata;
-	int idx;
-
-	idx = srcu_read_lock(&eventfs_srcu);
-	update_gid(ei, gid, 0);
-	srcu_read_unlock(&eventfs_srcu, idx);
-}
-
 /**
- * create_file - create a file in the tracefs filesystem
- * @name: the name of the file to create.
+ * lookup_file - look up a file in the tracefs filesystem
+ * @dentry: the dentry to look up
  * @mode: the permission that the file should have.
  * @attr: saved attributes changed by user
- * @parent: parent dentry for this file.
  * @data: something that the caller will want to get to later on.
  * @fop: struct file_operations that should be used for this file.
  *
@@ -324,30 +329,25 @@ void eventfs_update_gid(struct dentry *dentry, kgid_t gid)
  * directory. The inode.i_private pointer will point to @data in the open()
  * call.
  */
-static struct dentry *create_file(const char *name, umode_t mode,
+static struct dentry *lookup_file(struct eventfs_inode *parent_ei,
+				  struct dentry *dentry,
+				  umode_t mode,
 				  struct eventfs_attr *attr,
-				  struct dentry *parent, void *data,
+				  void *data,
 				  const struct file_operations *fop)
 {
 	struct tracefs_inode *ti;
-	struct dentry *dentry;
 	struct inode *inode;
 
 	if (!(mode & S_IFMT))
 		mode |= S_IFREG;
 
 	if (WARN_ON_ONCE(!S_ISREG(mode)))
-		return NULL;
-
-	WARN_ON_ONCE(!parent);
-	dentry = eventfs_start_creating(name, parent);
-
-	if (IS_ERR(dentry))
-		return dentry;
+		return ERR_PTR(-EIO);
 
 	inode = tracefs_get_inode(dentry->d_sb);
 	if (unlikely(!inode))
-		return eventfs_failed_creating(dentry);
+		return ERR_PTR(-ENOMEM);
 
 	/* If the user updated the directory's attributes, use them */
 	update_inode_attr(dentry, inode, attr, mode);
@@ -361,32 +361,31 @@ static struct dentry *create_file(const char *name, umode_t mode,
 
 	ti = get_tracefs(inode);
 	ti->flags |= TRACEFS_EVENT_INODE;
-	d_instantiate(dentry, inode);
-	fsnotify_create(dentry->d_parent->d_inode, dentry);
-	return eventfs_end_creating(dentry);
+
+	// Files have their parent's ei as their fsdata
+	dentry->d_fsdata = get_ei(parent_ei);
+
+	d_add(dentry, inode);
+	return NULL;
 };
 
 /**
- * create_dir - create a dir in the tracefs filesystem
+ * lookup_dir_entry - look up a dir in the tracefs filesystem
+ * @dentry: the directory to look up
  * @ei: the eventfs_inode that represents the directory to create
- * @parent: parent dentry for this file.
  *
- * This function will create a dentry for a directory represented by
+ * This function will look up a dentry for a directory represented by
  * a eventfs_inode.
  */
-static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent)
+static struct dentry *lookup_dir_entry(struct dentry *dentry,
+	struct eventfs_inode *pei, struct eventfs_inode *ei)
 {
 	struct tracefs_inode *ti;
-	struct dentry *dentry;
 	struct inode *inode;
 
-	dentry = eventfs_start_creating(ei->name, parent);
-	if (IS_ERR(dentry))
-		return dentry;
-
 	inode = tracefs_get_inode(dentry->d_sb);
 	if (unlikely(!inode))
-		return eventfs_failed_creating(dentry);
+		return ERR_PTR(-ENOMEM);
 
 	/* If the user updated the directory's attributes, use them */
 	update_inode_attr(dentry, inode, &ei->attr,
@@ -396,68 +395,50 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent
 	inode->i_fop = &eventfs_file_operations;
 
 	/* All directories will have the same inode number */
-	inode->i_ino = EVENTFS_DIR_INODE_INO;
+	inode->i_ino = eventfs_dir_ino(ei);
 
 	ti = get_tracefs(inode);
 	ti->flags |= TRACEFS_EVENT_INODE;
+	/* Only directories have ti->private set to an ei, not files */
+	ti->private = ei;
 
-	inc_nlink(inode);
-	d_instantiate(dentry, inode);
-	inc_nlink(dentry->d_parent->d_inode);
-	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
-	return eventfs_end_creating(dentry);
+	dentry->d_fsdata = get_ei(ei);
+
+	d_add(dentry, inode);
+	return NULL;
 }
 
-static void free_ei(struct eventfs_inode *ei)
+static inline struct eventfs_inode *alloc_ei(const char *name)
 {
-	kfree_const(ei->name);
-	kfree(ei->d_children);
-	kfree(ei->entry_attrs);
-	kfree(ei);
+	struct eventfs_inode *ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+
+	if (!ei)
+		return NULL;
+
+	ei->name = kstrdup_const(name, GFP_KERNEL);
+	if (!ei->name) {
+		kfree(ei);
+		return NULL;
+	}
+	kref_init(&ei->kref);
+	return ei;
 }
 
 /**
- * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode
- * @ti: the tracefs_inode of the dentry
+ * eventfs_d_release - dentry is going away
  * @dentry: dentry which has the reference to remove.
  *
  * Remove the association between a dentry from an eventfs_inode.
  */
-void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
+void eventfs_d_release(struct dentry *dentry)
 {
-	struct eventfs_inode *ei;
-	int i;
-
-	mutex_lock(&eventfs_mutex);
-
-	ei = dentry->d_fsdata;
-	if (!ei)
-		goto out;
-
-	/* This could belong to one of the files of the ei */
-	if (ei->dentry != dentry) {
-		for (i = 0; i < ei->nr_entries; i++) {
-			if (ei->d_children[i] == dentry)
-				break;
-		}
-		if (WARN_ON_ONCE(i == ei->nr_entries))
-			goto out;
-		ei->d_children[i] = NULL;
-	} else if (ei->is_freed) {
-		free_ei(ei);
-	} else {
-		ei->dentry = NULL;
-	}
-
-	dentry->d_fsdata = NULL;
- out:
-	mutex_unlock(&eventfs_mutex);
+	put_ei(dentry->d_fsdata);
 }
 
 /**
- * create_file_dentry - create a dentry for a file of an eventfs_inode
+ * lookup_file_dentry - look up the dentry for a file of an eventfs_inode
  * @ei: the eventfs_inode that the file will be created under
- * @idx: the index into the d_children[] of the @ei
+ * @idx: the index into the entry_attrs[] of the @ei
  * @parent: The parent dentry of the created file.
  * @name: The name of the file to create
  * @mode: The mode of the file.
@@ -468,163 +449,17 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
  * address located at @e_dentry.
  */
 static struct dentry *
-create_file_dentry(struct eventfs_inode *ei, int idx,
-		   struct dentry *parent, const char *name, umode_t mode, void *data,
+lookup_file_dentry(struct dentry *dentry,
+		   struct eventfs_inode *ei, int idx,
+		   umode_t mode, void *data,
 		   const struct file_operations *fops)
 {
 	struct eventfs_attr *attr = NULL;
-	struct dentry **e_dentry = &ei->d_children[idx];
-	struct dentry *dentry;
-
-	WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
 
-	mutex_lock(&eventfs_mutex);
-	if (ei->is_freed) {
-		mutex_unlock(&eventfs_mutex);
-		return NULL;
-	}
-	/* If the e_dentry already has a dentry, use it */
-	if (*e_dentry) {
-		dget(*e_dentry);
-		mutex_unlock(&eventfs_mutex);
-		return *e_dentry;
-	}
-
-	/* ei->entry_attrs are protected by SRCU */
 	if (ei->entry_attrs)
 		attr = &ei->entry_attrs[idx];
 
-	mutex_unlock(&eventfs_mutex);
-
-	dentry = create_file(name, mode, attr, parent, data, fops);
-
-	mutex_lock(&eventfs_mutex);
-
-	if (IS_ERR_OR_NULL(dentry)) {
-		/*
-		 * When the mutex was released, something else could have
-		 * created the dentry for this e_dentry. In which case
-		 * use that one.
-		 *
-		 * If ei->is_freed is set, the e_dentry is currently on its
-		 * way to being freed, don't return it. If e_dentry is NULL
-		 * it means it was already freed.
-		 */
-		if (ei->is_freed) {
-			dentry = NULL;
-		} else {
-			dentry = *e_dentry;
-			dget(dentry);
-		}
-		mutex_unlock(&eventfs_mutex);
-		return dentry;
-	}
-
-	if (!*e_dentry && !ei->is_freed) {
-		*e_dentry = dentry;
-		dentry->d_fsdata = ei;
-	} else {
-		/*
-		 * Should never happen unless we get here due to being freed.
-		 * Otherwise it means two dentries exist with the same name.
-		 */
-		WARN_ON_ONCE(!ei->is_freed);
-		dentry = NULL;
-	}
-	mutex_unlock(&eventfs_mutex);
-
-	return dentry;
-}
-
-/**
- * eventfs_post_create_dir - post create dir routine
- * @ei: eventfs_inode of recently created dir
- *
- * Map the meta-data of files within an eventfs dir to their parent dentry
- */
-static void eventfs_post_create_dir(struct eventfs_inode *ei)
-{
-	struct eventfs_inode *ei_child;
-	struct tracefs_inode *ti;
-
-	lockdep_assert_held(&eventfs_mutex);
-
-	/* srcu lock already held */
-	/* fill parent-child relation */
-	list_for_each_entry_srcu(ei_child, &ei->children, list,
-				 srcu_read_lock_held(&eventfs_srcu)) {
-		ei_child->d_parent = ei->dentry;
-	}
-
-	ti = get_tracefs(ei->dentry->d_inode);
-	ti->private = ei;
-}
-
-/**
- * create_dir_dentry - Create a directory dentry for the eventfs_inode
- * @pei: The eventfs_inode parent of ei.
- * @ei: The eventfs_inode to create the directory for
- * @parent: The dentry of the parent of this directory
- *
- * This creates and attaches a directory dentry to the eventfs_inode @ei.
- */
-static struct dentry *
-create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei,
-		  struct dentry *parent)
-{
-	struct dentry *dentry = NULL;
-
-	WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
-
-	mutex_lock(&eventfs_mutex);
-	if (pei->is_freed || ei->is_freed) {
-		mutex_unlock(&eventfs_mutex);
-		return NULL;
-	}
-	if (ei->dentry) {
-		/* If the eventfs_inode already has a dentry, use it */
-		dentry = ei->dentry;
-		dget(dentry);
-		mutex_unlock(&eventfs_mutex);
-		return dentry;
-	}
-	mutex_unlock(&eventfs_mutex);
-
-	dentry = create_dir(ei, parent);
-
-	mutex_lock(&eventfs_mutex);
-
-	if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) {
-		/*
-		 * When the mutex was released, something else could have
-		 * created the dentry for this e_dentry. In which case
-		 * use that one.
-		 *
-		 * If ei->is_freed is set, the e_dentry is currently on its
-		 * way to being freed.
-		 */
-		dentry = ei->dentry;
-		if (dentry)
-			dget(dentry);
-		mutex_unlock(&eventfs_mutex);
-		return dentry;
-	}
-
-	if (!ei->dentry && !ei->is_freed) {
-		ei->dentry = dentry;
-		eventfs_post_create_dir(ei);
-		dentry->d_fsdata = ei;
-	} else {
-		/*
-		 * Should never happen unless we get here due to being freed.
-		 * Otherwise it means two dentries exist with the same name.
-		 */
-		WARN_ON_ONCE(!ei->is_freed);
-		dentry = NULL;
-	}
-	mutex_unlock(&eventfs_mutex);
-
-	return dentry;
+	return lookup_file(ei, dentry, mode, attr, data, fops);
 }
 
 /**
@@ -641,79 +476,50 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
 					  struct dentry *dentry,
 					  unsigned int flags)
 {
-	const struct file_operations *fops;
-	const struct eventfs_entry *entry;
 	struct eventfs_inode *ei_child;
 	struct tracefs_inode *ti;
 	struct eventfs_inode *ei;
-	struct dentry *ei_dentry = NULL;
-	struct dentry *ret = NULL;
-	struct dentry *d;
 	const char *name = dentry->d_name.name;
-	umode_t mode;
-	void *data;
-	int idx;
-	int i;
-	int r;
+	struct dentry *result = NULL;
 
 	ti = get_tracefs(dir);
 	if (!(ti->flags & TRACEFS_EVENT_INODE))
-		return NULL;
-
-	/* Grab srcu to prevent the ei from going away */
-	idx = srcu_read_lock(&eventfs_srcu);
+		return ERR_PTR(-EIO);
 
-	/*
-	 * Grab the eventfs_mutex to consistent value from ti->private.
-	 * This s
-	 */
 	mutex_lock(&eventfs_mutex);
-	ei = READ_ONCE(ti->private);
-	if (ei && !ei->is_freed)
-		ei_dentry = READ_ONCE(ei->dentry);
-	mutex_unlock(&eventfs_mutex);
 
-	if (!ei || !ei_dentry)
+	ei = ti->private;
+	if (!ei || ei->is_freed)
 		goto out;
 
-	data = ei->data;
-
-	list_for_each_entry_srcu(ei_child, &ei->children, list,
-				 srcu_read_lock_held(&eventfs_srcu)) {
+	list_for_each_entry(ei_child, &ei->children, list) {
 		if (strcmp(ei_child->name, name) != 0)
 			continue;
-		ret = simple_lookup(dir, dentry, flags);
-		if (IS_ERR(ret))
+		if (ei_child->is_freed)
 			goto out;
-		d = create_dir_dentry(ei, ei_child, ei_dentry);
-		dput(d);
+		result = lookup_dir_entry(dentry, ei, ei_child);
 		goto out;
 	}
 
-	for (i = 0; i < ei->nr_entries; i++) {
-		entry = &ei->entries[i];
-		if (strcmp(name, entry->name) == 0) {
-			void *cdata = data;
-			mutex_lock(&eventfs_mutex);
-			/* If ei->is_freed, then the event itself may be too */
-			if (!ei->is_freed)
-				r = entry->callback(name, &mode, &cdata, &fops);
-			else
-				r = -1;
-			mutex_unlock(&eventfs_mutex);
-			if (r <= 0)
-				continue;
-			ret = simple_lookup(dir, dentry, flags);
-			if (IS_ERR(ret))
-				goto out;
-			d = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops);
-			dput(d);
-			break;
-		}
+	for (int i = 0; i < ei->nr_entries; i++) {
+		void *data;
+		umode_t mode;
+		const struct file_operations *fops;
+		const struct eventfs_entry *entry = &ei->entries[i];
+
+		if (strcmp(name, entry->name) != 0)
+			continue;
+
+		data = ei->data;
+		if (entry->callback(name, &mode, &data, &fops) <= 0)
+			goto out;
+
+		result = lookup_file_dentry(dentry, ei, i, mode, data, fops);
+		goto out;
 	}
  out:
-	srcu_read_unlock(&eventfs_srcu, idx);
-	return ret;
+	mutex_unlock(&eventfs_mutex);
+	return result;
 }
 
 /*
@@ -802,7 +608,7 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx)
 
 		name = ei_child->name;
 
-		ino = EVENTFS_DIR_INODE_INO;
+		ino = eventfs_dir_ino(ei_child);
 
 		if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR))
 			goto out_dec;
@@ -863,25 +669,10 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
 	if (!parent)
 		return ERR_PTR(-EINVAL);
 
-	ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+	ei = alloc_ei(name);
 	if (!ei)
 		return ERR_PTR(-ENOMEM);
 
-	ei->name = kstrdup_const(name, GFP_KERNEL);
-	if (!ei->name) {
-		kfree(ei);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	if (size) {
-		ei->d_children = kcalloc(size, sizeof(*ei->d_children), GFP_KERNEL);
-		if (!ei->d_children) {
-			kfree_const(ei->name);
-			kfree(ei);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
-
 	ei->entries = entries;
 	ei->nr_entries = size;
 	ei->data = data;
@@ -889,10 +680,8 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
 	INIT_LIST_HEAD(&ei->list);
 
 	mutex_lock(&eventfs_mutex);
-	if (!parent->is_freed) {
+	if (!parent->is_freed)
 		list_add_tail(&ei->list, &parent->children);
-		ei->d_parent = parent->dentry;
-	}
 	mutex_unlock(&eventfs_mutex);
 
 	/* Was the parent freed? */
@@ -932,28 +721,20 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
 	if (IS_ERR(dentry))
 		return ERR_CAST(dentry);
 
-	ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+	ei = alloc_ei(name);
 	if (!ei)
-		goto fail_ei;
+		goto fail;
 
 	inode = tracefs_get_inode(dentry->d_sb);
 	if (unlikely(!inode))
 		goto fail;
 
-	if (size) {
-		ei->d_children = kcalloc(size, sizeof(*ei->d_children), GFP_KERNEL);
-		if (!ei->d_children)
-			goto fail;
-	}
-
-	ei->dentry = dentry;
+	// Note: we have a ref to the dentry from tracefs_start_creating()
+	ei->events_dir = dentry;
 	ei->entries = entries;
 	ei->nr_entries = size;
 	ei->is_events = 1;
 	ei->data = data;
-	ei->name = kstrdup_const(name, GFP_KERNEL);
-	if (!ei->name)
-		goto fail;
 
 	/* Save the ownership of this directory */
 	uid = d_inode(dentry->d_parent)->i_uid;
@@ -984,11 +765,19 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
 	inode->i_op = &eventfs_root_dir_inode_operations;
 	inode->i_fop = &eventfs_file_operations;
 
-	dentry->d_fsdata = ei;
+	dentry->d_fsdata = get_ei(ei);
 
-	/* directory inodes start off with i_nlink == 2 (for "." entry) */
-	inc_nlink(inode);
+	/*
+	 * Keep all eventfs directories with i_nlink == 1.
+	 * Due to the dynamic nature of the dentry creations and not
+	 * wanting to add a pointer to the parent eventfs_inode in the
+	 * eventfs_inode structure, keeping the i_nlink in sync with the
+	 * number of directories would cause too much complexity for
+	 * something not worth much. Keeping directory links at 1
+	 * tells userspace not to trust the link number.
+	 */
 	d_instantiate(dentry, inode);
+	/* The dentry of the "events" parent does keep track though */
 	inc_nlink(dentry->d_parent->d_inode);
 	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
 	tracefs_end_creating(dentry);
@@ -996,72 +785,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
 	return ei;
 
  fail:
-	kfree(ei->d_children);
-	kfree(ei);
- fail_ei:
+	free_ei(ei);
 	tracefs_failed_creating(dentry);
 	return ERR_PTR(-ENOMEM);
 }
 
-static LLIST_HEAD(free_list);
-
-static void eventfs_workfn(struct work_struct *work)
-{
-        struct eventfs_inode *ei, *tmp;
-        struct llist_node *llnode;
-
-	llnode = llist_del_all(&free_list);
-        llist_for_each_entry_safe(ei, tmp, llnode, llist) {
-		/* This dput() matches the dget() from unhook_dentry() */
-		for (int i = 0; i < ei->nr_entries; i++) {
-			if (ei->d_children[i])
-				dput(ei->d_children[i]);
-		}
-		/* This should only get here if it had a dentry */
-		if (!WARN_ON_ONCE(!ei->dentry))
-			dput(ei->dentry);
-        }
-}
-
-static DECLARE_WORK(eventfs_work, eventfs_workfn);
-
-static void free_rcu_ei(struct rcu_head *head)
-{
-	struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu);
-
-	if (ei->dentry) {
-		/* Do not free the ei until all references of dentry are gone */
-		if (llist_add(&ei->llist, &free_list))
-			queue_work(system_unbound_wq, &eventfs_work);
-		return;
-	}
-
-	/* If the ei doesn't have a dentry, neither should its children */
-	for (int i = 0; i < ei->nr_entries; i++) {
-		WARN_ON_ONCE(ei->d_children[i]);
-	}
-
-	free_ei(ei);
-}
-
-static void unhook_dentry(struct dentry *dentry)
-{
-	if (!dentry)
-		return;
-	/*
-	 * Need to add a reference to the dentry that is expected by
-	 * simple_recursive_removal(), which will include a dput().
-	 */
-	dget(dentry);
-
-	/*
-	 * Also add a reference for the dput() in eventfs_workfn().
-	 * That is required as that dput() will free the ei after
-	 * the SRCU grace period is over.
-	 */
-	dget(dentry);
-}
-
 /**
  * eventfs_remove_rec - remove eventfs dir or file from list
  * @ei: eventfs_inode to be removed.
@@ -1074,8 +802,6 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
 {
 	struct eventfs_inode *ei_child;
 
-	if (!ei)
-		return;
 	/*
 	 * Check recursion depth. It should never be greater than 3:
 	 * 0 - events/
@@ -1087,28 +813,11 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
 		return;
 
 	/* search for nested folders or files */
-	list_for_each_entry_srcu(ei_child, &ei->children, list,
-				 lockdep_is_held(&eventfs_mutex)) {
-		/* Children only have dentry if parent does */
-		WARN_ON_ONCE(ei_child->dentry && !ei->dentry);
+	list_for_each_entry(ei_child, &ei->children, list)
 		eventfs_remove_rec(ei_child, level + 1);
-	}
-
-
-	ei->is_freed = 1;
 
-	for (int i = 0; i < ei->nr_entries; i++) {
-		if (ei->d_children[i]) {
-			/* Children only have dentry if parent does */
-			WARN_ON_ONCE(!ei->dentry);
-			unhook_dentry(ei->d_children[i]);
-		}
-	}
-
-	unhook_dentry(ei->dentry);
-
-	list_del_rcu(&ei->list);
-	call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei);
+	list_del(&ei->list);
+	free_ei(ei);
 }
 
 /**
@@ -1119,22 +828,12 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
  */
 void eventfs_remove_dir(struct eventfs_inode *ei)
 {
-	struct dentry *dentry;
-
 	if (!ei)
 		return;
 
 	mutex_lock(&eventfs_mutex);
-	dentry = ei->dentry;
 	eventfs_remove_rec(ei, 0);
 	mutex_unlock(&eventfs_mutex);
-
-	/*
-	 * If any of the ei children has a dentry, then the ei itself
-	 * must have a dentry.
-	 */
-	if (dentry)
-		simple_recursive_removal(dentry, NULL);
 }
 
 /**
@@ -1147,7 +846,11 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei)
 {
 	struct dentry *dentry;
 
-	dentry = ei->dentry;
+	dentry = ei->events_dir;
+	if (!dentry)
+		return;
+
+	ei->events_dir = NULL;
 	eventfs_remove_dir(ei);
 
 	/*
@@ -1157,5 +860,6 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei)
 	 * sticks around while the other ei->dentry are created
 	 * and destroyed dynamically.
 	 */
+	d_invalidate(dentry);
 	dput(dentry);
 }
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index e1b172c0e091..d65ffad4c327 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -38,8 +38,6 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb)
 	if (!ti)
 		return NULL;
 
-	ti->flags = 0;
-
 	return &ti->vfs_inode;
 }
 
@@ -379,21 +377,30 @@ static const struct super_operations tracefs_super_operations = {
 	.show_options	= tracefs_show_options,
 };
 
-static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
+/*
+ * It would be cleaner if eventfs had its own dentry ops.
+ *
+ * Note that d_revalidate is called potentially under RCU,
+ * so it can't take the eventfs mutex etc. It's fine - if
+ * we open a file just as it's marked dead, things will
+ * still work just fine, and just see the old stale case.
+ */
+static void tracefs_d_release(struct dentry *dentry)
 {
-	struct tracefs_inode *ti;
+	if (dentry->d_fsdata)
+		eventfs_d_release(dentry);
+}
 
-	if (!dentry || !inode)
-		return;
+static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct eventfs_inode *ei = dentry->d_fsdata;
 
-	ti = get_tracefs(inode);
-	if (ti && ti->flags & TRACEFS_EVENT_INODE)
-		eventfs_set_ei_status_free(ti, dentry);
-	iput(inode);
+	return !(ei && ei->is_freed);
 }
 
 static const struct dentry_operations tracefs_dentry_operations = {
-	.d_iput = tracefs_dentry_iput,
+	.d_revalidate = tracefs_d_revalidate,
+	.d_release = tracefs_d_release,
 };
 
 static int trace_fill_super(struct super_block *sb, void *data, int silent)
@@ -497,75 +504,6 @@ struct dentry *tracefs_end_creating(struct dentry *dentry)
 	return dentry;
 }
 
-/**
- * eventfs_start_creating - start the process of creating a dentry
- * @name: Name of the file created for the dentry
- * @parent: The parent dentry where this dentry will be created
- *
- * This is a simple helper function for the dynamically created eventfs
- * files. When the directory of the eventfs files are accessed, their
- * dentries are created on the fly. This function is used to start that
- * process.
- */
-struct dentry *eventfs_start_creating(const char *name, struct dentry *parent)
-{
-	struct dentry *dentry;
-	int error;
-
-	/* Must always have a parent. */
-	if (WARN_ON_ONCE(!parent))
-		return ERR_PTR(-EINVAL);
-
-	error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
-			      &tracefs_mount_count);
-	if (error)
-		return ERR_PTR(error);
-
-	if (unlikely(IS_DEADDIR(parent->d_inode)))
-		dentry = ERR_PTR(-ENOENT);
-	else
-		dentry = lookup_one_len(name, parent, strlen(name));
-
-	if (!IS_ERR(dentry) && dentry->d_inode) {
-		dput(dentry);
-		dentry = ERR_PTR(-EEXIST);
-	}
-
-	if (IS_ERR(dentry))
-		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-
-	return dentry;
-}
-
-/**
- * eventfs_failed_creating - clean up a failed eventfs dentry creation
- * @dentry: The dentry to clean up
- *
- * If after calling eventfs_start_creating(), a failure is detected, the
- * resources created by eventfs_start_creating() needs to be cleaned up. In
- * that case, this function should be called to perform that clean up.
- */
-struct dentry *eventfs_failed_creating(struct dentry *dentry)
-{
-	dput(dentry);
-	simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-	return NULL;
-}
-
-/**
- * eventfs_end_creating - Finish the process of creating a eventfs dentry
- * @dentry: The dentry that has successfully been created.
- *
- * This function is currently just a place holder to match
- * eventfs_start_creating(). In case any synchronization needs to be added,
- * this function will be used to implement that without having to modify
- * the callers of eventfs_start_creating().
- */
-struct dentry *eventfs_end_creating(struct dentry *dentry)
-{
-	return dentry;
-}
-
 /* Find the inode that this will use for default */
 static struct inode *instance_inode(struct dentry *parent, struct inode *inode)
 {
@@ -779,7 +717,11 @@ static void init_once(void *foo)
 {
 	struct tracefs_inode *ti = (struct tracefs_inode *) foo;
 
+	/* inode_init_once() calls memset() on the vfs_inode portion */
 	inode_init_once(&ti->vfs_inode);
+
+	/* Zero out the rest */
+	memset_after(ti, 0, vfs_inode);
 }
 
 static int __init tracefs_init(void)
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 12b7d0150ae9..beb3dcd0e434 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -11,9 +11,10 @@ enum {
 };
 
 struct tracefs_inode {
+	struct inode            vfs_inode;
+	/* The below gets initialized with memset_after(ti, 0, vfs_inode) */
 	unsigned long           flags;
 	void                    *private;
-	struct inode            vfs_inode;
 };
 
 /*
@@ -31,42 +32,37 @@ struct eventfs_attr {
 /*
  * struct eventfs_inode - hold the properties of the eventfs directories.
  * @list:	link list into the parent directory
+ * @rcu:	Union with @list for freeing
+ * @children:	link list into the child eventfs_inode
  * @entries:	the array of entries representing the files in the directory
  * @name:	the name of the directory to create
- * @children:	link list into the child eventfs_inode
- * @dentry:     the dentry of the directory
- * @d_parent:   pointer to the parent's dentry
- * @d_children: The array of dentries to represent the files when created
+ * @events_dir: the dentry of the events directory
  * @entry_attrs: Saved mode and ownership of the @d_children
- * @attr:	Saved mode and ownership of eventfs_inode itself
  * @data:	The private data to pass to the callbacks
+ * @attr:	Saved mode and ownership of eventfs_inode itself
  * @is_freed:	Flag set if the eventfs is on its way to be freed
  *                Note if is_freed is set, then dentry is corrupted.
+ * @is_events:	Flag set for only the top level "events" directory
  * @nr_entries: The number of items in @entries
+ * @ino:	The saved inode number
  */
 struct eventfs_inode {
-	struct list_head		list;
+	union {
+		struct list_head	list;
+		struct rcu_head		rcu;
+	};
+	struct list_head		children;
 	const struct eventfs_entry	*entries;
 	const char			*name;
-	struct list_head		children;
-	struct dentry			*dentry; /* Check is_freed to access */
-	struct dentry			*d_parent;
-	struct dentry			**d_children;
+	struct dentry			*events_dir;
 	struct eventfs_attr		*entry_attrs;
-	struct eventfs_attr		attr;
 	void				*data;
-	/*
-	 * Union - used for deletion
-	 * @llist:	for calling dput() if needed after RCU
-	 * @rcu:	eventfs_inode to delete in RCU
-	 */
-	union {
-		struct llist_node	llist;
-		struct rcu_head		rcu;
-	};
+	struct eventfs_attr		attr;
+	struct kref			kref;
 	unsigned int			is_freed:1;
 	unsigned int			is_events:1;
 	unsigned int			nr_entries:30;
+	unsigned int			ino;
 };
 
 static inline struct tracefs_inode *get_tracefs(const struct inode *inode)
@@ -78,10 +74,7 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent);
 struct dentry *tracefs_end_creating(struct dentry *dentry);
 struct dentry *tracefs_failed_creating(struct dentry *dentry);
 struct inode *tracefs_get_inode(struct super_block *sb);
-struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
-struct dentry *eventfs_failed_creating(struct dentry *dentry);
-struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_update_gid(struct dentry *dentry, kgid_t gid);
-void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry);
+
+void eventfs_d_release(struct dentry *dentry);
 
 #endif /* _TRACEFS_INTERNAL_H */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index e413a9cf8ee3..551148de66cd 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -205,7 +205,6 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
 	dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
 
 	err = fscrypt_prepare_lookup(dir, dentry, &nm);
-	generic_set_encrypted_ci_d_ops(dentry);
 	if (err == -ENOENT)
 		return d_splice_alias(NULL, dentry);
 	if (err)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 09e270d6ed02..304646b03e99 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2239,6 +2239,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 		goto out_umount;
 	}
 
+	generic_set_sb_d_ops(sb);
 	sb->s_root = d_make_root(root);
 	if (!sb->s_root) {
 		err = -ENOMEM;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 9976a00a73f9..e965a48e7db9 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -421,10 +421,10 @@ xfs_attr_complete_op(
 	bool			do_replace = args->op_flags & XFS_DA_OP_REPLACE;
 
 	args->op_flags &= ~XFS_DA_OP_REPLACE;
-	if (do_replace) {
-		args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+	args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+	if (do_replace)
 		return replace_state;
-	}
+
 	return XFS_DAS_DONE;
 }
 
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 31100120b2c5..e31663cb7b43 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1119,20 +1119,6 @@ xfs_rtbitmap_blockcount(
 }
 
 /*
- * Compute the maximum level number of the realtime summary file, as defined by
- * mkfs.  The historic use of highbit32 on a 64-bit quantity prohibited correct
- * use of rt volumes with more than 2^32 extents.
- */
-uint8_t
-xfs_compute_rextslog(
-	xfs_rtbxlen_t		rtextents)
-{
-	if (!rtextents)
-		return 0;
-	return xfs_highbit64(rtextents);
-}
-
-/*
  * Compute the number of rtbitmap words needed to populate every block of a
  * bitmap that is large enough to track the given number of rt extents.
  */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 274dc7dae1fa..152a66750af5 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -351,20 +351,6 @@ xfs_rtfree_extent(
 int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
 		xfs_filblks_t rtlen);
 
-uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
-
-/* Do we support an rt volume having this number of rtextents? */
-static inline bool
-xfs_validate_rtextents(
-	xfs_rtbxlen_t		rtextents)
-{
-	/* No runt rt volumes */
-	if (rtextents == 0)
-		return false;
-
-	return true;
-}
-
 xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t
 		rtextents);
 unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp,
@@ -383,8 +369,6 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
 # define xfs_rtsummary_read_buf(a,b)			(-ENOSYS)
 # define xfs_rtbuf_cache_relse(a)			(0)
 # define xfs_rtalloc_extent_is_free(m,t,s,l,i)		(-ENOSYS)
-# define xfs_compute_rextslog(rtx)			(0)
-# define xfs_validate_rtextents(rtx)			(false)
 static inline xfs_filblks_t
 xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
 {
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 4a9e8588f4c9..5bb6e2bd6dee 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1377,3 +1377,17 @@ xfs_validate_stripe_geometry(
 	}
 	return true;
 }
+
+/*
+ * Compute the maximum level number of the realtime summary file, as defined by
+ * mkfs.  The historic use of highbit32 on a 64-bit quantity prohibited correct
+ * use of rt volumes with more than 2^32 extents.
+ */
+uint8_t
+xfs_compute_rextslog(
+	xfs_rtbxlen_t		rtextents)
+{
+	if (!rtextents)
+		return 0;
+	return xfs_highbit64(rtextents);
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 19134b23c10b..2e8e8d63d4eb 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -38,4 +38,6 @@ extern int	xfs_sb_get_secondary(struct xfs_mount *mp,
 extern bool	xfs_validate_stripe_geometry(struct xfs_mount *mp,
 		__s64 sunit, __s64 swidth, int sectorsize, bool silent);
 
+uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
+
 #endif	/* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 20b5375f2d9c..62e02d5380ad 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -251,4 +251,16 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off);
 bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off,
 		xfs_fileoff_t len);
 
+/* Do we support an rt volume having this number of rtextents? */
+static inline bool
+xfs_validate_rtextents(
+	xfs_rtbxlen_t		rtextents)
+{
+	/* No runt rt volumes */
+	if (rtextents == 0)
+		return false;
+
+	return true;
+}
+
 #endif	/* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 441ca9977652..46583517377f 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -15,6 +15,7 @@
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_bit.h"
+#include "xfs_sb.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/repair.h"
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index fabd0ed9dfa6..b1ff4f33324a 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -16,6 +16,7 @@
 #include "xfs_rtbitmap.h"
 #include "xfs_bit.h"
 #include "xfs_bmap.h"
+#include "xfs_sb.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 813f85156b0c..1698507d1ac7 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -112,7 +112,7 @@ xfs_end_ioend(
 	 * longer dirty. If we don't remove delalloc blocks here, they become
 	 * stale and can corrupt free space accounting on unmount.
 	 */
-	error = blk_status_to_errno(ioend->io_bio->bi_status);
+	error = blk_status_to_errno(ioend->io_bio.bi_status);
 	if (unlikely(error)) {
 		if (ioend->io_flags & IOMAP_F_SHARED) {
 			xfs_reflink_cancel_cow_range(ip, offset, size, true);
@@ -179,7 +179,7 @@ STATIC void
 xfs_end_bio(
 	struct bio		*bio)
 {
-	struct iomap_ioend	*ioend = bio->bi_private;
+	struct iomap_ioend	*ioend = iomap_ioend_from_bio(bio);
 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 	unsigned long		flags;
 
@@ -276,7 +276,8 @@ static int
 xfs_map_blocks(
 	struct iomap_writepage_ctx *wpc,
 	struct inode		*inode,
-	loff_t			offset)
+	loff_t			offset,
+	unsigned int		len)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
@@ -444,7 +445,7 @@ xfs_prepare_ioend(
 	/* send ioends that might require a transaction to the completion wq */
 	if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
 	    (ioend->io_flags & IOMAP_F_SHARED))
-		ioend->io_bio->bi_end_io = xfs_end_bio;
+		ioend->io_bio.bi_end_io = xfs_end_bio;
 	return status;
 }
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index aff20ddd4a9f..98401de832ee 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -350,7 +350,6 @@ xfs_setup_dax_always(
 		return -EINVAL;
 	}
 
-	xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
 	return 0;
 
 disable_dax:
@@ -1496,6 +1495,18 @@ xfs_fs_fill_super(
 
 	mp->m_super = sb;
 
+	/*
+	 * Copy VFS mount flags from the context now that all parameter parsing
+	 * is guaranteed to have been completed by either the old mount API or
+	 * the newer fsopen/fsconfig API.
+	 */
+	if (fc->sb_flags & SB_RDONLY)
+		set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+	if (fc->sb_flags & SB_DIRSYNC)
+		mp->m_features |= XFS_FEAT_DIRSYNC;
+	if (fc->sb_flags & SB_SYNCHRONOUS)
+		mp->m_features |= XFS_FEAT_WSYNC;
+
 	error = xfs_fs_validate_params(mp);
 	if (error)
 		return error;
@@ -1965,6 +1976,11 @@ static const struct fs_context_operations xfs_context_ops = {
 	.free        = xfs_fs_free,
 };
 
+/*
+ * WARNING: do not initialise any parameters in this function that depend on
+ * mount option parsing having already been performed as this can be called from
+ * fsopen() before any parameters have been set.
+ */
 static int xfs_init_fs_context(
 	struct fs_context	*fc)
 {
@@ -1996,16 +2012,6 @@ static int xfs_init_fs_context(
 	mp->m_logbsize = -1;
 	mp->m_allocsize_log = 16; /* 64k */
 
-	/*
-	 * Copy binary VFS mount flags we are interested in.
-	 */
-	if (fc->sb_flags & SB_RDONLY)
-		set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
-	if (fc->sb_flags & SB_DIRSYNC)
-		mp->m_features |= XFS_FEAT_DIRSYNC;
-	if (fc->sb_flags & SB_SYNCHRONOUS)
-		mp->m_features |= XFS_FEAT_WSYNC;
-
 	fc->s_fs_info = mp;
 	fc->ops = &xfs_context_ops;
 
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 6ab2318a9c8e..3b103715acc9 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -125,7 +125,8 @@ static void zonefs_readahead(struct readahead_control *rac)
  * which implies that the page range can only be within the fixed inode size.
  */
 static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
-				   struct inode *inode, loff_t offset)
+				   struct inode *inode, loff_t offset,
+				   unsigned int len)
 {
 	struct zonefs_zone *z = zonefs_inode_zone(inode);
 
@@ -348,7 +349,12 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
 	struct zonefs_inode_info *zi = ZONEFS_I(inode);
 
 	if (error) {
-		zonefs_io_error(inode, true);
+		/*
+		 * For Sync IOs, error recovery is called from
+		 * zonefs_file_dio_write().
+		 */
+		if (!is_sync_kiocb(iocb))
+			zonefs_io_error(inode, true);
 		return error;
 	}
 
@@ -491,6 +497,14 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 			ret = -EINVAL;
 			goto inode_unlock;
 		}
+		/*
+		 * Advance the zone write pointer offset. This assumes that the
+		 * IO will succeed, which is OK to do because we do not allow
+		 * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
+		 * fails, the error path will correct the write pointer offset.
+		 */
+		z->z_wpoffset += count;
+		zonefs_inode_account_active(inode);
 		mutex_unlock(&zi->i_truncate_mutex);
 	}
 
@@ -504,20 +518,19 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 	if (ret == -ENOTBLK)
 		ret = -EBUSY;
 
-	if (zonefs_zone_is_seq(z) &&
-	    (ret > 0 || ret == -EIOCBQUEUED)) {
-		if (ret > 0)
-			count = ret;
-
-		/*
-		 * Update the zone write pointer offset assuming the write
-		 * operation succeeded. If it did not, the error recovery path
-		 * will correct it. Also do active seq file accounting.
-		 */
-		mutex_lock(&zi->i_truncate_mutex);
-		z->z_wpoffset += count;
-		zonefs_inode_account_active(inode);
-		mutex_unlock(&zi->i_truncate_mutex);
+	/*
+	 * For a failed IO or partial completion, trigger error recovery
+	 * to update the zone write pointer offset to a correct value.
+	 * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
+	 * have executed error recovery if the IO already completed when we
+	 * reach here. However, we cannot know that, so error recovery is run
+	 * again (that will not change anything).
+	 */
+	if (zonefs_zone_is_seq(z)) {
+		if (ret > 0 && ret != count)
+			ret = -EIO;
+		if (ret < 0 && ret != -EIOCBQUEUED)
+			zonefs_io_error(inode, true);
 	}
 
 inode_unlock:
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 93971742613a..b6e8e7c96251 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -246,16 +246,18 @@ static void zonefs_inode_update_mode(struct inode *inode)
 	z->z_mode = inode->i_mode;
 }
 
-struct zonefs_ioerr_data {
-	struct inode	*inode;
-	bool		write;
-};
-
 static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
 			      void *data)
 {
-	struct zonefs_ioerr_data *err = data;
-	struct inode *inode = err->inode;
+	struct blk_zone *z = data;
+
+	*z = *zone;
+	return 0;
+}
+
+static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone,
+				   bool write)
+{
 	struct zonefs_zone *z = zonefs_inode_zone(inode);
 	struct super_block *sb = inode->i_sb;
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
@@ -270,8 +272,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
 	data_size = zonefs_check_zone_condition(sb, z, zone);
 	isize = i_size_read(inode);
 	if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) &&
-	    !err->write && isize == data_size)
-		return 0;
+	    !write && isize == data_size)
+		return;
 
 	/*
 	 * At this point, we detected either a bad zone or an inconsistency
@@ -292,7 +294,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
 	 * In all cases, warn about inode size inconsistency and handle the
 	 * IO error according to the zone condition and to the mount options.
 	 */
-	if (zonefs_zone_is_seq(z) && isize != data_size)
+	if (isize != data_size)
 		zonefs_warn(sb,
 			    "inode %lu: invalid size %lld (should be %lld)\n",
 			    inode->i_ino, isize, data_size);
@@ -352,8 +354,6 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
 	zonefs_i_size_write(inode, data_size);
 	z->z_wpoffset = data_size;
 	zonefs_inode_account_active(inode);
-
-	return 0;
 }
 
 /*
@@ -367,23 +367,25 @@ void __zonefs_io_error(struct inode *inode, bool write)
 {
 	struct zonefs_zone *z = zonefs_inode_zone(inode);
 	struct super_block *sb = inode->i_sb;
-	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 	unsigned int noio_flag;
-	unsigned int nr_zones = 1;
-	struct zonefs_ioerr_data err = {
-		.inode = inode,
-		.write = write,
-	};
+	struct blk_zone zone;
 	int ret;
 
 	/*
-	 * The only files that have more than one zone are conventional zone
-	 * files with aggregated conventional zones, for which the inode zone
-	 * size is always larger than the device zone size.
+	 * Conventional zones have no write pointer and cannot become read-only
+	 * or offline. So simply fake a report for a single or aggregated zone
+	 * and let zonefs_handle_io_error() correct the zone inode information
+	 * according to the mount options.
 	 */
-	if (z->z_size > bdev_zone_sectors(sb->s_bdev))
-		nr_zones = z->z_size >>
-			(sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+	if (!zonefs_zone_is_seq(z)) {
+		zone.start = z->z_sector;
+		zone.len = z->z_size >> SECTOR_SHIFT;
+		zone.wp = zone.start + zone.len;
+		zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
+		zone.cond = BLK_ZONE_COND_NOT_WP;
+		zone.capacity = zone.len;
+		goto handle_io_error;
+	}
 
 	/*
 	 * Memory allocations in blkdev_report_zones() can trigger a memory
@@ -394,12 +396,20 @@ void __zonefs_io_error(struct inode *inode, bool write)
 	 * the GFP_NOIO context avoids both problems.
 	 */
 	noio_flag = memalloc_noio_save();
-	ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones,
-				  zonefs_io_error_cb, &err);
-	if (ret != nr_zones)
+	ret = blkdev_report_zones(sb->s_bdev, z->z_sector, 1,
+				  zonefs_io_error_cb, &zone);
+	memalloc_noio_restore(noio_flag);
+
+	if (ret != 1) {
 		zonefs_err(sb, "Get inode %lu zone information failed %d\n",
 			   inode->i_ino, ret);
-	memalloc_noio_restore(noio_flag);
+		zonefs_warn(sb, "remounting filesystem read-only\n");
+		sb->s_flags |= SB_RDONLY;
+		return;
+	}
+
+handle_io_error:
+	zonefs_handle_io_error(inode, &zone, write);
 }
 
 static struct kmem_cache *zonefs_inode_cachep;